Skip to content

Commit c00c00e

Browse files
authored
Merge pull request #27 from EikeSchwass/feature/26_CustomizeStyleNamingConventions
Allow customizing DOCX style id mappings for headers, quote, etc.
2 parents a8040c2 + 00d6c56 commit c00c00e

5 files changed

Lines changed: 300 additions & 76 deletions

File tree

src/DocSharp.Docx/DocxToHtml/DocxToHtmlConverter.Paragraph.cs

Lines changed: 14 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
1-
using System;
21
using System.Collections.Generic;
32
using System.Globalization;
4-
using System.Linq;
5-
using System.Text;
6-
using System.Threading.Tasks;
73
using DocSharp.Helpers;
84
using DocSharp.Writers;
95
using DocumentFormat.OpenXml.Wordprocessing;
@@ -19,42 +15,34 @@ internal override void ProcessParagraph(Paragraph paragraph, HtmlTextWriter sb)
1915
var numberingProperties = paragraph.GetEffectiveProperty<NumberingProperties>();
2016
var styleName = paragraph.GetStyleName();
2117

22-
if (styleName != null)
18+
// Check if the style can be mapped to heading, quote block or code block.
19+
if (StyleNamingResolver.TryGetStyleType(styleName, out var styleType))
2320
{
24-
// Check if the style can be mapped to heading, quote block or code block.
25-
switch (styleName.ToLowerInvariant())
21+
switch (styleType)
2622
{
27-
case "heading 1":
28-
case "heading1":
29-
case "title":
23+
case StyleType.Header1:
3024
tag = "h1";
3125
break;
32-
case "heading 2":
33-
case "heading2":
34-
case "subtitle":
26+
case StyleType.Header2:
3527
tag = "h2";
3628
break;
37-
case "heading 3":
38-
case "heading3":
29+
case StyleType.Header3:
3930
tag = "h3";
4031
break;
41-
case "heading 4":
42-
case "heading4":
32+
case StyleType.Header4:
4333
tag = "h4";
4434
break;
45-
case "heading 5":
46-
case "heading5":
35+
case StyleType.Header5:
4736
tag = "h5";
4837
break;
49-
case "heading 6":
50-
case "heading6":
38+
case StyleType.Header6:
5139
tag = "h6";
5240
break;
53-
case "quote":
54-
case "intense quote":
41+
case StyleType.Quote:
42+
case StyleType.IntenseQuote:
5543
tag = "blockquote";
5644
break;
57-
case "html preformatted": // This style is created by Microsoft Word when an HTML file is saved as DOCX
45+
case StyleType.HtmlPreformatted: // This style is created by Microsoft Word when an HTML file is saved as DOCX
5846
tag = "pre";
5947
break;
6048
}
@@ -88,7 +76,7 @@ internal override void ProcessParagraph(Paragraph paragraph, HtmlTextWriter sb)
8876
// var direction = paragraph.GetEffectiveProperty<TextDirection>() ??
8977
// cell.GetEffectiveProperty<TextDirection>();
9078
// // Direction is not applied to regular paragraphs in DOCX but only in table cells and text boxes
91-
79+
9280
if (alignment != null)
9381
{
9482
if (alignment == JustificationValues.Left || alignment == JustificationValues.Start)
@@ -121,7 +109,7 @@ internal override void ProcessParagraph(Paragraph paragraph, HtmlTextWriter sb)
121109

122110
if (paragraph.GetEffectiveBorder<TopBorder>() is TopBorder topBorder)
123111
ProcessBorder(topBorder, MapParagraphBorderAttribute(topBorder), ref styles);
124-
112+
125113
if (paragraph.GetEffectiveBorder<BottomBorder>() is BottomBorder bottomBorder)
126114
ProcessBorder(bottomBorder, MapParagraphBorderAttribute(bottomBorder), ref styles);
127115
// In the current implementation both BottomBorder and BetweenBorder are mapped to border-bottom in HTML,

src/DocSharp.Docx/DocxToHtml/DocxToHtmlConverter.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ public partial class DocxToHtmlConverter : DocxToXmlWriterBase<HtmlTextWriter>
7373
/// </summary>
7474
public bool ExportFootnotesEndnotes { get; set; } = true;
7575

76+
/// <summary>
77+
/// Used to map DOCX styles by name. The default <see cref="DefaultStyleNamingResolver"/> can be overridden to customize style mappings.
78+
/// </summary>
79+
public IStyleNamingResolver StyleNamingResolver { get; set; } = new DefaultStyleNamingResolver();
80+
7681
internal override void ProcessDocument(Document document, HtmlTextWriter sb)
7782
{
7883
sb.WriteHtmlHeader(document.MainDocumentPart?.OpenXmlPackage.PackageProperties.Title);

src/DocSharp.Docx/DocxToMarkdownConverter.cs

Lines changed: 47 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,11 @@ public class DocxToMarkdownConverter : DocxToStringWriterBase<MarkdownStringWrit
8181
/// </summary>
8282
public bool ExportFootnotesEndnotes { get; set; } = true;
8383

84+
/// <summary>
85+
/// Used to map DOCX styles by name. The default <see cref="DefaultStyleNamingResolver"/> can be overridden to customize style mappings.
86+
/// </summary>
87+
public IStyleNamingResolver StyleNamingResolver { get; set; } = new DefaultStyleNamingResolver();
88+
8489
private bool isInEmphasis = false;
8590
private bool isAllCaps = false;
8691
private bool isInCodeBlockParagraph = false;
@@ -106,9 +111,9 @@ internal override void ProcessSection((List<OpenXmlElement> content, SectionProp
106111
base.ProcessSection(section, mainPart, writer);
107112

108113
// Add horizontal rule between sections
109-
if (section != Sections[Sections.Count - 1])
114+
if (section != Sections[Sections.Count - 1])
110115
{
111-
writer.WriteHorizontalLine();
116+
writer.WriteHorizontalLine();
112117
}
113118
}
114119

@@ -133,42 +138,34 @@ internal override void ProcessParagraph(Paragraph paragraph, MarkdownStringWrite
133138
bool isCode = false;
134139

135140
var styleName = paragraph.GetStyleName();
136-
if (styleName != null)
141+
// Check if the style can be mapped to heading, quote block or code block.
142+
if (StyleNamingResolver.TryGetStyleType(styleName, out var styleType))
137143
{
138-
// Check if the style can be mapped to heading, quote block or code block.
139-
switch (styleName.ToLowerInvariant())
144+
switch (styleType)
140145
{
141-
case "heading 1":
142-
case "heading1":
143-
case "title":
146+
case StyleType.Header1:
144147
sb.Write("# ");
145148
break;
146-
case "heading 2":
147-
case "heading2":
148-
case "subtitle":
149+
case StyleType.Header2:
149150
sb.Write("## ");
150151
break;
151-
case "heading 3":
152-
case "heading3":
152+
case StyleType.Header3:
153153
sb.Write("### ");
154154
break;
155-
case "heading 4":
156-
case "heading4":
155+
case StyleType.Header4:
157156
sb.Write("#### ");
158157
break;
159-
case "heading 5":
160-
case "heading5":
158+
case StyleType.Header5:
161159
sb.Write("##### ");
162160
break;
163-
case "heading 6":
164-
case "heading6":
161+
case StyleType.Header6:
165162
sb.Write("###### ");
166163
break;
167-
case "quote":
168-
case "intense quote":
164+
case StyleType.Quote:
165+
case StyleType.IntenseQuote:
169166
sb.Write("> ");
170167
break;
171-
case "html preformatted": // This style is created by Microsoft Word when an HTML file is saved as DOCX
168+
case StyleType.HtmlPreformatted:
172169
isCode = true;
173170
break;
174171
}
@@ -264,7 +261,7 @@ internal void ProcessListItem(NumberingProperties numPr, MarkdownStringWriter sb
264261
}
265262
else
266263
{
267-
int startNumber = levelOverride?.StartOverrideNumberingValue?.Val ??
264+
int startNumber = levelOverride?.StartOverrideNumberingValue?.Val ??
268265
levelOverrideLevel?.StartNumberingValue?.Val ??
269266
level.StartNumberingValue?.Val ?? 1;
270267
sb.Write($"{startNumber}. "); // Markdown renderers will automatically increase the number.
@@ -280,7 +277,7 @@ internal override void ProcessRun(Run run, MarkdownStringWriter sb)
280277
{
281278
return;
282279
}
283-
280+
284281
var text = run.GetFirstChild<Text>();
285282
bool hasText = text != null && !string.IsNullOrEmpty(text.InnerText);
286283
if (hasText && text!.InnerText.All(char.IsWhiteSpace))
@@ -309,7 +306,7 @@ internal override void ProcessRun(Run run, MarkdownStringWriter sb)
309306
isBold = OpenXmlHelpers.GetEffectiveProperty<Bold>(run) is Bold b && (b.Val is null || b.Val);
310307
isItalic = OpenXmlHelpers.GetEffectiveProperty<Italic>(run) is Italic i && (i.Val is null || i.Val);
311308

312-
isUnderline = OpenXmlHelpers.GetEffectiveProperty<Underline>(run) is Underline u &&
309+
isUnderline = OpenXmlHelpers.GetEffectiveProperty<Underline>(run) is Underline u &&
313310
u.Val != null && u.Val != UnderlineValues.None;
314311

315312
isStrikethrough = (OpenXmlHelpers.GetEffectiveProperty<DoubleStrike>(run) is DoubleStrike ds &&
@@ -359,11 +356,11 @@ internal override void ProcessRun(Run run, MarkdownStringWriter sb)
359356
// Check if the style should be mapped to an inline code element
360357
var styleName = run.GetStyleName();
361358
bool isCode = false;
362-
if ((styleName != null && styleName.Equals("html code", StringComparison.OrdinalIgnoreCase)) ||
363-
(CodeFontFamilies != null &&
364-
run.GetEffectiveProperty<RunFonts>() is RunFonts rf && rf?.Ascii?.Value != null &&
359+
if ((styleName != null && styleName.Equals("html code", StringComparison.OrdinalIgnoreCase)) ||
360+
(CodeFontFamilies != null &&
361+
run.GetEffectiveProperty<RunFonts>() is RunFonts rf && rf?.Ascii?.Value != null &&
365362
CodeFontFamilies.Contains(rf.Ascii.Value)))
366-
// (the "HTML Code" style is created by Microsoft Word when an HTML file is saved as DOCX)
363+
// (the "HTML Code" style is created by Microsoft Word when an HTML file is saved as DOCX)
367364
{
368365
isCode = true;
369366
}
@@ -447,7 +444,7 @@ internal override void ProcessBreak(Break br, MarkdownStringWriter sb)
447444
{
448445
sb.WriteLine();
449446
}
450-
else
447+
else
451448
{
452449
sb.Write("<br>");
453450
// (avoid standard soft break with two trailing spaces as it causes issues in lists and tables)
@@ -486,7 +483,7 @@ internal override void ProcessTable(Table table, MarkdownStringWriter sb)
486483
EnsureEmptyLine(sb);
487484

488485
int rowIndex = 0;
489-
foreach(var element in table.Elements())
486+
foreach (var element in table.Elements())
490487
{
491488
switch (element)
492489
{
@@ -524,7 +521,7 @@ internal void ProcessRow(TableRow tableRow, MarkdownStringWriter sb, int maxCell
524521
case TableCell cell:
525522
ProcessCell(cell, sb);
526523
++currentCellCount;
527-
if (currentCellCount < maxCellsCount &&
524+
if (currentCellCount < maxCellsCount &&
528525
cell.TableCellProperties?.GridSpan?.Val != null)
529526
{
530527
// Markdown does not support merged cells, add another empty cell for consistency.
@@ -589,11 +586,11 @@ internal void WriteHyperlink(string displayText, string target, bool isAnchor, s
589586
{
590587
if (isAnchor)
591588
target = "#" + target;
592-
else
589+
else
593590
// Microsoft Word already escapes spaces, but other DOCX writers may not do it,
594591
// causing the link not to be recognized properly by some Markdown processors.
595592
target = target.Replace(" ", "%20");
596-
593+
597594
sb.Write($"[{displayText}]({target}");
598595
if (!string.IsNullOrWhiteSpace(tooltip))
599596
{
@@ -613,7 +610,7 @@ internal override void ProcessDrawing(Drawing drawing, MarkdownStringWriter sb)
613610
string? tooltip = drawing.Inline.DocProperties?.HyperlinkOnClick?.Tooltip?.Value;
614611

615612
var mainDocumentPart = OpenXmlHelpers.GetMainDocumentPart(drawing);
616-
if (blip.Descendants<SVGBlip>().FirstOrDefault() is SVGBlip svgBlip &&
613+
if (blip.Descendants<SVGBlip>().FirstOrDefault() is SVGBlip svgBlip &&
617614
svgBlip.Embed?.Value is string svgRelId)
618615
{
619616
// Prefer the actual SVG image as web browsers can display it.
@@ -651,7 +648,7 @@ internal void ProcessImagePart(OpenXmlPart? rootPart, string relId, MarkdownStri
651648
ImagesOutputFolder = ImagesOutputFolder!.ReplaceAll(['/', '\\'], Path.DirectorySeparatorChar);
652649
if (!ImagesOutputFolder.EndsWith(Path.DirectorySeparatorChar))
653650
ImagesOutputFolder += Path.DirectorySeparatorChar;
654-
651+
655652
// Try to create the output directory if it doesn't exist.
656653
if (!Directory.Exists(ImagesOutputFolder))
657654
Directory.CreateDirectory(ImagesOutputFolder);
@@ -697,16 +694,16 @@ internal void ProcessImagePart(OpenXmlPart? rootPart, string relId, MarkdownStri
697694
catch (Exception ex)
698695
{
699696
// Image retrieval failed (probably format is not supported by the image converter, or the output directory is not writeable).
700-
#if DEBUG
701-
Debug.WriteLine("ProcessImagePart error: " + ex.Message);
702-
#endif
697+
#if DEBUG
698+
Debug.WriteLine("ProcessImagePart error: " + ex.Message);
699+
#endif
703700

704701
// Delete the image file and don't add a reference to it in Markdown.
705702
if (File.Exists(actualFilePath))
706703
File.Delete(actualFilePath);
707704
return;
708705
}
709-
706+
710707
if (File.Exists(actualFilePath))
711708
{
712709
Uri uri;
@@ -716,18 +713,18 @@ internal void ProcessImagePart(OpenXmlPart? rootPart, string relId, MarkdownStri
716713
}
717714
else
718715
{
719-
string baseUri = UriHelpers.NormalizeBaseUri(ImagesBaseUriOverride);
716+
string baseUri = UriHelpers.NormalizeBaseUri(ImagesBaseUriOverride);
720717
uri = new Uri(baseUri + fileName, UriKind.RelativeOrAbsolute);
721718
}
722719
EnsureWhiteSpace(sb);
723-
724-
if (hyperlinkId != null &&
720+
721+
if (hyperlinkId != null &&
725722
(rootPart.OpenXmlPackage as WordprocessingDocument)?.MainDocumentPart?.HyperlinkRelationships.FirstOrDefault(x => x.Id == hyperlinkId) is HyperlinkRelationship relationship)
726723
{
727724
// Image with hyperlink
728725
WriteHyperlink($"![{relId}]({uri.ToString().Replace(" ", "%20")})", relationship.Uri.OriginalString, false, hyperlinkTooltip, sb);
729726
}
730-
else
727+
else
731728
{
732729
// Regular image
733730
sb.Write($"![{relId}]({uri.ToString().Replace(" ", "%20")})");
@@ -738,9 +735,9 @@ internal void ProcessImagePart(OpenXmlPart? rootPart, string relId, MarkdownStri
738735
catch (Exception ex)
739736
{
740737
// Other generic error during image retrieval, don't stop the whole conversion.
741-
#if DEBUG
742-
Debug.WriteLine("ProcessImagePart error: " + ex.Message);
743-
#endif
738+
#if DEBUG
739+
Debug.WriteLine("ProcessImagePart error: " + ex.Message);
740+
#endif
744741
}
745742
}
746743

@@ -773,7 +770,7 @@ internal override void ProcessSymbolChar(SymbolChar symbolChar, MarkdownStringWr
773770
htmlEntity = $"&#{decimalValue};";
774771
}
775772
sb.Write(htmlEntity);
776-
}
773+
}
777774
}
778775

779776
internal override void ProcessMathElement(OpenXmlElement element, MarkdownStringWriter sb)
@@ -784,7 +781,7 @@ internal override void ProcessMathElement(OpenXmlElement element, MarkdownString
784781
// TODO: Ensure blank line before ?
785782
foreach (var subElement in oMathPara.Elements())
786783
{
787-
if (subElement is M.OfficeMath ||
784+
if (subElement is M.OfficeMath ||
788785
subElement is M.Run)
789786
{
790787
ProcessMathElement(subElement, sb);
@@ -825,7 +822,7 @@ internal override void ProcessMathElement(OpenXmlElement element, MarkdownString
825822
#endif
826823
}
827824
if (!string.IsNullOrWhiteSpace(latex))
828-
{
825+
{
829826
sb.Write($" $` {latex} `$ ");
830827
}
831828
if (element.LastChild != null && !element.LastChild.IsMathElement())

0 commit comments

Comments
 (0)