From 24e81a910400a4391b5bb47e2bac95846c3ef04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89amonn=20McManus?= Date: Tue, 7 Apr 2026 14:02:31 -0700 Subject: [PATCH] Initial support for fenced code blocks in Markdown Javadoc. This removes one of the main areas where Markdown comments would be mangled. PiperOrigin-RevId: 896076206 --- .../java/javadoc/JavadocFormatter.java | 2 + .../java/javadoc/JavadocLexer.java | 25 ++++++-- .../java/javadoc/JavadocWriter.java | 33 ++++++++++- .../java/javadoc/MarkdownPositions.java | 36 ++++++++++-- .../googlejavaformat/java/javadoc/Token.java | 21 +++++++ .../java/JavadocFormattingTest.java | 54 ++++++++++++++++++ .../java/javadoc/MarkdownPositionsTest.java | 57 +++++++++++++++++++ 7 files changed, 215 insertions(+), 13 deletions(-) diff --git a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocFormatter.java b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocFormatter.java index 7538b40dd..bb39a0f22 100644 --- a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocFormatter.java +++ b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocFormatter.java @@ -40,6 +40,7 @@ import com.google.googlejavaformat.java.javadoc.Token.ListItemOpenTag; import com.google.googlejavaformat.java.javadoc.Token.ListOpenTag; import com.google.googlejavaformat.java.javadoc.Token.Literal; +import com.google.googlejavaformat.java.javadoc.Token.MarkdownFencedCodeBlock; import com.google.googlejavaformat.java.javadoc.Token.MoeBeginStripComment; import com.google.googlejavaformat.java.javadoc.Token.MoeEndStripComment; import com.google.googlejavaformat.java.javadoc.Token.OptionalLineBreak; @@ -134,6 +135,7 @@ private static String render(List input, int blockIndent, boolean classic case ParagraphCloseTag unused -> {} case ListItemCloseTag unused -> {} case OptionalLineBreak unused -> {} + case MarkdownFencedCodeBlock t -> output.writeMarkdownFencedCodeBlock(t); } } throw new AssertionError(); 
diff --git a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocLexer.java b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocLexer.java index 404396018..68640a926 100644 --- a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocLexer.java +++ b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocLexer.java @@ -127,11 +127,26 @@ private ImmutableList generateTokens() throws LexException { tokens.add(token); while (!input.isExhausted()) { - for (Token markdownToken : markdownPositions.tokensAt(input.position())) { - boolean consumed = input.tryConsume(markdownToken.value()); - verify(consumed, "Did not consume markdown token: %s", markdownToken); - var unused = input.readAndResetRecorded(); - tokens.add(markdownToken); + boolean moreMarkdown; + do { + moreMarkdown = false; + // If there are one or more markdown tokens at the current position, consume their text and + // add them to the token list. If a token has non-empty text, consuming its text changes the + // position, so we need to start looking for markdown tokens at the new position. It is + // assumed that there are no other tokens (markdown or otherwise) in a non-empty text span + // covered by a markdown token. 
+ for (Token markdownToken : markdownPositions.tokensAt(input.position())) { + tokens.add(markdownToken); + if (!markdownToken.value().isEmpty()) { + boolean consumed = input.tryConsume(markdownToken.value()); + verify(consumed, "Did not consume markdown token: %s", markdownToken); + var unused = input.readAndResetRecorded(); + moreMarkdown = true; + } + } + } while (moreMarkdown); + if (input.isExhausted()) { + break; } token = readToken(); tokens.add(token); diff --git a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocWriter.java b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocWriter.java index b851a1240..019e129fc 100644 --- a/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocWriter.java +++ b/core/src/main/java/com/google/googlejavaformat/java/javadoc/JavadocWriter.java @@ -23,6 +23,7 @@ import static com.google.googlejavaformat.java.javadoc.JavadocWriter.RequestedWhitespace.NONE; import static com.google.googlejavaformat.java.javadoc.JavadocWriter.RequestedWhitespace.WHITESPACE; +import com.google.googlejavaformat.java.javadoc.Token.BrTag; import com.google.googlejavaformat.java.javadoc.Token.CodeCloseTag; import com.google.googlejavaformat.java.javadoc.Token.CodeOpenTag; import com.google.googlejavaformat.java.javadoc.Token.FooterJavadocTagStart; @@ -33,6 +34,7 @@ import com.google.googlejavaformat.java.javadoc.Token.ListItemOpenTag; import com.google.googlejavaformat.java.javadoc.Token.ListOpenTag; import com.google.googlejavaformat.java.javadoc.Token.Literal; +import com.google.googlejavaformat.java.javadoc.Token.MarkdownFencedCodeBlock; import com.google.googlejavaformat.java.javadoc.Token.MoeBeginStripComment; import com.google.googlejavaformat.java.javadoc.Token.MoeEndStripComment; import com.google.googlejavaformat.java.javadoc.Token.PreCloseTag; @@ -310,7 +312,7 @@ void writeHtmlComment(HtmlComment token) { requestNewline(); } - void writeBr(Token token) { + void writeBr(BrTag token) { 
writeToken(token); requestNewline(); @@ -324,6 +326,22 @@ void writeLiteral(Literal token) { writeToken(token); } + void writeMarkdownFencedCodeBlock(MarkdownFencedCodeBlock token) { + flushWhitespace(); + output.append(token.start()); + token + .literal() + .lines() + .forEach( + line -> { + writeNewline(); + output.append(line); + }); + writeNewline(); + output.append(token.end()); + requestBlankLine(); + } + @Override public String toString() { return output.toString(); @@ -350,12 +368,13 @@ enum RequestedWhitespace { BLANK_LINE, } - private void writeToken(Token token) { + private void flushWhitespace() { if (requestedMoeBeginStripComment != null) { requestNewline(); } - if (requestedWhitespace == BLANK_LINE + if (classicJavadoc + && requestedWhitespace == BLANK_LINE && (!postWriteModifiedContinuingListStack.isEmpty() || continuingFooterTag)) { /* * We don't write blank lines inside lists or footer tags, even in cases where we otherwise @@ -374,6 +393,14 @@ private void writeToken(Token token) { writeNewline(); requestedWhitespace = NONE; } + } + + private void writeToken(Token token) { + if (token.value().isEmpty()) { + return; + } + + flushWhitespace(); boolean needWhitespace = (requestedWhitespace == WHITESPACE); /* diff --git a/core/src/main/java/com/google/googlejavaformat/java/javadoc/MarkdownPositions.java b/core/src/main/java/com/google/googlejavaformat/java/javadoc/MarkdownPositions.java index b83ec72b2..be70484d5 100644 --- a/core/src/main/java/com/google/googlejavaformat/java/javadoc/MarkdownPositions.java +++ b/core/src/main/java/com/google/googlejavaformat/java/javadoc/MarkdownPositions.java @@ -25,11 +25,13 @@ import com.google.googlejavaformat.java.javadoc.Token.ListItemCloseTag; import com.google.googlejavaformat.java.javadoc.Token.ListItemOpenTag; import com.google.googlejavaformat.java.javadoc.Token.ListOpenTag; +import com.google.googlejavaformat.java.javadoc.Token.MarkdownFencedCodeBlock; import 
com.google.googlejavaformat.java.javadoc.Token.ParagraphCloseTag; import com.google.googlejavaformat.java.javadoc.Token.ParagraphOpenTag; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.commonmark.node.BulletList; +import org.commonmark.node.FencedCodeBlock; import org.commonmark.node.Heading; import org.commonmark.node.ListItem; import org.commonmark.node.Node; @@ -102,6 +104,25 @@ void visit(Node node) { visitNodeList(paragraph.getNext()); } } + case FencedCodeBlock fencedCodeBlock -> { + // Any indentation before the code block is part of FencedCodeBlock. This makes sense + // because the lines inside the code block must also be indented by that amount. That + // indentation gets subtracted from FencedCodeBlock.getLiteral(), which is the actual text + // represented by the code block. + int start = startPosition(fencedCodeBlock) + fencedCodeBlock.getFenceIndent(); + MarkdownFencedCodeBlock token = + new MarkdownFencedCodeBlock( + input.substring(start, endPosition(fencedCodeBlock)), + fencedCodeBlock + .getFenceCharacter() + .repeat(fencedCodeBlock.getOpeningFenceLength()) + + fencedCodeBlock.getInfo(), + fencedCodeBlock + .getFenceCharacter() + .repeat(fencedCodeBlock.getClosingFenceLength()), + fencedCodeBlock.getLiteral()); + positionToToken.get(start).addLast(token); + } // TODO: others default -> {} } @@ -131,12 +152,17 @@ private void visitNodeList(Node node) { */ private void addSpan(Node node, Token startToken, Token endToken) { // We could write the first part more simply as a `put`, but we do it this way for symmetry. 
- var first = node.getSourceSpans().getFirst(); - int startPosition = first.getInputIndex(); - positionToToken.get(startPosition).addLast(startToken); + positionToToken.get(startPosition(node)).addLast(startToken); + positionToToken.get(endPosition(node)).addFirst(endToken); + } + + private int startPosition(Node node) { + return node.getSourceSpans().getFirst().getInputIndex(); + } + + private int endPosition(Node node) { var last = node.getSourceSpans().getLast(); - int endPosition = last.getInputIndex() + last.getLength(); - positionToToken.get(endPosition).addFirst(endToken); + return last.getInputIndex() + last.getLength(); } } diff --git a/core/src/main/java/com/google/googlejavaformat/java/javadoc/Token.java b/core/src/main/java/com/google/googlejavaformat/java/javadoc/Token.java index fcf67151b..23bdd4b76 100644 --- a/core/src/main/java/com/google/googlejavaformat/java/javadoc/Token.java +++ b/core/src/main/java/com/google/googlejavaformat/java/javadoc/Token.java @@ -103,6 +103,27 @@ record HtmlComment(String value) implements Token {} record BrTag(String value) implements Token {} + /** + * A fenced code block, like: + * + *
+   * ```java
+   * code block
+   * with an info string ("java")
+   * ```
+   * 
+ * + * @param value the full text of the code block as it appeared in the input, including the start + * and end fences and the literal content. + * @param start the start fence, including the info string if any ({@code ```java} in the + * example). + * @param end the end fence. + * @param literal the text that the code block represents. This does not include the start and end + * fences, nor any indentation that precedes these fences and every intervening line. + */ + record MarkdownFencedCodeBlock(String value, String start, String end, String literal) + implements Token {} + /** * Whitespace that is not in a {@code
} or {@code } section. Whitespace includes
    * leading newlines, asterisks, and tabs and spaces. In the output, it is translated to newlines
diff --git a/core/src/test/java/com/google/googlejavaformat/java/JavadocFormattingTest.java b/core/src/test/java/com/google/googlejavaformat/java/JavadocFormattingTest.java
index 3292ba26a..5cf8c2ffa 100644
--- a/core/src/test/java/com/google/googlejavaformat/java/JavadocFormattingTest.java
+++ b/core/src/test/java/com/google/googlejavaformat/java/JavadocFormattingTest.java
@@ -1717,6 +1717,60 @@ class Test {}
 ///
 /// A following paragraph.
 class Test {}
+""";
+    doFormatTest(input, expected);
+  }
+
+  @Test
+  public void markdownFencedCodeBlocks() {
+    assume().that(MARKDOWN_JAVADOC_SUPPORTED).isTrue();
+    // An unrecognized fenced code block would have its lines joined, so lines that stay separate
+    // show it was identified. Expected also drops the extra space after `-` and in the ```` block.
+    String input =
+"""
+/// ```
+/// foo
+/// bar
+/// ```
+///
+/// -  ```
+///    code block
+///    in a list
+///    ```
+///
+/// ~~~java
+/// code block
+/// with tildes and an info string ("java")
+/// ~~~
+///
+///  ````
+///  code block
+///  with more than three backticks and an extra leading space
+///  ````
+class Test {}
+""";
+    String expected =
+"""
+/// ```
+/// foo
+/// bar
+/// ```
+///
+/// - ```
+///   code block
+///   in a list
+///   ```
+///
+/// ~~~java
+/// code block
+/// with tildes and an info string ("java")
+/// ~~~
+///
+/// ````
+/// code block
+/// with more than three backticks and an extra leading space
+/// ````
+class Test {}
 """;
     doFormatTest(input, expected);
   }
diff --git a/core/src/test/java/com/google/googlejavaformat/java/javadoc/MarkdownPositionsTest.java b/core/src/test/java/com/google/googlejavaformat/java/javadoc/MarkdownPositionsTest.java
index ff4b6fa5b..e1b5fcd12 100644
--- a/core/src/test/java/com/google/googlejavaformat/java/javadoc/MarkdownPositionsTest.java
+++ b/core/src/test/java/com/google/googlejavaformat/java/javadoc/MarkdownPositionsTest.java
@@ -23,6 +23,7 @@
 import com.google.googlejavaformat.java.javadoc.Token.ListItemCloseTag;
 import com.google.googlejavaformat.java.javadoc.Token.ListItemOpenTag;
 import com.google.googlejavaformat.java.javadoc.Token.ListOpenTag;
+import com.google.googlejavaformat.java.javadoc.Token.MarkdownFencedCodeBlock;
 import com.google.googlejavaformat.java.javadoc.Token.ParagraphCloseTag;
 import com.google.googlejavaformat.java.javadoc.Token.ParagraphOpenTag;
 import java.util.Map;
@@ -100,6 +101,62 @@ public void heading() {
     assertThat(map).isEqualTo(expected);
   }
 
+  @Test
+  public void codeBlock() { // fenced blocks: in a list item, tilde-fenced with info, indented
+    String text =
+"""
+- ```
+  foo
+  bar
+  ```
+
+~~~java
+code
+with tildes
+~~~
+
+  ````
+  indented code
+  with more than three backticks
+  ````
+""";
+    var positions = MarkdownPositions.parse(text);
+    ImmutableListMultimap<Integer, Token> map = positionToToken(positions, text);
+    int bullet = text.indexOf('-'); // the list marker
+    int firstCodeStart = text.indexOf("```");
+    int firstCodeEnd = text.indexOf("```", firstCodeStart + 3) + 3; // just past the closing fence
+    int secondCodeStart = text.indexOf("~~~", firstCodeEnd);
+    int secondCodeEnd = text.indexOf("~~~", secondCodeStart + 3) + 3;
+    int thirdCodeStart = text.indexOf("````", secondCodeEnd); // the backticks, after the indent
+    int thirdCodeEnd = text.indexOf("````", thirdCodeStart + 4) + 4;
+    ImmutableListMultimap<Integer, Token> expected =
+        ImmutableListMultimap.<Integer, Token>builder()
+            .put(bullet, new ListOpenTag(""))
+            .put(bullet, new ListItemOpenTag("- "))
+            .put(
+                firstCodeStart,
+                new MarkdownFencedCodeBlock(
+                    text.substring(firstCodeStart, firstCodeEnd), "```", "```", "foo\nbar\n"))
+            .put(firstCodeEnd, new ListItemCloseTag("")) // item and list close where the block ends
+            .put(firstCodeEnd, new ListCloseTag(""))
+            .put(
+                secondCodeStart,
+                new MarkdownFencedCodeBlock(
+                    text.substring(secondCodeStart, secondCodeEnd),
+                    "~~~java",
+                    "~~~",
+                    "code\nwith tildes\n"))
+            .put(
+                thirdCodeStart,
+                new MarkdownFencedCodeBlock(
+                    text.substring(thirdCodeStart, thirdCodeEnd),
+                    "````",
+                    "````",
+                    "indented code\nwith more than three backticks\n"))
+            .build();
+    assertThat(map).isEqualTo(expected);
+  }
+
   private static ImmutableListMultimap positionToToken(
       MarkdownPositions positions, String input) {
     return IntStream.rangeClosed(0, input.length())