From 32052d987f43b4f44ea63fda946d6d729682ae86 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 07:45:22 -0800 Subject: [PATCH 1/8] Add negative lookahead to the grammar This adds the `!` prefix which represents negative lookahead. This was included in the original PEG paper, though it was called "NOT", whereas I went with a more explicit "NegativeLookahead". This will be helpful in several productions which need to have these kinds of exclusions. The syntax is also commonly used in regular expression engines which usually use `(?!expr)`. This is also common in many other PEG libraries. There is a small risk this could be confusing, since `!` is sometimes used for other purposes in other contexts. For example, Prolog uses `!` for their cut operator. I think this should be fine since it is common with PEG. --- dev-guide/src/grammar.md | 37 ++++++++++++++----- src/notation.md | 1 + tools/grammar/src/lib.rs | 3 ++ tools/grammar/src/parser.rs | 11 ++++++ .../src/grammar/render_markdown.rs | 5 +++ .../src/grammar/render_railroad.rs | 6 +++ 6 files changed, 53 insertions(+), 10 deletions(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 2f8d41f822..59c3e1e6ab 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -39,19 +39,35 @@ Sequence -> (` `* AdornedExpr)* ` `* Cut | (` `* AdornedExpr)+ -AdornedExpr -> ExprRepeat Suffix? Footnote? +AdornedExpr -> Prefix? Expr1 Quantifier? Suffix? Footnote? + +Prefix -> NegativeLookahead + +NegativeLookahead -> `!` Suffix -> ` _` * `_` Footnote -> `[^` ~[`]` LF]+ `]` -ExprRepeat -> - Expr1 `?` - | Expr1 `*?` - | Expr1 `*` - | Expr1 `+?` - | Expr1 `+` - | Expr1 `{` Range? `..` Range? `}` +Quantifier -> + Optional + | Repeat + | RepeatNonGreedy + | RepeatPlus + | RepeatPlusNonGreedy + | RepeatRange + +Optional -> `?` + +Repeat -> `*` + +RepeatNonGreedy -> `*?` + +RepeatPlus -> `+` + +RepeatPlusNonGreedy -> `+?` + +RepeatRange -> `{` Range? `..` Range? 
`}` Range -> [0-9]+ @@ -120,10 +136,11 @@ The general format is a series of productions separated by blank lines. The expr | Suffix | \_except \[LazyBooleanExpression\]\_ | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited Markdown, but try to avoid anything except basics like links. | | Footnote | \[^extern-safe\] | Adds a footnote, which can supply extra information that may be helpful to the user. The footnote itself should be defined outside of the code block like a normal Markdown footnote. | | Optional | Expr? | The preceding expression is optional. | +| NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. | | Repeat | Expr* | The preceding expression is repeated 0 or more times. | -| Repeat (non-greedy) | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | +| RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. | -| RepeatPlus (non-greedy) | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | +| RepeatPlusNonGreedy | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | | RepeatRange | Expr{2..4} | The preceding expression is repeated between the range of times specified. Either bound can be excluded, which works just like Rust ranges. | ## Automatic linking diff --git a/src/notation.md b/src/notation.md index cda298a734..611d230aa3 100644 --- a/src/notation.md +++ b/src/notation.md @@ -19,6 +19,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | xa..b | HEX_DIGIT1..6 | a to b repetitions of x | | Rule1 Rule2 | `fn` _Name_ _Parameters_ | Sequence of rules in order | | \| | `u8` \| `u16`, Block \| Item | Either one or another | +| ! 
| !COMMENT | Matches if the expression does not follow, without consuming any input | | \[ ] | \[`b` `B`] | Any of the characters listed | | \[ - ] | \[`a`-`z`] | Any of the characters in the range | | ~\[ ] | ~\[`b` `B`] | Any characters, except those listed | diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 70e1a8f9a8..e5e3f57bfe 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -50,6 +50,8 @@ pub enum ExpressionKind { Sequence(Vec), /// `A?` Optional(Box), + /// `!A` + NegativeLookahead(Box), /// `A*` Repeat(Box), /// `A*?` @@ -113,6 +115,7 @@ impl Expression { match &self.kind { ExpressionKind::Grouped(e) | ExpressionKind::Optional(e) + | ExpressionKind::NegativeLookahead(e) | ExpressionKind::Repeat(e) | ExpressionKind::RepeatNonGreedy(e) | ExpressionKind::RepeatPlus(e) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index d4240ae4d7..b25a3546da 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -251,6 +251,8 @@ impl Parser<'_> { self.parse_grouped()? } else if next == b'~' { self.parse_neg_expression()? + } else if next == b'!' { + self.parse_negative_lookahead()? } else { return Ok(None); }; @@ -387,6 +389,15 @@ impl Parser<'_> { Ok(ExpressionKind::NegExpression(box_kind(kind))) } + fn parse_negative_lookahead(&mut self) -> Result { + self.expect("!", "expected !")?; + self.space0(); + let Some(e) = self.parse_expr1()? else { + bail!(self, "expected expression after !"); + }; + Ok(ExpressionKind::NegativeLookahead(Box::new(e))) + } + /// Parse e.g. `F00F` after `U+`. 
fn parse_unicode(&mut self) -> Result { let mut xs = Vec::with_capacity(4); diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index a5540b4169..5869c964ad 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -67,6 +67,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => last_expr(es.last().unwrap()), ExpressionKind::Grouped(_) | ExpressionKind::Optional(_) + | ExpressionKind::NegativeLookahead(_) | ExpressionKind::Repeat(_) | ExpressionKind::RepeatNonGreedy(_) | ExpressionKind::RepeatPlus(_) @@ -119,6 +120,10 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { render_expression(e, cx, output); output.push_str("?"); } + ExpressionKind::NegativeLookahead(e) => { + output.push('!'); + render_expression(e, cx, output); + } ExpressionKind::Repeat(e) => { render_expression(e, cx, output); output.push_str("\\*"); diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 6efb065a34..5d20d77c48 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -139,6 +139,12 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { + let forward = render_expression(e, cx, stack)?; + let lbox = + LabeledBox::new(forward, Comment::new("not followed by".to_string())); + Box::new(lbox) + } // Treat `e?` and `e{..1}` / `e{0..1}` equally. ExpressionKind::Optional(e) | ExpressionKind::RepeatRange(e, None | Some(0), Some(1)) => { From 8c7058df25689e6576200128701412dc8c44d207 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 08:22:17 -0800 Subject: [PATCH 2/8] Add Unicode to character range This adds the ability to specify Unicode code points in a character range. 
This will be useful for defining some productions without using English, and perhaps for making them a little clearer. This also extends the Unicode grammar to allow up to 6 hexadecimal digits (instead of exactly 4) for larger code points. --- dev-guide/src/grammar.md | 8 ++- src/input-format.md | 4 +- tools/grammar/src/lib.rs | 32 ++++++++- tools/grammar/src/parser.rs | 66 ++++++++++++------- .../src/grammar/render_markdown.rs | 24 ++++--- .../src/grammar/render_railroad.rs | 16 ++++- 6 files changed, 111 insertions(+), 39 deletions(-) diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 59c3e1e6ab..3a155b88e8 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -82,7 +82,7 @@ Expr1 -> | Group | NegativeExpression -Unicode -> `U+` [`A`-`Z` `0`-`9`]4..4 +Unicode -> `U+` [`A`-`Z` `0`-`9`]4..6 NonTerminal -> Name @@ -99,7 +99,11 @@ Characters -> | CharacterTerminal | CharacterName -CharacterRange -> BACKTICK BACKTICK `-` BACKTICK BACKTICK +CharacterRange -> Character `-` Character + +Character -> + BACKTICK BACKTICK + | Unicode CharacterTerminal -> Terminal diff --git a/src/input-format.md b/src/input-format.md index 2432da0339..be6bb670b3 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,7 +3,9 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> +@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value + +@root ASCII -> [U+0000-U+007F] NUL -> U+0000 ``` diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index e5e3f57bfe..b558f31aae 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -3,6 +3,7 @@ use diagnostics::{Diagnostics, warn_or_err}; use regex::Regex; use std::collections::{HashMap, HashSet}; +use std::fmt::{Display, Formatter}; use std::path::{Path, PathBuf}; use std::sync::LazyLock; use walkdir::WalkDir; @@ -81,7 +82,7 @@ pub enum ExpressionKind { /// `^ A B C` Cut(Box), /// `U+0060` - Unicode(String), + Unicode((char, String)), } #[derive(Clone, Debug)] @@ -91,7 +92,34 @@ pub enum 
Characters { /// `` `_` `` Terminal(String), /// `` `A`-`Z` `` - Range(char, char), + Range(Character, Character), +} + +#[derive(Clone, Debug)] +pub enum Character { + Char(char), + /// `U+0060` + /// + /// The `String` is the hex digits after `U+`. + Unicode((char, String)), +} + +impl Character { + pub fn get_ch(&self) -> char { + match self { + Character::Char(ch) => *ch, + Character::Unicode((ch, _)) => *ch, + } + } +} + +impl Display for Character { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Character::Char(ch) => write!(f, "`{ch}`"), + Character::Unicode((_, s)) => write!(f, "U+{s}"), + } + } } impl Grammar { diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index b25a3546da..f1f8588770 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the ENBF-like grammar. -use super::{Characters, Expression, ExpressionKind, Grammar, Production}; +use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -221,7 +221,7 @@ impl Parser<'_> { }; let kind = if self.take_str("U+") { - self.parse_unicode()? + ExpressionKind::Unicode(self.parse_unicode()?) } else if self.input[self.index..] .chars() .next() @@ -322,27 +322,19 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. fn parse_characters(&mut self) -> Result> { - if let Some(b'`') = self.peek() { - let recov = self.index; - let a = self.parse_terminal_str()?; + if let Some(a) = self.parse_character()? { if self.take_str("-") { - //~^ Parse `` `a`-`b` `` character range. 
- if a.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid start terminal in range"); - } - let recov = self.index; - let b = self.parse_terminal_str()?; - if b.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid end terminal in range"); - } - let a = a.chars().next().unwrap(); - let b = b.chars().next().unwrap(); + let Some(b) = self.parse_character()? else { + bail!(self, "expected character in range"); + }; Ok(Some(Characters::Range(a, b))) } else { //~^ Parse terminal in backticks. - Ok(Some(Characters::Terminal(a))) + let t = match a { + Character::Char(ch) => ch.to_string(), + Character::Unicode(_) => bail!(self, "unicode not supported"), + }; + Ok(Some(Characters::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. @@ -352,6 +344,23 @@ impl Parser<'_> { } } + fn parse_character(&mut self) -> Result> { + if let Some(b'`') = self.peek() { + let recov = self.index; + let term = self.parse_terminal_str()?; + if term.len() > 1 { + self.index = recov + 1; + bail!(self, "invalid start terminal in range"); + } + let ch = term.chars().next().unwrap(); + Ok(Some(Character::Char(ch))) + } else if self.take_str("U+") { + Ok(Some(Character::Unicode(self.parse_unicode()?))) + } else { + Ok(None) + } + } + /// Parse e.g. ``. fn parse_prose(&mut self) -> Result { self.expect("<", "expected opening `<`")?; @@ -399,9 +408,9 @@ impl Parser<'_> { } /// Parse e.g. `F00F` after `U+`. 
- fn parse_unicode(&mut self) -> Result { - let mut xs = Vec::with_capacity(4); - for _ in 0..4 { + fn parse_unicode(&mut self) -> Result<(char, String)> { + let mut xs = Vec::with_capacity(6); + let mut push_next = || { match self.peek() { Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => { xs.push(x); @@ -409,8 +418,19 @@ impl Parser<'_> { } _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"), } + Ok(()) + }; + for _ in 0..4 { + push_next()?; + } + for _ in 0..2 { + if push_next().is_err() { + break; + } } - Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap())) + let s = String::from_utf8(xs).unwrap(); + let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap(); + Ok((ch, s)) } /// Parse `?` after expression. diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 5869c964ad..1a311aa151 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -181,7 +181,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { output.push_str("^ "); render_expression(e, cx, output); } - ExpressionKind::Unicode(s) => { + ExpressionKind::Unicode((_, s)) => { output.push_str("U+"); output.push_str(s); } @@ -222,12 +222,20 @@ fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { markdown_escape(s) ) .unwrap(), - Characters::Range(a, b) => write!( - output, - "{a}\ - -{b}" - ) - .unwrap(), + Characters::Range(a, b) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + 
Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, output); + output.push('-'); + write_ch(b, output); + } } } diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 5d20d77c48..94b2c1d261 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -225,7 +225,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box::new(Terminal::new(format!("U+{}", s))), + ExpressionKind::Unicode((_, s)) => Box::new(Terminal::new(format!("U+{}", s))), }; } }; @@ -244,7 +244,17 @@ fn render_characters(chars: &Characters, cx: &RenderCtx) -> Box { match chars { Characters::Named(s) => node_for_nt(cx, s), Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => Box::new(Terminal::new(format!("{a}-{b}"))), + Characters::Range(a, b) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, &mut s); + s.push('-'); + write_ch(b, &mut s); + Box::new(Terminal::new(s)) + } } } From 42f14b0ed670fae2da11eee35dc112c29c1c5d1d Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:26:19 -0800 Subject: [PATCH 3/8] Use negative lookahead in the grammar This replaces some suffixes and prose with the new negative lookahead syntax instead. This should all have the same meaning. 
--- src/identifiers.md | 2 +- src/input-format.md | 2 +- src/tokens.md | 30 +++++++++++++----------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/identifiers.md b/src/identifiers.md index 979284a1c7..5abe303ca2 100644 --- a/src/identifiers.md +++ b/src/identifiers.md @@ -16,7 +16,7 @@ NON_KEYWORD_IDENTIFIER -> IDENTIFIER_OR_KEYWORD _except a [strict][lex.keywords. IDENTIFIER -> NON_KEYWORD_IDENTIFIER | RAW_IDENTIFIER RESERVED_RAW_IDENTIFIER -> - `r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by XID_Continue_ + `r#` (`_` | `crate` | `self` | `Self` | `super`) !XID_Continue ``` diff --git a/src/input-format.md b/src/input-format.md index be6bb670b3..d6eca2dc2d 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -5,7 +5,7 @@ r[input.syntax] ```grammar,lexer @root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value -@root ASCII -> [U+0000-U+007F] +ASCII -> [U+0000-U+007F] NUL -> U+0000 ``` diff --git a/src/tokens.md b/src/tokens.md index b6a0124320..fb5e06da26 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -115,7 +115,7 @@ r[lex.token.literal.suffix.syntax] ```grammar,lexer SUFFIX -> IDENTIFIER_OR_KEYWORD _except `_`_ -SUFFIX_NO_E -> SUFFIX _not beginning with `e` or `E`_ +SUFFIX_NO_E -> ![`e` `E`] SUFFIX ``` r[lex.token.literal.suffix.validity] @@ -253,8 +253,7 @@ r[lex.token.byte.syntax] BYTE_LITERAL -> `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX? -ASCII_FOR_CHAR -> - +ASCII_FOR_CHAR -> ![`'` `\` LF CR TAB] ASCII BYTE_ESCAPE -> `\x` HEX_DIGIT HEX_DIGIT @@ -272,8 +271,7 @@ r[lex.token.str-byte.syntax] BYTE_STRING_LITERAL -> `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX? -ASCII_FOR_STRING -> - +ASCII_FOR_STRING -> ![`"` `\` CR] ASCII ``` r[lex.token.str-byte.intro] @@ -309,8 +307,7 @@ RAW_BYTE_STRING_CONTENT -> `"` ^ ASCII_FOR_RAW*? 
`"` | `#` RAW_BYTE_STRING_CONTENT `#` -ASCII_FOR_RAW -> - +ASCII_FOR_RAW -> !CR ASCII ``` r[lex.token.str-byte-raw.intro] @@ -559,7 +556,7 @@ r[lex.token.literal.float.syntax] FLOAT_LITERAL -> DEC_LITERAL (`.` DEC_LITERAL)? FLOAT_EXPONENT SUFFIX? | DEC_LITERAL `.` DEC_LITERAL SUFFIX_NO_E? - | DEC_LITERAL `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | DEC_LITERAL `.` !(`.` | `_` | XID_Start) FLOAT_EXPONENT -> (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* @@ -608,13 +605,12 @@ r[lex.token.literal.reserved.syntax] RESERVED_NUMBER -> BIN_LITERAL [`2`-`9`] | OCT_LITERAL [`8`-`9`] - | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` !(`.` | `_` | XID_Start) | ( BIN_LITERAL | OCT_LITERAL ) (`e`|`E`) - | `0b` `_`* - | `0o` `_`* - | `0x` `_`* + | `0b` `_`* !BIN_DIGIT + | `0o` `_`* !OCT_DIGIT + | `0x` `_`* !HEX_DIGIT | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? 
- ``` r[lex.token.literal.reserved.intro] @@ -657,16 +653,16 @@ r[lex.token.life.syntax] ```grammar,lexer LIFETIME_TOKEN -> RAW_LIFETIME - | `'` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + | `'` IDENTIFIER_OR_KEYWORD !`'` LIFETIME_OR_LABEL -> RAW_LIFETIME - | `'` NON_KEYWORD_IDENTIFIER _not immediately followed by `'`_ + | `'` NON_KEYWORD_IDENTIFIER !`'` RAW_LIFETIME -> - `'r#` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + `'r#` ^ IDENTIFIER_OR_KEYWORD !`'` -RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by `'`_ +RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) !(`'` | XID_Continue) ``` r[lex.token.life.intro] From 259df9111c1a2474e44d6c336589d3393b042290 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:30:07 -0800 Subject: [PATCH 4/8] Fix LINE_COMMENT grammar This clarifies that bare `//` is explicitly meant to be either followed by LF or EOF. Otherwise it incorrectly matches other comment rules. 
--- src/comments.md | 3 ++- src/input-format.md | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/comments.md b/src/comments.md index a240e7dc58..1320077938 100644 --- a/src/comments.md +++ b/src/comments.md @@ -5,7 +5,8 @@ r[comments.syntax] ```grammar,lexer @root LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* - | `//` + | `//` EOF + | `//` _immediately followed by LF_ BLOCK_COMMENT -> `/*` diff --git a/src/input-format.md b/src/input-format.md index d6eca2dc2d..3e35cba1ee 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,11 +3,13 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value +CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value ASCII -> [U+0000-U+007F] NUL -> U+0000 + +EOF -> !CHAR // End of file or input ``` r[input.intro] From feaa3886c9f09f2880052a65d965ee28cf075694 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:35:57 -0800 Subject: [PATCH 5/8] Fix BLOCK_COMMENT order This fixes the BLOCK_COMMENT grammar so that it follows the rule that the first alternative that matches wins. Because of the cut operator, the previous grammar would fail to parse the `/**/` and `/***/` forms, which were listed after the general `/*` alternative. 
--- src/comments.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/comments.md b/src/comments.md index 1320077938..e82cd28ace 100644 --- a/src/comments.md +++ b/src/comments.md @@ -9,12 +9,13 @@ r[comments.syntax] | `//` _immediately followed by LF_ BLOCK_COMMENT -> - `/*` + `/**/` + | `/***/` + | `/*` + ^ ( ~[`*` `!`] | `**` | BLOCK_COMMENT_OR_DOC ) ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` - | `/**/` - | `/***/` @root INNER_LINE_DOC -> `//!` ~[LF CR]* From f2ca8392652141e7527a714191ae0873d2f965ca Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 09:54:48 -0800 Subject: [PATCH 6/8] Fix handling of carriage returns in doc comments This fixes the doc comments so that they properly handle a carriage return by using the cut operator. Rustc will fail parsing if a doc comment contains a carriage return. This requires including (LF|EOF) at the end of line so the cut operator has something to complete the line. This also removes the negative `/` from OUTER_LINE_DOC. This does not work correctly with the check for CR, and is not needed because LINE_COMMENT already matches `////`. Later I plan to include a rule for comments that makes it clear the order that they are parsed. A negative lookahead is necessary in OUTER_BLOCK_DOC to prevent it from trying to parse what should be a BLOCK_COMMENT as an OUTER_BLOCK_DOC and failing due to the cut operator. --- src/comments.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/comments.md b/src/comments.md index e82cd28ace..6e4c06744f 100644 --- a/src/comments.md +++ b/src/comments.md @@ -18,20 +18,25 @@ BLOCK_COMMENT -> `*/` @root INNER_LINE_DOC -> - `//!` ~[LF CR]* + `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) + +LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> - `/*!` ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* `*/` + `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` @root OUTER_LINE_DOC -> - `///` (~`/` ~[LF CR]*)? 
+ `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> - `/**` + `/**` ![`*` `/`] + ^ ( ~`*` | BLOCK_COMMENT_OR_DOC ) - ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* + ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` +BLOCK_CHAR -> (!(`*/` | CR) CHAR) + @root BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC From 2d2850a0ec4345e35248d017ac6f8760009f09ce Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 10:27:55 -0800 Subject: [PATCH 7/8] Add a new COMMENT grammar rule This is intended to indicate the order that the rules are expected to be processed (as defined in this grammar). Of course real parsers can take a different approach if they have the same results. This is roughly similar to the order that rustc takes, though [`block_comment`](https://github.com/rust-lang/rust/blob/d7daac06d87e1252d10eaa44960164faac46beff/compiler/rustc_lexer/src/lib.rs#L782-L817) roughly takes the approach of combining the `/*` prefix, and then deciding if it is an inner doc comment, outer doc comment, or else a regular block comment. LINE_COMMENT must be first so that it is not confused with a doc comment. BLOCK_COMMENT must be last so that its cut operator does not interfere with doc comments that start with `/*`. It could be moved up higher in the list if it had negative lookahead to disambiguate OUTER_BLOCK_DOC, but the expression for that is more complicated than the one in OUTER_BLOCK_DOC. 
--- src/comments.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/comments.md b/src/comments.md index 6e4c06744f..bbb3332539 100644 --- a/src/comments.md +++ b/src/comments.md @@ -3,7 +3,15 @@ r[comments] r[comments.syntax] ```grammar,lexer -@root LINE_COMMENT -> +@root COMMENT -> + LINE_COMMENT + | INNER_LINE_DOC + | OUTER_LINE_DOC + | INNER_BLOCK_DOC + | OUTER_BLOCK_DOC + | BLOCK_COMMENT + +LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* | `//` EOF | `//` _immediately followed by LF_ @@ -17,7 +25,7 @@ BLOCK_COMMENT -> ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` -@root INNER_LINE_DOC -> +INNER_LINE_DOC -> `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* @@ -25,7 +33,7 @@ LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root OUTER_LINE_DOC -> +OUTER_LINE_DOC -> `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> @@ -37,7 +45,7 @@ OUTER_BLOCK_DOC -> BLOCK_CHAR -> (!(`*/` | CR) CHAR) -@root BLOCK_COMMENT_OR_DOC -> +BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC | INNER_BLOCK_DOC From 2bd22f87a09de22e7d15e7c98dbed7435f5056e9 Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Fri, 13 Feb 2026 10:28:57 -0800 Subject: [PATCH 8/8] Fix desugaring of doc comments rustc actually includes the spaces for doc comments. --- src/comments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comments.md b/src/comments.md index bbb3332539..ef283a9ea1 100644 --- a/src/comments.md +++ b/src/comments.md @@ -66,7 +66,7 @@ r[comments.doc.syntax] Line doc comments beginning with exactly _three_ slashes (`///`), and block doc comments (`/** ... */`), both outer doc comments, are interpreted as a special syntax for [`doc` attributes]. 
r[comments.doc.attributes] -That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc="Foo"]` and `/** Bar */` turns into `#[doc="Bar"]`. They must therefore appear before something that accepts an outer attribute. +That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc=" Foo"]` and `/** Bar */` turns into `#[doc=" Bar "]`. They must therefore appear before something that accepts an outer attribute. r[comments.doc.inner-syntax] Line comments beginning with `//!` and block comments `/*! ... */` are doc comments that apply to the parent of the comment, rather than the item that follows.