diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md index 2f8d41f822..3a155b88e8 100644 --- a/dev-guide/src/grammar.md +++ b/dev-guide/src/grammar.md @@ -39,19 +39,35 @@ Sequence -> (` `* AdornedExpr)* ` `* Cut | (` `* AdornedExpr)+ -AdornedExpr -> ExprRepeat Suffix? Footnote? +AdornedExpr -> Prefix? Expr1 Quantifier? Suffix? Footnote? + +Prefix -> NegativeLookahead + +NegativeLookahead -> `!` Suffix -> ` _` * `_` Footnote -> `[^` ~[`]` LF]+ `]` -ExprRepeat -> - Expr1 `?` - | Expr1 `*?` - | Expr1 `*` - | Expr1 `+?` - | Expr1 `+` - | Expr1 `{` Range? `..` Range? `}` +Quantifier -> + Optional + | Repeat + | RepeatNonGreedy + | RepeatPlus + | RepeatPlusNonGreedy + | RepeatRange + +Optional -> `?` + +Repeat -> `*` + +RepeatNonGreedy -> `*?` + +RepeatPlus -> `+` + +RepeatPlusNonGreedy -> `+?` + +RepeatRange -> `{` Range? `..` Range? `}` Range -> [0-9]+ @@ -66,7 +82,7 @@ Expr1 -> | Group | NegativeExpression -Unicode -> `U+` [`A`-`Z` `0`-`9`]4..4 +Unicode -> `U+` [`A`-`Z` `0`-`9`]4..6 NonTerminal -> Name @@ -83,7 +99,11 @@ Characters -> | CharacterTerminal | CharacterName -CharacterRange -> BACKTICK BACKTICK `-` BACKTICK BACKTICK +CharacterRange -> Character `-` Character + +Character -> + BACKTICK BACKTICK + | Unicode CharacterTerminal -> Terminal @@ -120,10 +140,11 @@ The general format is a series of productions separated by blank lines. The expr | Suffix | \_except \[LazyBooleanExpression\]\_ | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited Markdown, but try to avoid anything except basics like links. | | Footnote | \[^extern-safe\] | Adds a footnote, which can supply extra information that may be helpful to the user. The footnote itself should be defined outside of the code block like a normal Markdown footnote. | | Optional | Expr? | The preceding expression is optional. | +| NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. | | Repeat | Expr* | The preceding expression is repeated 0 or more times. | -| Repeat (non-greedy) | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | +| RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. | | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. | -| RepeatPlus (non-greedy) | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | +| RepeatPlusNonGreedy | Expr+? | The preceding expression is repeated 1 or more times without being greedy. | | RepeatRange | Expr{2..4} | The preceding expression is repeated between the range of times specified. Either bound can be excluded, which works just like Rust ranges. | ## Automatic linking diff --git a/src/comments.md b/src/comments.md index a240e7dc58..ef283a9ea1 100644 --- a/src/comments.md +++ b/src/comments.md @@ -3,34 +3,49 @@ r[comments] r[comments.syntax] ```grammar,lexer -@root LINE_COMMENT -> +@root COMMENT -> + LINE_COMMENT + | INNER_LINE_DOC + | OUTER_LINE_DOC + | INNER_BLOCK_DOC + | OUTER_BLOCK_DOC + | BLOCK_COMMENT + +LINE_COMMENT -> `//` (~[`/` `!` LF] | `//`) ~LF* - | `//` + | `//` EOF + | `//` _immediately followed by LF_ BLOCK_COMMENT -> - `/*` + `/**/` + | `/***/` + | `/*` + ^ ( ~[`*` `!`] | `**` | BLOCK_COMMENT_OR_DOC ) ( BLOCK_COMMENT_OR_DOC | ~`*/` )* `*/` - | `/**/` - | `/***/` -@root INNER_LINE_DOC -> - `//!` ~[LF CR]* +INNER_LINE_DOC -> + `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) + +LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)* INNER_BLOCK_DOC -> - `/*!` ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* `*/` + `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root OUTER_LINE_DOC -> - `///` (~`/` ~[LF CR]*)? +OUTER_LINE_DOC -> + `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF) OUTER_BLOCK_DOC -> - `/**` + `/**` ![`*` `/`] + ^ ( ~`*` | BLOCK_COMMENT_OR_DOC ) - ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* + ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/` -@root BLOCK_COMMENT_OR_DOC -> +BLOCK_CHAR -> (!(`*/` | CR) CHAR) + +BLOCK_COMMENT_OR_DOC -> BLOCK_COMMENT | OUTER_BLOCK_DOC | INNER_BLOCK_DOC @@ -51,7 +66,7 @@ r[comments.doc.syntax] Line doc comments beginning with exactly _three_ slashes (`///`), and block doc comments (`/** ... */`), both outer doc comments, are interpreted as a special syntax for [`doc` attributes]. r[comments.doc.attributes] -That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc="Foo"]` and `/** Bar */` turns into `#[doc="Bar"]`. They must therefore appear before something that accepts an outer attribute. +That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc=" Foo"]` and `/** Bar */` turns into `#[doc=" Bar "]`. They must therefore appear before something that accepts an outer attribute. r[comments.doc.inner-syntax] Line comments beginning with `//!` and block comments `/*! ... */` are doc comments that apply to the parent of the comment, rather than the item that follows. diff --git a/src/identifiers.md b/src/identifiers.md index 979284a1c7..5abe303ca2 100644 --- a/src/identifiers.md +++ b/src/identifiers.md @@ -16,7 +16,7 @@ NON_KEYWORD_IDENTIFIER -> IDENTIFIER_OR_KEYWORD _except a [strict][lex.keywords. IDENTIFIER -> NON_KEYWORD_IDENTIFIER | RAW_IDENTIFIER RESERVED_RAW_IDENTIFIER -> - `r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by XID_Continue_ + `r#` (`_` | `crate` | `self` | `Self` | `super`) !XID_Continue ``` diff --git a/src/input-format.md b/src/input-format.md index 2432da0339..3e35cba1ee 100644 --- a/src/input-format.md +++ b/src/input-format.md @@ -3,9 +3,13 @@ r[input] r[input.syntax] ```grammar,lexer -@root CHAR -> +CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value + +ASCII -> [U+0000-U+007F] NUL -> U+0000 + +EOF -> !CHAR // End of file or input ``` r[input.intro] diff --git a/src/notation.md b/src/notation.md index cda298a734..611d230aa3 100644 --- a/src/notation.md +++ b/src/notation.md @@ -19,6 +19,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets: | xa..b | HEX_DIGIT1..6 | a to b repetitions of x | | Rule1 Rule2 | `fn` _Name_ _Parameters_ | Sequence of rules in order | | \| | `u8` \| `u16`, Block \| Item | Either one or another | +| ! | !COMMENT | Matches if the expression does not follow, without consuming any input | | \[ ] | \[`b` `B`] | Any of the characters listed | | \[ - ] | \[`a`-`z`] | Any of the characters in the range | | ~\[ ] | ~\[`b` `B`] | Any characters, except those listed | diff --git a/src/tokens.md b/src/tokens.md index b6a0124320..fb5e06da26 100644 --- a/src/tokens.md +++ b/src/tokens.md @@ -115,7 +115,7 @@ r[lex.token.literal.suffix.syntax] ```grammar,lexer SUFFIX -> IDENTIFIER_OR_KEYWORD _except `_`_ -SUFFIX_NO_E -> SUFFIX _not beginning with `e` or `E`_ +SUFFIX_NO_E -> ![`e` `E`] SUFFIX ``` r[lex.token.literal.suffix.validity] @@ -253,8 +253,7 @@ r[lex.token.byte.syntax] BYTE_LITERAL -> `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE ) `'` SUFFIX? -ASCII_FOR_CHAR -> - +ASCII_FOR_CHAR -> ![`'` `\` LF CR TAB] ASCII BYTE_ESCAPE -> `\x` HEX_DIGIT HEX_DIGIT @@ -272,8 +271,7 @@ r[lex.token.str-byte.syntax] BYTE_STRING_LITERAL -> `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX? -ASCII_FOR_STRING -> - +ASCII_FOR_STRING -> ![`"` `\` CR] ASCII ``` r[lex.token.str-byte.intro] @@ -309,8 +307,7 @@ RAW_BYTE_STRING_CONTENT -> `"` ^ ASCII_FOR_RAW*? `"` | `#` RAW_BYTE_STRING_CONTENT `#` -ASCII_FOR_RAW -> - +ASCII_FOR_RAW -> !CR ASCII ``` r[lex.token.str-byte-raw.intro] @@ -559,7 +556,7 @@ r[lex.token.literal.float.syntax] FLOAT_LITERAL -> DEC_LITERAL (`.` DEC_LITERAL)? FLOAT_EXPONENT SUFFIX? | DEC_LITERAL `.` DEC_LITERAL SUFFIX_NO_E? - | DEC_LITERAL `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | DEC_LITERAL `.` !(`.` | `_` | XID_Start) FLOAT_EXPONENT -> (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)* @@ -608,13 +605,12 @@ r[lex.token.literal.reserved.syntax] RESERVED_NUMBER -> BIN_LITERAL [`2`-`9`] | OCT_LITERAL [`8`-`9`] - | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` _not immediately followed by `.`, `_` or an XID_Start character_ + | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` !(`.` | `_` | XID_Start) | ( BIN_LITERAL | OCT_LITERAL ) (`e`|`E`) - | `0b` `_`* - | `0o` `_`* - | `0x` `_`* + | `0b` `_`* !BIN_DIGIT + | `0o` `_`* !OCT_DIGIT + | `0x` `_`* !HEX_DIGIT | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? - ``` r[lex.token.literal.reserved.intro] @@ -657,16 +653,16 @@ r[lex.token.life.syntax] ```grammar,lexer LIFETIME_TOKEN -> RAW_LIFETIME - | `'` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + | `'` IDENTIFIER_OR_KEYWORD !`'` LIFETIME_OR_LABEL -> RAW_LIFETIME - | `'` NON_KEYWORD_IDENTIFIER _not immediately followed by `'`_ + | `'` NON_KEYWORD_IDENTIFIER !`'` RAW_LIFETIME -> - `'r#` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_ + `'r#` ^ IDENTIFIER_OR_KEYWORD !`'` -RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by `'`_ +RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) !(`'` | XID_Continue) ``` r[lex.token.life.intro] diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 70e1a8f9a8..b558f31aae 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -3,6 +3,7 @@ use diagnostics::{Diagnostics, warn_or_err}; use regex::Regex; use std::collections::{HashMap, HashSet}; +use std::fmt::{Display, Formatter}; use std::path::{Path, PathBuf}; use std::sync::LazyLock; use walkdir::WalkDir; @@ -50,6 +51,8 @@ pub enum ExpressionKind { Sequence(Vec), /// `A?` Optional(Box), + /// `!A` + NegativeLookahead(Box), /// `A*` Repeat(Box), /// `A*?` @@ -79,7 +82,7 @@ pub enum ExpressionKind { /// `^ A B C` Cut(Box), /// `U+0060` - Unicode(String), + Unicode((char, String)), } #[derive(Clone, Debug)] @@ -89,7 +92,34 @@ pub enum Characters { /// `` `_` `` Terminal(String), /// `` `A`-`Z` `` - Range(char, char), + Range(Character, Character), +} + +#[derive(Clone, Debug)] +pub enum Character { + Char(char), + /// `U+0060` + /// + /// The `String` is the hex digits after `U+`. + Unicode((char, String)), +} + +impl Character { + pub fn get_ch(&self) -> char { + match self { + Character::Char(ch) => *ch, + Character::Unicode((ch, _)) => *ch, + } + } +} + +impl Display for Character { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Character::Char(ch) => write!(f, "`{ch}`"), + Character::Unicode((_, s)) => write!(f, "U+{s}"), + } + } } impl Grammar { @@ -113,6 +143,7 @@ impl Expression { match &self.kind { ExpressionKind::Grouped(e) | ExpressionKind::Optional(e) + | ExpressionKind::NegativeLookahead(e) | ExpressionKind::Repeat(e) | ExpressionKind::RepeatNonGreedy(e) | ExpressionKind::RepeatPlus(e) diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index d4240ae4d7..f1f8588770 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the ENBF-like grammar. -use super::{Characters, Expression, ExpressionKind, Grammar, Production}; +use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -221,7 +221,7 @@ impl Parser<'_> { }; let kind = if self.take_str("U+") { - self.parse_unicode()? + ExpressionKind::Unicode(self.parse_unicode()?) } else if self.input[self.index..] .chars() .next() @@ -251,6 +251,8 @@ impl Parser<'_> { self.parse_grouped()? } else if next == b'~' { self.parse_neg_expression()? + } else if next == b'!' { + self.parse_negative_lookahead()? } else { return Ok(None); }; @@ -320,27 +322,19 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. fn parse_characters(&mut self) -> Result> { - if let Some(b'`') = self.peek() { - let recov = self.index; - let a = self.parse_terminal_str()?; + if let Some(a) = self.parse_character()? { if self.take_str("-") { - //~^ Parse `` `a`-`b` `` character range. - if a.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid start terminal in range"); - } - let recov = self.index; - let b = self.parse_terminal_str()?; - if b.len() > 1 { - self.index = recov + 1; - bail!(self, "invalid end terminal in range"); - } - let a = a.chars().next().unwrap(); - let b = b.chars().next().unwrap(); + let Some(b) = self.parse_character()? else { + bail!(self, "expected character in range"); + }; Ok(Some(Characters::Range(a, b))) } else { //~^ Parse terminal in backticks. - Ok(Some(Characters::Terminal(a))) + let t = match a { + Character::Char(ch) => ch.to_string(), + Character::Unicode(_) => bail!(self, "unicode not supported"), + }; + Ok(Some(Characters::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. @@ -350,6 +344,23 @@ impl Parser<'_> { } } + fn parse_character(&mut self) -> Result> { + if let Some(b'`') = self.peek() { + let recov = self.index; + let term = self.parse_terminal_str()?; + if term.len() > 1 { + self.index = recov + 1; + bail!(self, "invalid start terminal in range"); + } + let ch = term.chars().next().unwrap(); + Ok(Some(Character::Char(ch))) + } else if self.take_str("U+") { + Ok(Some(Character::Unicode(self.parse_unicode()?))) + } else { + Ok(None) + } + } + /// Parse e.g. ``. fn parse_prose(&mut self) -> Result { self.expect("<", "expected opening `<`")?; @@ -387,10 +398,19 @@ impl Parser<'_> { Ok(ExpressionKind::NegExpression(box_kind(kind))) } + fn parse_negative_lookahead(&mut self) -> Result { + self.expect("!", "expected !")?; + self.space0(); + let Some(e) = self.parse_expr1()? else { + bail!(self, "expected expression after !"); + }; + Ok(ExpressionKind::NegativeLookahead(Box::new(e))) + } + /// Parse e.g. `F00F` after `U+`. - fn parse_unicode(&mut self) -> Result { - let mut xs = Vec::with_capacity(4); - for _ in 0..4 { + fn parse_unicode(&mut self) -> Result<(char, String)> { + let mut xs = Vec::with_capacity(6); + let mut push_next = || { match self.peek() { Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => { xs.push(x); @@ -398,8 +418,19 @@ impl Parser<'_> { } _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"), } + Ok(()) + }; + for _ in 0..4 { + push_next()?; + } + for _ in 0..2 { + if push_next().is_err() { + break; + } } - Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap())) + let s = String::from_utf8(xs).unwrap(); + let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap(); + Ok((ch, s)) } /// Parse `?` after expression. diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index a5540b4169..1a311aa151 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -67,6 +67,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => last_expr(es.last().unwrap()), ExpressionKind::Grouped(_) | ExpressionKind::Optional(_) + | ExpressionKind::NegativeLookahead(_) | ExpressionKind::Repeat(_) | ExpressionKind::RepeatNonGreedy(_) | ExpressionKind::RepeatPlus(_) @@ -119,6 +120,10 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { render_expression(e, cx, output); output.push_str("?"); } + ExpressionKind::NegativeLookahead(e) => { + output.push('!'); + render_expression(e, cx, output); + } ExpressionKind::Repeat(e) => { render_expression(e, cx, output); output.push_str("\\*"); @@ -176,7 +181,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { output.push_str("^ "); render_expression(e, cx, output); } - ExpressionKind::Unicode(s) => { + ExpressionKind::Unicode((_, s)) => { output.push_str("U+"); output.push_str(s); } @@ -217,12 +222,20 @@ fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { markdown_escape(s) ) .unwrap(), - Characters::Range(a, b) => write!( - output, - "{a}\ - -{b}" - ) - .unwrap(), + Characters::Range(a, b) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, output); + output.push('-'); + write_ch(b, output); + } } } diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index 6efb065a34..94b2c1d261 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Characters, Expression, ExpressionKind, Production}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -139,6 +139,12 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option { + let forward = render_expression(e, cx, stack)?; + let lbox = + LabeledBox::new(forward, Comment::new("not followed by".to_string())); + Box::new(lbox) + } // Treat `e?` and `e{..1}` / `e{0..1}` equally. ExpressionKind::Optional(e) | ExpressionKind::RepeatRange(e, None | Some(0), Some(1)) => { @@ -219,7 +225,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box::new(Terminal::new(format!("U+{}", s))), + ExpressionKind::Unicode((_, s)) => Box::new(Terminal::new(format!("U+{}", s))), }; } }; @@ -238,7 +244,17 @@ fn render_characters(chars: &Characters, cx: &RenderCtx) -> Box { match chars { Characters::Named(s) => node_for_nt(cx, s), Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => Box::new(Terminal::new(format!("{a}-{b}"))), + Characters::Range(a, b) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(a, &mut s); + s.push('-'); + write_ch(b, &mut s); + Box::new(Terminal::new(s)) + } } }