From 32052d987f43b4f44ea63fda946d6d729682ae86 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 07:45:22 -0800
Subject: [PATCH 1/8] Add negative lookahead to the grammar

This adds the `!` prefix which represents negative lookahead. This was
included in the original PEG paper, though it was called "NOT", whereas
I went with a more explicit "NegativeLookahead".

This will be helpful in several productions which need to have these
kinds of exclusions.

The syntax is also commonly used in regular expression engines which
usually use `(?!expr)`. This is also common in many other PEG libraries.

There is a small risk this could be confusing, since `!` is sometimes
used for other purposes in other contexts. For example, Prolog uses `!`
for its cut operator. I think this should be fine since it is common
with PEG.
---
 dev-guide/src/grammar.md                      | 37 ++++++++++++++-----
 src/notation.md                               |  1 +
 tools/grammar/src/lib.rs                      |  3 ++
 tools/grammar/src/parser.rs                   | 11 ++++++
 .../src/grammar/render_markdown.rs            |  5 +++
 .../src/grammar/render_railroad.rs            |  6 +++
 6 files changed, 53 insertions(+), 10 deletions(-)
diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md
index 2f8d41f822..59c3e1e6ab 100644
--- a/dev-guide/src/grammar.md
+++ b/dev-guide/src/grammar.md
@@ -39,19 +39,35 @@ Sequence ->
         (` `* AdornedExpr)* ` `* Cut
       | (` `* AdornedExpr)+
 
-AdornedExpr -> ExprRepeat Suffix? Footnote?
+AdornedExpr -> Prefix? Expr1 Quantifier? Suffix? Footnote?
+
+Prefix -> NegativeLookahead
+
+NegativeLookahead -> `!`
 
 Suffix -> ` _` <not underscore, unless in backtick>* `_`
 
 Footnote -> `[^` ~[`]` LF]+ `]`
 
-ExprRepeat ->
-      Expr1 `?`
-    | Expr1 `*?`
-    | Expr1 `*`
-    | Expr1 `+?`
-    | Expr1 `+`
-    | Expr1 `{` Range? `..` Range? `}`
+Quantifier ->
+      Optional
+    | Repeat
+    | RepeatNonGreedy
+    | RepeatPlus
+    | RepeatPlusNonGreedy
+    | RepeatRange
+
+Optional -> `?`
+
+Repeat -> `*`
+
+RepeatNonGreedy -> `*?`
+
+RepeatPlus -> `+`
+
+RepeatPlusNonGreedy -> `+?`
+
+RepeatRange -> `{` Range? `..` Range? `}`
 
 Range -> [0-9]+
 
@@ -120,10 +136,11 @@ The general format is a series of productions separated by blank lines. The expr
 | Suffix | \_except \[LazyBooleanExpression\]\_  | Adds a suffix to the previous expression to provide an additional English description, rendered in subscript. This can contain limited Markdown, but try to avoid anything except basics like links. |
 | Footnote | \[^extern-safe\] | Adds a footnote, which can supply extra information that may be helpful to the user. The footnote itself should be defined outside of the code block like a normal Markdown footnote. |
 | Optional | Expr? | The preceding expression is optional. |
+| NegativeLookahead | !Expr | Matches if Expr does not follow, without consuming any input. |
 | Repeat | Expr* | The preceding expression is repeated 0 or more times. |
-| Repeat (non-greedy) | Expr*? | The preceding expression is repeated 0 or more times without being greedy. |
+| RepeatNonGreedy | Expr*? | The preceding expression is repeated 0 or more times without being greedy. |
 | RepeatPlus | Expr+ | The preceding expression is repeated 1 or more times. |
-| RepeatPlus (non-greedy) | Expr+? | The preceding expression is repeated 1 or more times without being greedy. |
+| RepeatPlusNonGreedy | Expr+? | The preceding expression is repeated 1 or more times without being greedy. |
 | RepeatRange | Expr{2..4} | The preceding expression is repeated between the range of times specified. Either bound can be excluded, which works just like Rust ranges. |
 
 ## Automatic linking
diff --git a/src/notation.md b/src/notation.md
index cda298a734..611d230aa3 100644
--- a/src/notation.md
+++ b/src/notation.md
@@ -19,6 +19,7 @@ The following notations are used by the *Lexer* and *Syntax* grammar snippets:
 | x<sup>a..b</sup>  | HEX_DIGIT<sup>1..6</sup>      | a to b repetitions of x                   |
 | Rule1 Rule2       | `fn` _Name_ _Parameters_      | Sequence of rules in order                |
 | \|                | `u8` \| `u16`, Block \| Item  | Either one or another                     |
+| !                 | !COMMENT                      | Matches if the expression does not follow, without consuming any input |
 | \[ ]               | \[`b` `B`]                     | Any of the characters listed              |
 | \[ - ]             | \[`a`-`z`]                     | Any of the characters in the range        |
 | ~\[ ]              | ~\[`b` `B`]                    | Any characters, except those listed       |
diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs
index 70e1a8f9a8..e5e3f57bfe 100644
--- a/tools/grammar/src/lib.rs
+++ b/tools/grammar/src/lib.rs
@@ -50,6 +50,8 @@ pub enum ExpressionKind {
     Sequence(Vec<Expression>),
     /// `A?`
     Optional(Box<Expression>),
+    /// `!A`
+    NegativeLookahead(Box<Expression>),
     /// `A*`
     Repeat(Box<Expression>),
     /// `A*?`
@@ -113,6 +115,7 @@ impl Expression {
         match &self.kind {
             ExpressionKind::Grouped(e)
             | ExpressionKind::Optional(e)
+            | ExpressionKind::NegativeLookahead(e)
             | ExpressionKind::Repeat(e)
             | ExpressionKind::RepeatNonGreedy(e)
             | ExpressionKind::RepeatPlus(e)
diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs
index d4240ae4d7..b25a3546da 100644
--- a/tools/grammar/src/parser.rs
+++ b/tools/grammar/src/parser.rs
@@ -251,6 +251,8 @@ impl Parser<'_> {
             self.parse_grouped()?
         } else if next == b'~' {
             self.parse_neg_expression()?
+        } else if next == b'!' {
+            self.parse_negative_lookahead()?
         } else {
             return Ok(None);
         };
@@ -387,6 +389,15 @@ impl Parser<'_> {
         Ok(ExpressionKind::NegExpression(box_kind(kind)))
     }
 
+    fn parse_negative_lookahead(&mut self) -> Result<ExpressionKind> {
+        self.expect("!", "expected !")?;
+        self.space0();
+        let Some(e) = self.parse_expr1()? else {
+            bail!(self, "expected expression after !");
+        };
+        Ok(ExpressionKind::NegativeLookahead(Box::new(e)))
+    }
+
     /// Parse e.g. `F00F` after `U+`.
     fn parse_unicode(&mut self) -> Result<ExpressionKind> {
         let mut xs = Vec::with_capacity(4);
diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs
index a5540b4169..5869c964ad 100644
--- a/tools/mdbook-spec/src/grammar/render_markdown.rs
+++ b/tools/mdbook-spec/src/grammar/render_markdown.rs
@@ -67,6 +67,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind {
         ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => last_expr(es.last().unwrap()),
         ExpressionKind::Grouped(_)
         | ExpressionKind::Optional(_)
+        | ExpressionKind::NegativeLookahead(_)
         | ExpressionKind::Repeat(_)
         | ExpressionKind::RepeatNonGreedy(_)
         | ExpressionKind::RepeatPlus(_)
@@ -119,6 +120,10 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) {
             render_expression(e, cx, output);
             output.push_str("<sup>?</sup>");
         }
+        ExpressionKind::NegativeLookahead(e) => {
+            output.push('!');
+            render_expression(e, cx, output);
+        }
         ExpressionKind::Repeat(e) => {
             render_expression(e, cx, output);
             output.push_str("<sup>\\*</sup>");
diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs
index 6efb065a34..5d20d77c48 100644
--- a/tools/mdbook-spec/src/grammar/render_railroad.rs
+++ b/tools/mdbook-spec/src/grammar/render_railroad.rs
@@ -139,6 +139,12 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option<B
                         make_seq(&es)?
                     }
                 }
+                ExpressionKind::NegativeLookahead(e) => {
+                    let forward = render_expression(e, cx, stack)?;
+                    let lbox =
+                        LabeledBox::new(forward, Comment::new("not followed by".to_string()));
+                    Box::new(lbox)
+                }
                 // Treat `e?` and `e{..1}` / `e{0..1}` equally.
                 ExpressionKind::Optional(e)
                 | ExpressionKind::RepeatRange(e, None | Some(0), Some(1)) => {

From 8c7058df25689e6576200128701412dc8c44d207 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 08:22:17 -0800
Subject: [PATCH 2/8] Add Unicode to character range

This adds the ability to specify Unicode code points in a character
range. This will be useful for defining some productions without using
English, and perhaps to be a little clearer.

This also extends the Unicode grammar to allow 4 to 6 hexadecimal
digits so that larger code points can be written.
---
 dev-guide/src/grammar.md                      |  8 ++-
 src/input-format.md                           |  4 +-
 tools/grammar/src/lib.rs                      | 32 ++++++++-
 tools/grammar/src/parser.rs                   | 66 ++++++++++++-------
 .../src/grammar/render_markdown.rs            | 24 ++++---
 .../src/grammar/render_railroad.rs            | 16 ++++-
 6 files changed, 111 insertions(+), 39 deletions(-)

diff --git a/dev-guide/src/grammar.md b/dev-guide/src/grammar.md
index 59c3e1e6ab..3a155b88e8 100644
--- a/dev-guide/src/grammar.md
+++ b/dev-guide/src/grammar.md
@@ -82,7 +82,7 @@ Expr1 ->
     | Group
     | NegativeExpression
 
-Unicode -> `U+` [`A`-`Z` `0`-`9`]4..4
+Unicode -> `U+` [`A`-`F` `0`-`9`]{4..6}
 
 NonTerminal -> Name
 
@@ -99,7 +99,11 @@ Characters ->
     | CharacterTerminal
     | CharacterName
 
-CharacterRange -> BACKTICK <any char> BACKTICK `-` BACKTICK <any char> BACKTICK
+CharacterRange -> Character `-` Character
+
+Character ->
+        BACKTICK <any char> BACKTICK
+      | Unicode
 
 CharacterTerminal -> Terminal
 
diff --git a/src/input-format.md b/src/input-format.md
index 2432da0339..be6bb670b3 100644
--- a/src/input-format.md
+++ b/src/input-format.md
@@ -3,7 +3,9 @@ r[input]
 
 r[input.syntax]
 ```grammar,lexer
-@root CHAR -> <a Unicode scalar value>
+@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value
+
+@root ASCII -> [U+0000-U+007F]
 
 NUL -> U+0000
 ```
diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs
index e5e3f57bfe..b558f31aae 100644
--- a/tools/grammar/src/lib.rs
+++ b/tools/grammar/src/lib.rs
@@ -3,6 +3,7 @@
 use diagnostics::{Diagnostics, warn_or_err};
 use regex::Regex;
 use std::collections::{HashMap, HashSet};
+use std::fmt::{Display, Formatter};
 use std::path::{Path, PathBuf};
 use std::sync::LazyLock;
 use walkdir::WalkDir;
@@ -81,7 +82,7 @@ pub enum ExpressionKind {
     /// `^ A B C`
     Cut(Box<Expression>),
     /// `U+0060`
-    Unicode(String),
+    Unicode((char, String)),
 }
 
 #[derive(Clone, Debug)]
@@ -91,7 +92,34 @@ pub enum Characters {
     /// `` `_` ``
     Terminal(String),
     /// `` `A`-`Z` ``
-    Range(char, char),
+    Range(Character, Character),
+}
+
+#[derive(Clone, Debug)]
+pub enum Character {
+    Char(char),
+    /// `U+0060`
+    ///
+    /// The `String` is the hex digits after `U+`.
+    Unicode((char, String)),
+}
+
+impl Character {
+    pub fn get_ch(&self) -> char {
+        match self {
+            Character::Char(ch) => *ch,
+            Character::Unicode((ch, _)) => *ch,
+        }
+    }
+}
+
+impl Display for Character {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Character::Char(ch) => write!(f, "`{ch}`"),
+            Character::Unicode((_, s)) => write!(f, "U+{s}"),
+        }
+    }
 }
 
 impl Grammar {
diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs
index b25a3546da..f1f8588770 100644
--- a/tools/grammar/src/parser.rs
+++ b/tools/grammar/src/parser.rs
@@ -1,6 +1,6 @@
 //! A parser of the ENBF-like grammar.
 
-use super::{Characters, Expression, ExpressionKind, Grammar, Production};
+use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production};
 use std::fmt;
 use std::fmt::Display;
 use std::path::Path;
@@ -221,7 +221,7 @@ impl Parser<'_> {
         };
 
         let kind = if self.take_str("U+") {
-            self.parse_unicode()?
+            ExpressionKind::Unicode(self.parse_unicode()?)
         } else if self.input[self.index..]
             .chars()
             .next()
@@ -322,27 +322,19 @@ impl Parser<'_> {
     /// Parse an element of a character class, e.g.
     /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
     fn parse_characters(&mut self) -> Result<Option<Characters>> {
-        if let Some(b'`') = self.peek() {
-            let recov = self.index;
-            let a = self.parse_terminal_str()?;
+        if let Some(a) = self.parse_character()? {
             if self.take_str("-") {
-                //~^ Parse `` `a`-`b` `` character range.
-                if a.len() > 1 {
-                    self.index = recov + 1;
-                    bail!(self, "invalid start terminal in range");
-                }
-                let recov = self.index;
-                let b = self.parse_terminal_str()?;
-                if b.len() > 1 {
-                    self.index = recov + 1;
-                    bail!(self, "invalid end terminal in range");
-                }
-                let a = a.chars().next().unwrap();
-                let b = b.chars().next().unwrap();
+                let Some(b) = self.parse_character()? else {
+                    bail!(self, "expected character in range");
+                };
                 Ok(Some(Characters::Range(a, b)))
             } else {
                 //~^ Parse terminal in backticks.
-                Ok(Some(Characters::Terminal(a)))
+                let t = match a {
+                    Character::Char(ch) => ch.to_string(),
+                    Character::Unicode(_) => bail!(self, "unicode not supported"),
+                };
+                Ok(Some(Characters::Terminal(t)))
             }
         } else if let Some(name) = self.parse_name() {
             //~^ Parse nonterminal identifier.
@@ -352,6 +344,23 @@ impl Parser<'_> {
         }
     }
 
+    fn parse_character(&mut self) -> Result<Option<Character>> {
+        if let Some(b'`') = self.peek() {
+            let recov = self.index;
+            let term = self.parse_terminal_str()?;
+            if term.len() > 1 {
+                self.index = recov + 1;
+                bail!(self, "invalid start terminal in range");
+            }
+            let ch = term.chars().next().unwrap();
+            Ok(Some(Character::Char(ch)))
+        } else if self.take_str("U+") {
+            Ok(Some(Character::Unicode(self.parse_unicode()?)))
+        } else {
+            Ok(None)
+        }
+    }
+
     /// Parse e.g. `<prose text>`.
     fn parse_prose(&mut self) -> Result<ExpressionKind> {
         self.expect("<", "expected opening `<`")?;
@@ -399,9 +408,9 @@ impl Parser<'_> {
     }
 
     /// Parse e.g. `F00F` after `U+`.
-    fn parse_unicode(&mut self) -> Result<ExpressionKind> {
-        let mut xs = Vec::with_capacity(4);
-        for _ in 0..4 {
+    fn parse_unicode(&mut self) -> Result<(char, String)> {
+        let mut xs = Vec::with_capacity(6);
+        let mut push_next = || {
             match self.peek() {
                 Some(x @ (b'0'..=b'9' | b'A'..=b'F')) => {
                     xs.push(x);
@@ -409,8 +418,19 @@ impl Parser<'_> {
                 }
                 _ => bail!(self, "expected 4 uppercase hexadecimal digits after `U+`"),
             }
+            Ok(())
+        };
+        for _ in 0..4 {
+            push_next()?;
+        }
+        for _ in 0..2 {
+            if push_next().is_err() {
+                break;
+            }
         }
-        Ok(ExpressionKind::Unicode(String::from_utf8(xs).unwrap()))
+        let s = String::from_utf8(xs).unwrap();
+        let ch = char::from_u32(u32::from_str_radix(&s, 16).unwrap()).unwrap();
+        Ok((ch, s))
     }
 
     /// Parse `?` after expression.
diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs
index 5869c964ad..1a311aa151 100644
--- a/tools/mdbook-spec/src/grammar/render_markdown.rs
+++ b/tools/mdbook-spec/src/grammar/render_markdown.rs
@@ -3,7 +3,7 @@
 use super::RenderCtx;
 use crate::grammar::Grammar;
 use anyhow::bail;
-use grammar::{Characters, Expression, ExpressionKind, Production};
+use grammar::{Character, Characters, Expression, ExpressionKind, Production};
 use regex::Regex;
 use std::borrow::Cow;
 use std::fmt::Write;
@@ -181,7 +181,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) {
             output.push_str("^ ");
             render_expression(e, cx, output);
         }
-        ExpressionKind::Unicode(s) => {
+        ExpressionKind::Unicode((_, s)) => {
             output.push_str("U+");
             output.push_str(s);
         }
@@ -222,12 +222,20 @@ fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) {
             markdown_escape(s)
         )
         .unwrap(),
-        Characters::Range(a, b) => write!(
-            output,
-            "<span class=\"grammar-literal\">{a}\
-                 </span>-<span class=\"grammar-literal\">{b}</span>"
-        )
-        .unwrap(),
+        Characters::Range(a, b) => {
+            let write_ch = |ch: &Character, output: &mut String| match ch {
+                Character::Char(ch) => write!(
+                    output,
+                    "<span class=\"grammar-literal\">{}</span>",
+                    markdown_escape(&ch.to_string())
+                )
+                .unwrap(),
+                Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(),
+            };
+            write_ch(a, output);
+            output.push('-');
+            write_ch(b, output);
+        }
     }
 }
 
diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs
index 5d20d77c48..94b2c1d261 100644
--- a/tools/mdbook-spec/src/grammar/render_railroad.rs
+++ b/tools/mdbook-spec/src/grammar/render_railroad.rs
@@ -3,7 +3,7 @@
 use super::RenderCtx;
 use crate::grammar::Grammar;
 use anyhow::bail;
-use grammar::{Characters, Expression, ExpressionKind, Production};
+use grammar::{Character, Characters, Expression, ExpressionKind, Production};
 use railroad::*;
 use regex::Regex;
 use std::fmt::Write;
@@ -225,7 +225,7 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option<B
                     let lbox = LabeledBox::new(rhs, Comment::new("no backtracking".to_string()));
                     Box::new(lbox)
                 }
-                ExpressionKind::Unicode(s) => Box::new(Terminal::new(format!("U+{}", s))),
+                ExpressionKind::Unicode((_, s)) => Box::new(Terminal::new(format!("U+{}", s))),
             };
         }
     };
@@ -244,7 +244,17 @@ fn render_characters(chars: &Characters, cx: &RenderCtx) -> Box<dyn Node> {
     match chars {
         Characters::Named(s) => node_for_nt(cx, s),
         Characters::Terminal(s) => Box::new(Terminal::new(s.clone())),
-        Characters::Range(a, b) => Box::new(Terminal::new(format!("{a}-{b}"))),
+        Characters::Range(a, b) => {
+            let mut s = String::new();
+            let write_ch = |ch: &Character, output: &mut String| match ch {
+                Character::Char(ch) => output.push(*ch),
+                Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(),
+            };
+            write_ch(a, &mut s);
+            s.push('-');
+            write_ch(b, &mut s);
+            Box::new(Terminal::new(s))
+        }
     }
 }
 

From 42f14b0ed670fae2da11eee35dc112c29c1c5d1d Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 09:26:19 -0800
Subject: [PATCH 3/8] Use negative lookahead in the grammar

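This replaces some suffixes and prose with the new negative lookahead
syntax. This should all have the same meaning, with these exceptions:
RAW_LIFETIME gains a cut operator after `'r#`, RESERVED_RAW_LIFETIME
now also requires that XID_Continue does not follow, and ASCII is no
longer marked `@root`.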
---
 src/identifiers.md  |  2 +-
 src/input-format.md |  2 +-
 src/tokens.md       | 30 +++++++++++++-----------------
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/src/identifiers.md b/src/identifiers.md
index 979284a1c7..5abe303ca2 100644
--- a/src/identifiers.md
+++ b/src/identifiers.md
@@ -16,7 +16,7 @@ NON_KEYWORD_IDENTIFIER -> IDENTIFIER_OR_KEYWORD _except a [strict][lex.keywords.
 IDENTIFIER -> NON_KEYWORD_IDENTIFIER | RAW_IDENTIFIER
 
 RESERVED_RAW_IDENTIFIER ->
-    `r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by XID_Continue_
+    `r#` (`_` | `crate` | `self` | `Self` | `super`) !XID_Continue
 ```
 
 <!-- When updating the version, update the UAX links, too. -->
diff --git a/src/input-format.md b/src/input-format.md
index be6bb670b3..d6eca2dc2d 100644
--- a/src/input-format.md
+++ b/src/input-format.md
@@ -5,7 +5,7 @@ r[input.syntax]
 ```grammar,lexer
 @root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value
 
-@root ASCII -> [U+0000-U+007F]
+ASCII -> [U+0000-U+007F]
 
 NUL -> U+0000
 ```
diff --git a/src/tokens.md b/src/tokens.md
index b6a0124320..fb5e06da26 100644
--- a/src/tokens.md
+++ b/src/tokens.md
@@ -115,7 +115,7 @@ r[lex.token.literal.suffix.syntax]
 ```grammar,lexer
 SUFFIX -> IDENTIFIER_OR_KEYWORD _except `_`_
 
-SUFFIX_NO_E -> SUFFIX _not beginning with `e` or `E`_
+SUFFIX_NO_E -> ![`e` `E`] SUFFIX
 ```
 
 r[lex.token.literal.suffix.validity]
@@ -253,8 +253,7 @@ r[lex.token.byte.syntax]
 BYTE_LITERAL ->
     `b'` ^ ( ASCII_FOR_CHAR | BYTE_ESCAPE )  `'` SUFFIX?
 
-ASCII_FOR_CHAR ->
-    <any ASCII (i.e. 0x00 to 0x7F) except `'`, `\`, LF, CR, or TAB>
+ASCII_FOR_CHAR -> ![`'` `\` LF CR TAB] ASCII
 
 BYTE_ESCAPE ->
       `\x` HEX_DIGIT HEX_DIGIT
@@ -272,8 +271,7 @@ r[lex.token.str-byte.syntax]
 BYTE_STRING_LITERAL ->
     `b"` ^ ( ASCII_FOR_STRING | BYTE_ESCAPE | STRING_CONTINUE )* `"` SUFFIX?
 
-ASCII_FOR_STRING ->
-    <any ASCII (i.e 0x00 to 0x7F) except `"`, `\`, or CR>
+ASCII_FOR_STRING -> ![`"` `\` CR] ASCII
 ```
 
 r[lex.token.str-byte.intro]
@@ -309,8 +307,7 @@ RAW_BYTE_STRING_CONTENT ->
       `"` ^ ASCII_FOR_RAW*? `"`
     | `#` RAW_BYTE_STRING_CONTENT `#`
 
-ASCII_FOR_RAW ->
-    <any ASCII (i.e. 0x00 to 0x7F) except CR>
+ASCII_FOR_RAW -> !CR ASCII
 ```
 
 r[lex.token.str-byte-raw.intro]
@@ -559,7 +556,7 @@ r[lex.token.literal.float.syntax]
 FLOAT_LITERAL ->
       DEC_LITERAL (`.` DEC_LITERAL)? FLOAT_EXPONENT SUFFIX?
     | DEC_LITERAL `.` DEC_LITERAL SUFFIX_NO_E?
-    | DEC_LITERAL `.` _not immediately followed by `.`, `_` or an XID_Start character_
+    | DEC_LITERAL `.` !(`.` | `_` | XID_Start)
 
 FLOAT_EXPONENT ->
     (`e`|`E`) (`+`|`-`)? `_`* DEC_DIGIT (DEC_DIGIT|`_`)*
@@ -608,13 +605,12 @@ r[lex.token.literal.reserved.syntax]
 RESERVED_NUMBER ->
       BIN_LITERAL [`2`-`9`]
     | OCT_LITERAL [`8`-`9`]
-    | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` _not immediately followed by `.`, `_` or an XID_Start character_
+    | ( BIN_LITERAL | OCT_LITERAL | HEX_LITERAL ) `.` !(`.` | `_` | XID_Start)
     | ( BIN_LITERAL | OCT_LITERAL ) (`e`|`E`)
-    | `0b` `_`* <end of input or not BIN_DIGIT>
-    | `0o` `_`* <end of input or not OCT_DIGIT>
-    | `0x` `_`* <end of input or not HEX_DIGIT>
+    | `0b` `_`* !BIN_DIGIT
+    | `0o` `_`* !OCT_DIGIT
+    | `0x` `_`* !HEX_DIGIT
     | DEC_LITERAL ( `.` DEC_LITERAL )? (`e` | `E`) (`+` | `-`)? <end of input or not DEC_DIGIT>
-
 ```
 
 r[lex.token.literal.reserved.intro]
@@ -657,16 +653,16 @@ r[lex.token.life.syntax]
 ```grammar,lexer
 LIFETIME_TOKEN ->
       RAW_LIFETIME
-    | `'` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_
+    | `'` IDENTIFIER_OR_KEYWORD !`'`
 
 LIFETIME_OR_LABEL ->
       RAW_LIFETIME
-    | `'` NON_KEYWORD_IDENTIFIER _not immediately followed by `'`_
+    | `'` NON_KEYWORD_IDENTIFIER !`'`
 
 RAW_LIFETIME ->
-    `'r#` IDENTIFIER_OR_KEYWORD _not immediately followed by `'`_
+    `'r#` ^ IDENTIFIER_OR_KEYWORD !`'`
 
-RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) _not immediately followed by `'`_
+RESERVED_RAW_LIFETIME -> `'r#` (`_` | `crate` | `self` | `Self` | `super`) !(`'` | XID_Continue)
 ```
 
 r[lex.token.life.intro]

From 259df9111c1a2474e44d6c336589d3393b042290 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 09:30:07 -0800
Subject: [PATCH 4/8] Fix LINE_COMMENT grammar

This clarifies that bare `//` is explicitly meant to be followed by
either LF or EOF. Otherwise it incorrectly matches other comment rules.
---
 src/comments.md     | 3 ++-
 src/input-format.md | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/comments.md b/src/comments.md
index a240e7dc58..1320077938 100644
--- a/src/comments.md
+++ b/src/comments.md
@@ -5,7 +5,8 @@ r[comments.syntax]
 ```grammar,lexer
 @root LINE_COMMENT ->
       `//` (~[`/` `!` LF] | `//`) ~LF*
-    | `//`
+    | `//` EOF
+    | `//` _immediately followed by LF_
 
 BLOCK_COMMENT ->
       `/*`
diff --git a/src/input-format.md b/src/input-format.md
index d6eca2dc2d..3e35cba1ee 100644
--- a/src/input-format.md
+++ b/src/input-format.md
@@ -3,11 +3,13 @@ r[input]
 
 r[input.syntax]
 ```grammar,lexer
-@root CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value
+CHAR -> [U+0000-U+D7FF U+E000-U+10FFFF] // a Unicode scalar value
 
 ASCII -> [U+0000-U+007F]
 
 NUL -> U+0000
+
+EOF -> !CHAR  // End of file or input
 ```
 
 r[input.intro]

From feaa3886c9f09f2880052a65d965ee28cf075694 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 09:35:57 -0800
Subject: [PATCH 5/8] Fix BLOCK_COMMENT order

This fixes the BLOCK_COMMENT grammar so that it follows the rule that
the first alternative that matches wins. With the cut operator added
after `/*`, the previous ordering would fail to parse the `/**/` and
`/***/` forms.
---
 src/comments.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/comments.md b/src/comments.md
index 1320077938..e82cd28ace 100644
--- a/src/comments.md
+++ b/src/comments.md
@@ -9,12 +9,13 @@ r[comments.syntax]
     | `//` _immediately followed by LF_
 
 BLOCK_COMMENT ->
-      `/*`
+      `/**/`
+    | `/***/`
+    | `/*`
+        ^
         ( ~[`*` `!`] | `**` | BLOCK_COMMENT_OR_DOC )
         ( BLOCK_COMMENT_OR_DOC | ~`*/` )*
       `*/`
-    | `/**/`
-    | `/***/`
 
 @root INNER_LINE_DOC ->
     `//!` ~[LF CR]*

From f2ca8392652141e7527a714191ae0873d2f965ca Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 09:54:48 -0800
Subject: [PATCH 6/8] Fix handling of carriage returns in doc comments

This fixes the doc comments so that they properly handle a carriage
return by using the cut operator. Rustc will fail parsing if a doc
comment contains a carriage return.

This requires including (LF|EOF) at the end of line so the cut operator
has something to complete the line.

This also removes the negative `/` from OUTER_LINE_DOC. This does not
work correctly with the check for CR, and is not needed because
LINE_COMMENT already matches `////`. Later I plan to include a rule for
comments that makes clear the order in which they are parsed.

A negative lookahead is necessary in OUTER_BLOCK_DOC to prevent it from
trying to parse what should be a BLOCK_COMMENT as an OUTER_BLOCK_DOC and
failing due to the cut operator.
---
 src/comments.md | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/comments.md b/src/comments.md
index e82cd28ace..6e4c06744f 100644
--- a/src/comments.md
+++ b/src/comments.md
@@ -18,20 +18,25 @@ BLOCK_COMMENT ->
       `*/`
 
 @root INNER_LINE_DOC ->
-    `//!` ~[LF CR]*
+    `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF)
+
+LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)*
 
 INNER_BLOCK_DOC ->
-    `/*!` ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )* `*/`
+    `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/`
 
 @root OUTER_LINE_DOC ->
-    `///` (~`/` ~[LF CR]*)?
+    `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF)
 
 OUTER_BLOCK_DOC ->
-    `/**`
+    `/**` ![`*` `/`]
+      ^
       ( ~`*` | BLOCK_COMMENT_OR_DOC )
-      ( BLOCK_COMMENT_OR_DOC | ~[`*/` CR] )*
+      ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )*
     `*/`
 
+BLOCK_CHAR -> (!(`*/` | CR) CHAR)
+
 @root BLOCK_COMMENT_OR_DOC ->
       BLOCK_COMMENT
     | OUTER_BLOCK_DOC

From 2d2850a0ec4345e35248d017ac6f8760009f09ce Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 10:27:55 -0800
Subject: [PATCH 7/8] Add a new COMMENT grammar rule

This is intended to indicate the order that the rules are expected to be
processed (as defined in this grammar). Of course real parsers can take
a different approach if they have the same results.

This is roughly similar to the order that rustc takes, though
[`block_comment`](https://github.com/rust-lang/rust/blob/d7daac06d87e1252d10eaa44960164faac46beff/compiler/rustc_lexer/src/lib.rs#L782-L817)
roughly takes the approach of combining the `/*` prefix, and then
deciding if it is an inner doc comment, outer doc comment, or else a
regular block comment.

LINE_COMMENT must be first so that it is not confused with a doc
comment.

BLOCK_COMMENT must be last so that its cut operator does not interfere
with doc comments that start with `/*`. It could be moved up higher in
the list if it had negative lookahead to disambiguate OUTER_BLOCK_DOC,
but the expression for that is more complicated than the one in
OUTER_BLOCK_DOC.
---
 src/comments.md | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/comments.md b/src/comments.md
index 6e4c06744f..bbb3332539 100644
--- a/src/comments.md
+++ b/src/comments.md
@@ -3,7 +3,15 @@ r[comments]
 
 r[comments.syntax]
 ```grammar,lexer
-@root LINE_COMMENT ->
+@root COMMENT ->
+      LINE_COMMENT
+    | INNER_LINE_DOC
+    | OUTER_LINE_DOC
+    | INNER_BLOCK_DOC
+    | OUTER_BLOCK_DOC
+    | BLOCK_COMMENT
+
+LINE_COMMENT ->
       `//` (~[`/` `!` LF] | `//`) ~LF*
     | `//` EOF
     | `//` _immediately followed by LF_
@@ -17,7 +25,7 @@ BLOCK_COMMENT ->
         ( BLOCK_COMMENT_OR_DOC | ~`*/` )*
       `*/`
 
-@root INNER_LINE_DOC ->
+INNER_LINE_DOC ->
     `//!` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF)
 
 LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)*
@@ -25,7 +33,7 @@ LINE_DOC_COMMENT_CONTENT -> (!CR ~LF)*
 INNER_BLOCK_DOC ->
     `/*!` ^ ( BLOCK_COMMENT_OR_DOC | BLOCK_CHAR )* `*/`
 
-@root OUTER_LINE_DOC ->
+OUTER_LINE_DOC ->
     `///` ^ LINE_DOC_COMMENT_CONTENT (LF | EOF)
 
 OUTER_BLOCK_DOC ->
@@ -37,7 +45,7 @@ OUTER_BLOCK_DOC ->
 
 BLOCK_CHAR -> (!(`*/` | CR) CHAR)
 
-@root BLOCK_COMMENT_OR_DOC ->
+BLOCK_COMMENT_OR_DOC ->
       BLOCK_COMMENT
     | OUTER_BLOCK_DOC
     | INNER_BLOCK_DOC

From 2bd22f87a09de22e7d15e7c98dbed7435f5056e9 Mon Sep 17 00:00:00 2001
From: Eric Huss <eric@huss.org>
Date: Fri, 13 Feb 2026 10:28:57 -0800
Subject: [PATCH 8/8] Fix desugaring of doc comments

rustc actually includes the spaces for doc comments.
---
 src/comments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/comments.md b/src/comments.md
index bbb3332539..ef283a9ea1 100644
--- a/src/comments.md
+++ b/src/comments.md
@@ -66,7 +66,7 @@ r[comments.doc.syntax]
 Line doc comments beginning with exactly _three_ slashes (`///`), and block doc comments (`/** ... */`), both outer doc comments, are interpreted as a special syntax for [`doc` attributes].
 
 r[comments.doc.attributes]
-That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, i.e., `/// Foo` turns into `#[doc="Foo"]` and `/** Bar */` turns into `#[doc="Bar"]`. They must therefore appear before something that accepts an outer attribute.
+That is, they are equivalent to writing `#[doc="..."]` around the body of the comment, e.g., `/// Foo` turns into `#[doc=" Foo"]` and `/** Bar */` turns into `#[doc=" Bar "]`. They must therefore appear before something that accepts an outer attribute.
 
 r[comments.doc.inner-syntax]
 Line comments beginning with `//!` and block comments `/*! ... */` are doc comments that apply to the parent of the comment, rather than the item that follows.