-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathcli_usage_lexer.py
More file actions
115 lines (101 loc) · 3.78 KB
/
cli_usage_lexer.py
File metadata and controls
115 lines (101 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Pygments lexer for CLI usage/help output.
This module provides a custom Pygments lexer for highlighting command-line
usage text typically generated by argparse, getopt, or similar libraries.
"""
from __future__ import annotations
from pygments.lexer import RegexLexer, bygroups, include
from pygments.token import Generic, Name, Operator, Punctuation, Text, Whitespace
class CLIUsageLexer(RegexLexer):
    """Lexer for CLI usage/help text (argparse, etc.).

    Highlights usage patterns including options, arguments, and meta-variables.

    Examples
    --------
    >>> from pygments.token import Token
    >>> lexer = CLIUsageLexer()
    >>> tokens = list(lexer.get_tokens("usage: cmd [-h]"))
    >>> tokens[0]
    (Token.Generic.Heading, 'usage:')
    >>> tokens[2]
    (Token.Name.Label, 'cmd')
    """

    # Standard Pygments lexer metadata: display name, selection aliases,
    # and MIME types.  No filename globs -- this lexer is chosen explicitly
    # by alias, never by file extension.
    name = "CLI Usage"
    aliases = ["cli-usage", "usage"]  # noqa: RUF012
    filenames: list[str] = []  # noqa: RUF012
    mimetypes = ["text/x-cli-usage"]  # noqa: RUF012

    # State machine: "root" handles line starts, "inline" holds the shared
    # per-token rules, and "optional" tracks (possibly nested) [...] groups.
    # NOTE: rule order matters -- Pygments tries patterns top to bottom, so
    # the option-with-value rules must precede the standalone option rules.
    tokens = {  # noqa: RUF012
        "root": [
            # "usage:" at start of line
            (r"^(usage:)(\s+)", bygroups(Generic.Heading, Whitespace)),  # type: ignore[no-untyped-call]
            # Continuation lines (leading whitespace for wrapped usage)
            (r"^(\s+)(?=\S)", Whitespace),
            include("inline"),
        ],
        "inline": [
            # Whitespace
            (r"\s+", Whitespace),
            # Long options with = value (e.g., --log-level=VALUE)
            (
                r"(--[a-zA-Z0-9][-a-zA-Z0-9]*)(=)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Tag, Operator, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Long options standalone
            (r"--[a-zA-Z0-9][-a-zA-Z0-9]*", Name.Tag),
            # Short options with space-separated value (e.g., -S socket-path)
            (
                r"(-[a-zA-Z0-9])(\s+)([A-Z][A-Z0-9_]*|[a-z][-a-z0-9]*)",
                bygroups(Name.Attribute, Whitespace, Name.Variable),  # type: ignore[no-untyped-call]
            ),
            # Short options standalone
            (r"-[a-zA-Z0-9]", Name.Attribute),
            # UPPERCASE meta-variables (COMMAND, FILE, PATH)
            # NOTE: requires >= 2 characters, so a single-letter metavar
            # like "N" falls through to the catch-all rules below.
            (r"\b[A-Z][A-Z0-9_]+\b", Name.Constant),
            # Opening bracket - enter optional state (so nesting depth is
            # tracked via Pygments' state stack rather than by counting here)
            (r"\[", Punctuation, "optional"),
            # Closing bracket (fallback for unmatched)
            (r"\]", Punctuation),
            # Choice separator (pipe)
            (r"\|", Operator),
            # Parentheses for grouping
            (r"[()]", Punctuation),
            # Positional/command names (lowercase with dashes)
            (r"\b[a-z][-a-z0-9]*\b", Name.Label),
            # Catch-all for any other text
            (r"[^\s\[\]|()]+", Text),
        ],
        "optional": [
            # Nested optional bracket: push another "optional" frame
            (r"\[", Punctuation, "#push"),
            # End optional: pop back to the enclosing state
            (r"\]", Punctuation, "#pop"),
            # Contents use inline rules
            include("inline"),
        ],
    }
def tokenize_usage(text: str) -> list[tuple[str, str]]:
    """Tokenize CLI usage text into ``(token_type, value)`` string pairs.

    Parameters
    ----------
    text : str
        CLI usage text to tokenize.

    Returns
    -------
    list[tuple[str, str]]
        List of (token_type_name, text_value) tuples.

    Examples
    --------
    >>> result = tokenize_usage("usage: cmd [-h]")
    >>> result[0]
    ('Token.Generic.Heading', 'usage:')
    >>> result[2]
    ('Token.Name.Label', 'cmd')
    >>> result[4]
    ('Token.Punctuation', '[')
    >>> result[5]
    ('Token.Name.Attribute', '-h')
    >>> result[6]
    ('Token.Punctuation', ']')
    """
    usage_lexer = CLIUsageLexer()
    pairs: list[tuple[str, str]] = []
    # str() renders a Pygments token type in its dotted form,
    # e.g. Token.Name.Label -> 'Token.Name.Label'.
    for token_type, token_value in usage_lexer.get_tokens(text):
        pairs.append((str(token_type), token_value))
    return pairs