Skip to content
This repository was archived by the owner on Mar 26, 2026. It is now read-only.
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 129 additions & 42 deletions gapic/utils/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,105 @@
# limitations under the License.

import re
import warnings
from typing import Optional
from typing import Optional, List, Dict

import pypandoc # type: ignore

from gapic.utils.lines import wrap

# Module-level memo of converted text. Keys are the original (source-format)
# strings; values are their stripped RST conversion, stored before any
# wrapping or indentation is applied. The dict is never evicted, so it grows
# with the number of distinct strings converted.
_RAW_RST_CACHE: Dict[str, str] = {}


def _aggressive_fast_convert(text: str) -> Optional[str]:
"""
Converts common Markdown (Code, Links, Lists) to RST using pure Python.
Only gives up (returns None) for complex structures like Tables.
"""
# 1. TABLE CHECK (The only thing we strictly need Pandoc for)
# If we see a pipe surrounded by spaces, it's likely a table.
if re.search(r" \| ", text) or re.search(r"\|\n", text):
return None

# 2. CODE BLOCKS: `code` -> ``code``
# RST requires double backticks. Markdown uses one.
# We look for backticks that aren't already double.
# Regex: Negative lookbehind/lookahead to ensure we don't match ``already rst``.
converted = re.sub(r"(?<!`)`([^`]+)`(?!`)", r"``\1``", text)

# 3. LINKS: [Text](URL) -> `Text <URL>`__
# We use anonymous links (__) to avoid collision issues.
converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted)

# 4. BOLD: **text** -> **text** (Compatible, no change needed)

# 5. HEADINGS: # Heading -> Heading\n=======
# (Simple fix for H1/H2, mostly sufficient for docstrings)
converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE)
Comment on lines +56 to +57
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The regular expression substitution for headings uses a fixed-length underline ('=' * 10 and '-' * 10). In reStructuredText, the underline for a heading must be at least as long as the heading text itself. A fixed length may be too short for longer headings, resulting in invalid RST.

You can fix this by using a lambda function in re.sub to dynamically set the underline length based on the matched heading text's length.

Suggested change
converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^# (.*)$", lambda m: f"{m.group(1)}\n{'=' * len(m.group(1))}", converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", lambda m: f"{m.group(1)}\n{'-' * len(m.group(1))}", converted, flags=re.MULTILINE)


# 6. LISTS: Markdown lists (- item) work in RST mostly fine.
# We just ensure there's a newline before a list starts to satisfy RST strictness.
converted = re.sub(r"(\n[^-*].*)\n\s*[-*] ", r"\1\n\n- ", converted)

return converted


def batch_convert_docstrings(docstrings: List[str]) -> None:
    """Pre-populate ``_RAW_RST_CACHE`` for many docstrings at once.

    Each distinct, non-empty, not-yet-cached docstring that contains
    markup-looking characters is first run through
    ``_aggressive_fast_convert``. Those it declines (returns ``None`` for)
    are joined into a single payload and converted with one Pandoc call.

    This is best-effort: if Pandoc fails, or its output cannot be split back
    into one piece per input, a warning is issued and those docstrings are
    simply left out of the cache.

    Args:
        docstrings (List[str]): The raw docstrings to convert.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # Pandoc payload is assembled in a deterministic order.
    candidates = [
        doc
        for doc in dict.fromkeys(docstrings)
        if doc
        and doc not in _RAW_RST_CACHE
        and re.search(r"[|*`_[\]#]", doc)  # only strings with markup chars
    ]
    if not candidates:
        return

    # 1. Pure-Python conversion; collect whatever it declines.
    pandoc_batch: List[str] = []
    for doc in candidates:
        fast_result = _aggressive_fast_convert(doc)
        if fast_result is None:
            pandoc_batch.append(doc)
        else:
            _RAW_RST_CACHE[doc] = fast_result.strip()

    if not pandoc_batch:
        return

    # 2. One Pandoc call for the remainder.
    # The marker is deliberately plain alphanumerics: a marker wrapped in
    # double underscores is strong emphasis in CommonMark and would be
    # re-rendered by Pandoc, so it could not be found again in the output.
    split_marker = "GAPICBATCHSPLITMARKER"
    payload = f"\n\n{split_marker}\n\n".join(pandoc_batch)

    try:
        converted_payload = pypandoc.convert_text(
            payload,
            "rst",
            format="commonmark",
            extra_args=["--columns=1000"],
        )
    except Exception as exc:
        # Best-effort: report the failure instead of hiding it, but do not
        # abort the caller.
        warnings.warn(f"pypandoc batch conversion failed: {exc}")
        return

    results = converted_payload.split(split_marker)
    if len(results) != len(pandoc_batch):
        # The pieces cannot be matched back to their inputs; caching a
        # misaligned result would attach the wrong text to a docstring.
        warnings.warn(
            "pypandoc batch conversion returned unexpected number of results. "
            f"Expected {len(pandoc_batch)}, got {len(results)}."
        )
        return

    for original, converted in zip(pandoc_batch, results):
        _RAW_RST_CACHE[original] = converted.strip()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If the number of converted results from pypandoc does not match the number of items in the batch, the function returns silently without processing the items. This could indicate a problem with the separator or with pypandoc's output. This condition should be reported to help with debugging.

    if len(results) != len(pandoc_batch):
        import warnings
        warnings.warn(
            "pypandoc batch conversion returned unexpected number of results. "
            f"Expected {len(pandoc_batch)}, got {len(results)}."
        )
        return

    for original, converted in zip(pandoc_batch, results):
        _RAW_RST_CACHE[original] = converted.strip()



def rst(
text: str,
Expand All @@ -27,59 +120,53 @@ def rst(
nl: Optional[bool] = None,
source_format: str = "commonmark",
):
"""Convert the given text to ReStructured Text.

Args:
text (str): The text to convert.
width (int): The number of columns.
indent (int): The number of columns to indent each line of text
(except the first).
nl (bool): Whether to append a trailing newline.
Defaults to appending a newline if the result is more than
one line long.
source_format (str): The source format. This is ``commonmark`` by
default, which is what is used by convention in protocol buffers.

Returns:
str: The same text, in RST format.
"""
# Quick check: If the text block does not appear to have any formatting,
# do not convert it.
# (This makes code generation significantly faster; calling out to pandoc
# is by far the most expensive thing we do.)
if not re.search(r"[|*`_[\]]", text):
"""Convert the given text to ReStructured Text."""
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The detailed docstring for the rst function, which explained the purpose of the width, indent, nl, and source_format parameters, has been removed. While the function body has changed, the signature and parameters remain the same. Restoring the detailed docstring would improve code clarity and maintainability for future developers.

    """Convert the given text to ReStructured Text.

    Args:
        text (str): The text to convert.
        width (int): The number of columns.
        indent (int): The number of columns to indent each line of text
            (except the first).
        nl (bool): Whether to append a trailing newline.
            Defaults to appending a newline if the result is more than
            one line long.
        source_format (str): The source format. This is ``commonmark`` by
            default, which is what is used by convention in protocol buffers.

    Returns:
        str: The same text, in RST format.
    """


# 1. Super Fast Path: No special chars? Just wrap.
if not re.search(r"[|*`_[\]#]", text):
answer = wrap(
text,
indent=indent,
offset=indent + 3,
width=width - indent,
)
return _finalize(answer, nl, indent)

# 2. Check Cache
if text in _RAW_RST_CACHE:
raw_rst = _RAW_RST_CACHE[text]
else:
# Convert from CommonMark to ReStructured Text.
answer = (
pypandoc.convert_text(
# Slow Path: Missed by batch or new string.
# TRY PYTHON CONVERT FIRST.
# This prevents the 'Slow Path' from actually being slow.
fast_result = _aggressive_fast_convert(text)

if fast_result is not None:
raw_rst = fast_result.strip()
else:
# The absolute last resort: Shell out to Pandoc
raw_rst = pypandoc.convert_text(
text,
"rst",
format=source_format,
extra_args=["--columns=%d" % (width - indent)],
)
.strip()
.replace("\n", f"\n{' ' * indent}")
)
extra_args=["--columns=1000"]
).strip()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The call to pypandoc.convert_text is not wrapped in a try...except block. If pypandoc fails for any reason (e.g., it's not installed correctly, or it encounters an input it can't handle), it will raise an exception and crash the entire generation process. It would be more robust to handle this exception and fall back gracefully, for instance by returning the original unformatted text.

            # The absolute last resort: Shell out to Pandoc
            try:
                raw_rst = pypandoc.convert_text(
                    text,
                    "rst",
                    format=source_format,
                    extra_args=["--columns=1000"],
                ).strip()
            except Exception:
                # If pandoc fails, it's better to have unformatted text
                # than to crash.
                raw_rst = text


_RAW_RST_CACHE[text] = raw_rst

# 3. Python Formatting
if "::" in raw_rst or ".. code" in raw_rst:
answer = raw_rst.replace("\n", f"\n{' ' * indent}")
else:
answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent)

return _finalize(answer, nl, indent)

# Add a newline to the end of the document if any line breaks are
# already present.
#
# This causes the closing """ to be on the subsequent line only when
# appropriate.

def _finalize(answer, nl, indent):
"""Helper to handle trailing newlines and quotes."""
if nl or ("\n" in answer and nl is None):
answer += "\n" + " " * indent

# If the text ends in a double-quote, append a period.
# This ensures that we do not get a parse error when this output is
# followed by triple-quotes.
if answer.endswith('"'):
answer += "."

# Done; return the answer.
return answer
return answer
Loading