-
Notifications
You must be signed in to change notification settings - Fork 78
optimize rst #2505
optimize rst #2505
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,12 +13,105 @@ | |||||||||||||
| # limitations under the License. | ||||||||||||||
|
|
||||||||||||||
| import re | ||||||||||||||
| from typing import Optional | ||||||||||||||
| from typing import Optional, List, Dict | ||||||||||||||
|
|
||||||||||||||
| import pypandoc # type: ignore | ||||||||||||||
|
|
||||||||||||||
| from gapic.utils.lines import wrap | ||||||||||||||
|
|
||||||||||||||
| # --- PERFORMANCE CACHE --- | ||||||||||||||
| _RAW_RST_CACHE: Dict[str, str] = {} | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| def _aggressive_fast_convert(text: str) -> Optional[str]: | ||||||||||||||
| """ | ||||||||||||||
| Converts common Markdown (Code, Links, Lists) to RST using pure Python. | ||||||||||||||
| Only gives up (returns None) for complex structures like Tables. | ||||||||||||||
| """ | ||||||||||||||
| # 1. TABLE CHECK (The only thing we strictly need Pandoc for) | ||||||||||||||
| # If we see a pipe surrounded by spaces, it's likely a table. | ||||||||||||||
| if re.search(r" \| ", text) or re.search(r"\|\n", text): | ||||||||||||||
| return None | ||||||||||||||
|
|
||||||||||||||
| # 2. CODE BLOCKS: `code` -> ``code`` | ||||||||||||||
| # RST requires double backticks. Markdown uses one. | ||||||||||||||
| # We look for backticks that aren't already double. | ||||||||||||||
| # Regex: Negative lookbehind/lookahead to ensure we don't match ``already rst``. | ||||||||||||||
| converted = re.sub(r"(?<!`)`([^`]+)`(?!`)", r"``\1``", text) | ||||||||||||||
|
|
||||||||||||||
| # 3. LINKS: [Text](URL) -> `Text <URL>`__ | ||||||||||||||
| # We use anonymous links (__) to avoid collision issues. | ||||||||||||||
| converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted) | ||||||||||||||
|
|
||||||||||||||
| # 4. BOLD: **text** -> **text** (Compatible, no change needed) | ||||||||||||||
|
|
||||||||||||||
| # 5. HEADINGS: # Heading -> Heading\n======= | ||||||||||||||
| # (Simple fix for H1/H2, mostly sufficient for docstrings) | ||||||||||||||
| converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE) | ||||||||||||||
| converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE) | ||||||||||||||
|
|
||||||||||||||
| # 6. LISTS: Markdown lists (- item) work in RST mostly fine. | ||||||||||||||
| # We just ensure there's a newline before a list starts to satisfy RST strictness. | ||||||||||||||
| converted = re.sub(r"(\n[^-*].*)\n\s*[-*] ", r"\1\n\n- ", converted) | ||||||||||||||
|
|
||||||||||||||
| return converted | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
def batch_convert_docstrings(docstrings: List[str]) -> None:
    """Pre-convert a batch of docstrings, priming ``_RAW_RST_CACHE``.

    Strategy:
        1. Try the pure-Python converter first (fast; no subprocess).
        2. Join the remaining complex items (tables, etc.) into a single
           Pandoc invocation and split the result back apart, so we pay
           for at most one Pandoc process instead of one per docstring.

    Failures are non-fatal: any docstring left out of the cache simply
    falls back to the per-string slow path in ``rst()``.

    Args:
        docstrings (List[str]): Raw CommonMark docstrings to pre-convert.
    """
    import warnings

    # De-duplicate: identical docstrings need only one conversion.
    unique_docs = set(docstrings)

    # Keep only strings that contain formatting characters and are not
    # already cached.
    candidates = [
        d for d in unique_docs
        if d
        and d not in _RAW_RST_CACHE
        and re.search(r"[|*`_[\]#]", d)  # Only interesting chars
    ]
    if not candidates:
        return

    pandoc_batch: List[str] = []

    # 1. Pure-Python conversion wherever possible.
    for doc in candidates:
        fast_result = _aggressive_fast_convert(doc)
        if fast_result is not None:
            _RAW_RST_CACHE[doc] = fast_result.strip()
        else:
            # Tables and other complex structures must go to Pandoc.
            pandoc_batch.append(doc)

    # 2. Process the remainder with a single Pandoc call, using a
    #    sentinel separator to split the output back into pieces.
    if not pandoc_batch:
        return

    separator = "\n\n__GAPIC_BATCH_SPLIT__\n\n"
    giant_payload = separator.join(pandoc_batch)

    try:
        converted_payload = pypandoc.convert_text(
            giant_payload,
            "rst",
            format="commonmark",
            extra_args=["--columns=1000"],
        )
    except Exception as exc:
        # Best-effort optimization: do not fail generation, but do not
        # swallow the failure silently either — surface it so operators
        # can diagnose a broken pandoc installation.
        warnings.warn(f"pypandoc batch conversion failed: {exc}")
        return

    split_marker = "__GAPIC_BATCH_SPLIT__"
    results = converted_payload.split(split_marker)

    if len(results) != len(pandoc_batch):
        # A docstring containing the sentinel (or Pandoc rewriting it)
        # would desynchronize the original->converted mapping; warn and
        # let the slow path handle these strings individually.
        warnings.warn(
            "pypandoc batch conversion returned an unexpected number of "
            f"results. Expected {len(pandoc_batch)}, got {len(results)}."
        )
        return

    for original, converted in zip(pandoc_batch, results):
        _RAW_RST_CACHE[original] = converted.strip()
||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
# NOTE(review): the `width`/`indent` defaults below are hidden by the diff
# hunk in this view; 72 and 0 match the upstream signature — confirm.
def rst(
    text: str,
    width: int = 72,
    indent: int = 0,
    nl: Optional[bool] = None,
    source_format: str = "commonmark",
):
    """Convert the given text to ReStructured Text.

    Args:
        text (str): The text to convert.
        width (int): The number of columns.
        indent (int): The number of columns to indent each line of text
            (except the first).
        nl (bool): Whether to append a trailing newline.
            Defaults to appending a newline if the result is more than
            one line long.
        source_format (str): The source format. This is ``commonmark`` by
            default, which is what is used by convention in protocol buffers.

    Returns:
        str: The same text, in RST format.
    """
    # 1. Super fast path: no special characters at all? Just wrap.
    #    (Calling out to pandoc is by far the most expensive thing we do.)
    if not re.search(r"[|*`_[\]#]", text):
        answer = wrap(
            text,
            indent=indent,
            offset=indent + 3,
            width=width - indent,
        )
        return _finalize(answer, nl, indent)

    # 2. Cache check (typically primed by ``batch_convert_docstrings``).
    if text in _RAW_RST_CACHE:
        raw_rst = _RAW_RST_CACHE[text]
    else:
        # Cache miss: try the pure-Python converter before shelling out,
        # so the "slow path" is rarely actually slow.
        fast_result = _aggressive_fast_convert(text)

        if fast_result is not None:
            raw_rst = fast_result.strip()
        else:
            # The absolute last resort: shell out to Pandoc. If Pandoc
            # fails, it is better to emit unformatted text than to crash
            # code generation.
            try:
                raw_rst = pypandoc.convert_text(
                    text,
                    "rst",
                    format=source_format,
                    extra_args=["--columns=1000"],
                ).strip()
            except Exception:
                raw_rst = text

        _RAW_RST_CACHE[text] = raw_rst

    # 3. Python-side formatting. Literal/code blocks must keep their
    #    exact layout, so only re-wrap when none are present.
    if "::" in raw_rst or ".. code" in raw_rst:
        answer = raw_rst.replace("\n", f"\n{' ' * indent}")
    else:
        answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent)

    return _finalize(answer, nl, indent)
||||||||||||||
| def _finalize(answer, nl, indent): | ||||||||||||||
| """Helper to handle trailing newlines and quotes.""" | ||||||||||||||
| if nl or ("\n" in answer and nl is None): | ||||||||||||||
| answer += "\n" + " " * indent | ||||||||||||||
|
|
||||||||||||||
| # If the text ends in a double-quote, append a period. | ||||||||||||||
| # This ensures that we do not get a parse error when this output is | ||||||||||||||
| # followed by triple-quotes. | ||||||||||||||
| if answer.endswith('"'): | ||||||||||||||
| answer += "." | ||||||||||||||
|
|
||||||||||||||
| # Done; return the answer. | ||||||||||||||
| return answer | ||||||||||||||
| return answer | ||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The regular expression substitution for headings uses a fixed-length underline
(`'=' * 10` and `'-' * 10`). In reStructuredText, the underline for a heading
must be at least as long as the heading text itself. A fixed length may be too
short for longer headings, resulting in invalid RST. You can fix this by using
a lambda function in `re.sub` to dynamically set the underline length based on
the matched heading text's length.