From 4f2147af345141d97ad691fef9aed8a616034042 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 27 Apr 2026 11:37:53 +0800 Subject: [PATCH 1/4] support generating release notes by ai --- scripts/release_notes_ai/__init__.py | 1 + scripts/release_notes_ai/ai_client.py | 296 +++++++ scripts/release_notes_ai/cli.py | 283 ++++++ scripts/release_notes_ai/constants.py | 98 +++ scripts/release_notes_ai/excel_workbook.py | 906 ++++++++++++++++++++ scripts/release_notes_ai/github_client.py | 321 +++++++ scripts/release_notes_ai/markdown_writer.py | 121 +++ scripts/release_notes_ai/models.py | 101 +++ scripts/release_notes_ai/requirements.txt | 3 + scripts/release_notes_ai/scope_filter.py | 366 ++++++++ scripts/release_notes_ai/utils.py | 87 ++ scripts/release_notes_generate_ai.py | 10 + 12 files changed, 2593 insertions(+) create mode 100644 scripts/release_notes_ai/__init__.py create mode 100644 scripts/release_notes_ai/ai_client.py create mode 100644 scripts/release_notes_ai/cli.py create mode 100644 scripts/release_notes_ai/constants.py create mode 100644 scripts/release_notes_ai/excel_workbook.py create mode 100644 scripts/release_notes_ai/github_client.py create mode 100644 scripts/release_notes_ai/markdown_writer.py create mode 100644 scripts/release_notes_ai/models.py create mode 100644 scripts/release_notes_ai/requirements.txt create mode 100644 scripts/release_notes_ai/scope_filter.py create mode 100644 scripts/release_notes_ai/utils.py create mode 100644 scripts/release_notes_generate_ai.py diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py new file mode 100644 index 0000000000000..65f7e128c779b --- /dev/null +++ b/scripts/release_notes_ai/__init__.py @@ -0,0 +1 @@ +"""Helpers for generating TiDB release notes with AI.""" diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py new file mode 100644 index 0000000000000..503e28b63023b --- /dev/null +++ b/scripts/release_notes_ai/ai_client.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import dataclasses +from functools import lru_cache +import json +import os +import shlex +import shutil +import subprocess +import tempfile +import textwrap +from pathlib import Path +from typing import Any + +from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .models import GeneratedNote, RowContext + + +class AIClient: + def __init__(self, command: str, model: str | None, timeout: int): + self.command = shlex.split(command) + self.model = model + self.timeout = timeout + + def generate(self, prompt: str, expected_links: list[str], contributors: list[str]) -> GeneratedNote: + result, errors = self._run_and_validate(prompt, expected_links, contributors) + if result: + return result + + repair_prompt = build_repair_prompt(prompt, errors) + result, repair_errors = self._run_and_validate(repair_prompt, expected_links, contributors) + if result: + return result + raise ValueError("; ".join(repair_errors)) + + def _run_and_validate( + self, prompt: str, expected_links: list[str], contributors: list[str] + ) -> tuple[GeneratedNote | None, list[str]]: + output = self._run(prompt) + try: + data = extract_json_object(output) + except ValueError as exc: + return None, [str(exc)] + return validate_ai_response(data, expected_links, contributors) + + def _run(self, prompt: str) -> str: + command = list(self.command) + if not command: + raise ValueError("AI command is empty. 
Pass a command with --ai-command.") + if not is_executable_available(command[0]): + raise FileNotFoundError( + f"AI command executable not found: {command[0]!r}. " + "Install it or pass a custom command with --ai-command." + ) + + with tempfile.TemporaryDirectory() as temp_dir: + output_path: Path | None = None + if self._is_codex_exec(command): + if self.model: + command.extend(["-m", self.model]) + temp_path = Path(temp_dir) + schema_path = temp_path / "ai-output-schema.json" + output_path = temp_path / "ai-output.txt" + schema_path.write_text(json.dumps(ai_output_schema()), encoding="utf-8") + output_path.touch() + command.extend(["--output-schema", str(schema_path)]) + command.extend(["--output-last-message", str(output_path)]) + + completed = subprocess.run( + command, + input=prompt, + text=True, + capture_output=True, + timeout=self.timeout, + check=False, + ) + if completed.returncode != 0: + raise RuntimeError( + "AI command failed with exit code " + f"{completed.returncode}: {summarize_process_output(completed)}" + ) + if output_path and output_path.exists(): + last_message = output_path.read_text(encoding="utf-8").strip() + if last_message: + return last_message + return completed.stdout.strip() + + @staticmethod + def _is_codex_exec(command: list[str]) -> bool: + if not command: + return False + executable = Path(command[0]).name + return executable == "codex" and "exec" in command[1:] + + +def is_executable_available(executable: str) -> bool: + if os.sep in executable or (os.altsep and os.altsep in executable): + return Path(executable).exists() + return shutil.which(executable) is not None + + +def ai_output_schema() -> dict[str, Any]: + return { + "type": "object", + "additionalProperties": False, + "required": ["type", "release_note", "needs_review", "reason"], + "properties": { + "type": {"type": "string", "enum": ["improvement", "bug_fix"]}, + "release_note": {"type": "string"}, + "needs_review": {"type": "boolean"}, + "reason": {"type": "string"}, + }, + } + + +def summarize_process_output(completed: subprocess.CompletedProcess[str]) -> str: + parts = [] + if completed.stderr.strip(): + parts.append("stderr:\n" + tail_output(completed.stderr)) + if completed.stdout.strip(): + parts.append("stdout:\n" + tail_output(completed.stdout)) + return "\n\n".join(parts) or "no output" + + +def tail_output(text: str, max_lines: int = 40, max_chars: int = 4000) -> str: + tail = "\n".join(text.strip().splitlines()[-max_lines:]) + if len(tail) > max_chars: + tail = "...[truncated]\n" + tail[-max_chars:] + return tail + + +def build_generation_prompt( + row_context: RowContext, + expected_links: list[str], + contributors: list[str], +) -> str: + improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) + bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) + context = { + "row_number": row_context.row_number, + "component": row_context.component, + "raw_component_from_excel": row_context.raw_component, + "issue_type_from_excel": row_context.issue_type, + "pr_title_from_excel": row_context.pr_title, + "formatted_release_note_from_excel": row_context.formatted_release_note, + "expected_links": expected_links, + "contributors": contributors, + "issues": [dataclasses.asdict(issue) for issue in row_context.issues], + "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], + } + return textwrap.dedent( + f""" + You write exactly one English TiDB release note entry. 
+ + Return only a JSON object with exactly these keys: + - type: "improvement" or "bug_fix" + - release_note: one Markdown bullet that starts with "- " + - needs_review: true or false + - reason: a short reason for the type and wording + + Rules: + - Write from the user's perspective. + - Use the Excel issue_type as a strong signal, but decide the final type from the issue, + PR description, and code changes. + - For improvements, follow the Improvements reference below. + - For bug fixes, follow the Bug fixes reference below. + - Do not end the release note with a period. + - Include every expected link in Markdown release-note style. + - Include every contributor as @[user](https://github.com/user). + - If there is no issue URL, use the PR link as the suffix link. + - Do not expose internal function names unless they are the user-visible behavior. + - If the available context is insufficient, still draft the best note and set needs_review + to true. + + Expected links: + {json.dumps(expected_links, ensure_ascii=False, indent=2)} + + Contributors: + {json.dumps(contributors, ensure_ascii=False, indent=2)} + + Row context: + {json.dumps(context, ensure_ascii=False, indent=2)} + + Improvements reference: + {improvements_reference} + + Bug fixes reference: + {bug_fixes_reference} + """ + ).strip() + + +def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: + return textwrap.dedent( + f""" + Your previous answer did not satisfy the required JSON schema or release-note rules. + + Validation errors: + {json.dumps(errors, ensure_ascii=False, indent=2)} + + Rewrite the answer. Return only the corrected JSON object. + + Original task: + {original_prompt} + """ + ).strip() + + +@lru_cache(maxsize=None) +def load_reference_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Cannot find release-note reference file: {path}. " + "Make sure the repo-local write-review-translate-release-notes skill is present." 
+ ) from exc + + +def extract_json_object(output: str) -> dict[str, Any]: + output = output.strip() + if not output: + raise ValueError("AI command returned no output") + try: + data = json.loads(output) + except json.JSONDecodeError: + candidates = extract_json_object_candidates(output) + if not candidates: + raise ValueError("AI output did not contain a JSON object") from None + required_keys = {"type", "release_note", "needs_review", "reason"} + data = next( + (candidate for candidate in candidates if required_keys <= candidate.keys()), + candidates[0], + ) + if not isinstance(data, dict): + raise ValueError("AI output JSON is not an object") + return data + + +def extract_json_object_candidates(output: str) -> list[dict[str, Any]]: + decoder = json.JSONDecoder() + candidates: list[dict[str, Any]] = [] + for index, char in enumerate(output): + if char != "{": + continue + try: + data, _end = decoder.raw_decode(output[index:]) + except json.JSONDecodeError: + continue + if isinstance(data, dict): + candidates.append(data) + return candidates + + +def validate_ai_response( + data: dict[str, Any], + expected_links: list[str], + contributors: list[str], +) -> tuple[GeneratedNote | None, list[str]]: + errors: list[str] = [] + note_type = data.get("type") + release_note = data.get("release_note") + needs_review = data.get("needs_review") + reason = data.get("reason") + + if note_type not in {"improvement", "bug_fix"}: + errors.append('type must be "improvement" or "bug_fix"') + if not isinstance(release_note, str) or not release_note.startswith("- "): + errors.append('release_note must be a string that starts with "- "') + if isinstance(release_note, str) and release_note.rstrip().endswith("."): + errors.append("release_note must not end with a period") + if not isinstance(needs_review, bool): + errors.append("needs_review must be a boolean") + if not isinstance(reason, str): + errors.append("reason must be a string") + + if isinstance(release_note, str): + for link in expected_links: + if link and link not in release_note: + errors.append(f"release_note is missing expected link: {link}") + for contributor in contributors: + expected = f"@[{contributor}](https://github.com/{contributor})" + if contributor and expected not in release_note: + errors.append(f"release_note is missing contributor: {contributor}") + + if errors: + return None, errors + return ( + GeneratedNote( + note_type=str(note_type), + release_note=str(release_note).strip(), + needs_review=bool(needs_review), + reason=str(reason).strip(), + ), + [], + ) diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py new file mode 100644 index 0000000000000..ee1d79a074c4a --- /dev/null +++ b/scripts/release_notes_ai/cli.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +import argparse +import os +import tempfile +from pathlib import Path + +import openpyxl + +from .ai_client import AIClient +from .excel_workbook import ( + clear_output_columns, + generate_notes_without_ai, + generate_notes_for_sheet, + merge_rows_by_issue_and_component, + prepare_sheet_columns, + sort_sheet_rows_by_component, + store_existing_release_notes, + update_pr_authors_and_dup_notes, +) +from .github_client import GitHubClient +from .markdown_writer import write_release_file +from .scope_filter import move_prs_not_in_scope, parse_date_value + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate English release notes with AI from a tirelease workbook." 
+ ) + parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") + parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.") + parser.add_argument( + "--releases-dir", + required=True, + help="Path to the existing English release notes directory.", + ) + parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") + parser.add_argument("--github-token-file", help="Path to a GitHub token file.") + parser.add_argument( + "--ai-command", + default="codex --ask-for-approval never exec --sandbox read-only --ephemeral", + help="Command-line AI command. The prompt is passed through stdin.", + ) + parser.add_argument( + "--ai-model", + default="gpt-5.4", + help="Model name passed to codex exec with -m.", + ) + parser.add_argument( + "--involve-ai-generation", + type=parse_on_off, + default="ON", + help=( + "Whether to use AI for non-dup release notes. Use ON to generate with AI, " + "or OFF to output the original formated_release_note values. Default: ON." + ), + ) + parser.add_argument( + "--output-release-file", + help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.", + ) + parser.add_argument( + "--ai-timeout", + type=int, + default=600, + help="Timeout in seconds for each AI command invocation.", + ) + parser.add_argument( + "--ai-workers", + type=int, + default=3, + help=( + "Number of concurrent AI command invocations. The default is conservative " + "for codex exec subprocesses." + ), + ) + parser.add_argument( + "--github-workers", + type=int, + default=8, + help="Number of concurrent GitHub API prefetch workers.", + ) + parser.add_argument( + "--author-workers", + type=int, + default=3, + help="Number of concurrent workers used to resolve bot-authored cherry-pick PR authors.", + ) + parser.add_argument( + "--checkpoint-interval", + type=int, + default=1, + help=( + "Save the Excel workbook after every N completed AI rows. " + "Default: 1. Use 0 to disable." + ), + ) + parser.add_argument( + "--force-regenerate", + action="store_true", + help="Clear existing AI release notes and regenerate all non-dup rows.", + ) + parser.add_argument( + "--release-date", + default="TBD", + help='Release date text for the Markdown header, for example "August 14, 2025".', + ) + parser.add_argument( + "--skip-scope-preprocess", + action="store_true", + help="Skip moving not-in-scope PR rows to the PRs_not_in_scope sheet.", + ) + parser.add_argument( + "--scope-base-branch-start-date", + help=( + "Override the estimated release-m.n branch start date for x.y.0 scope " + "preprocessing, in YYYY-MM-DD format." 
+ ), + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + validate_positive_int("--ai-workers", args.ai_workers) + validate_positive_int("--github-workers", args.github_workers) + validate_positive_int("--author-workers", args.author_workers) + if args.checkpoint_interval < 0: + raise ValueError("--checkpoint-interval must be greater than or equal to 0") + base_branch_start_date = None + if args.scope_base_branch_start_date: + base_branch_start_date = parse_date_value(args.scope_base_branch_start_date) + if not base_branch_start_date: + raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format") + + token = load_github_token(args.github_token_file) + github = GitHubClient(token) + involve_ai_generation = args.involve_ai_generation == "ON" + ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None + + output_file = ( + Path(args.output_release_file) + if args.output_release_file + else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md" + ) + + excel_path = Path(args.excel) + processed_excel_path = default_processed_excel_path(excel_path) + workbook = openpyxl.load_workbook(excel_path) + if args.sheet not in workbook.sheetnames: + raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}") + sheet = workbook[args.sheet] + if not args.skip_scope_preprocess: + move_prs_not_in_scope( + workbook, + sheet, + args.version, + Path(args.releases_dir), + github, + base_branch_start_date=base_branch_start_date, + ) + sort_sheet_rows_by_component(sheet) + header = prepare_sheet_columns(sheet) + clear_output_columns(sheet, header, clear_ai=args.force_regenerate) + + existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + update_pr_authors_and_dup_notes( + sheet, + header, + existing_notes, + github, + author_workers=args.author_workers, + ) + merge_rows_by_issue_and_component(sheet, header) + + if involve_ai_generation: + checkpoint_callback = build_checkpoint_callback( + workbook, + processed_excel_path, + args.checkpoint_interval, + ) + markdown_entries = generate_notes_for_sheet( + sheet, + header, + github, + ai, + ai_workers=args.ai_workers, + github_workers=args.github_workers, + checkpoint_callback=checkpoint_callback, + ) + else: + markdown_entries = generate_notes_without_ai(sheet, header) + save_workbook_safely(workbook, processed_excel_path) + write_release_file(output_file, args.version, args.release_date, markdown_entries) + + print(f"Original Excel workbook unchanged: {excel_path}", flush=True) + print(f"Processed Excel workbook: {processed_excel_path}", flush=True) + print(f"Generated release note file: {output_file}", flush=True) + return 0 + + +def validate_positive_int(name: str, value: int) -> None: + if value < 1: + raise ValueError(f"{name} must be greater than or equal to 1") + + +def parse_on_off(value: str) -> str: + normalized = value.strip().upper() + if normalized not in {"ON", "OFF"}: + raise argparse.ArgumentTypeError("value must be ON or OFF") + return normalized + + +def default_processed_excel_path(excel_path: Path) -> Path: + return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}") + + +def build_checkpoint_callback( + workbook: openpyxl.Workbook, + excel_path: Path, + checkpoint_interval: int, +): + if checkpoint_interval <= 0: + return None + + def checkpoint(completed: int, total: int) -> None: + if completed % checkpoint_interval != 0 and completed != total: + return + save_workbook_safely(workbook, 
excel_path) + print( + f"Checkpoint saved after {completed}/{total} AI row(s): {excel_path}", + flush=True, + ) + + return checkpoint + + +def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None: + excel_path = excel_path.resolve() + temp_file = tempfile.NamedTemporaryFile( + prefix=f".{excel_path.stem}.", + suffix=excel_path.suffix, + dir=excel_path.parent, + delete=False, + ) + temp_path = Path(temp_file.name) + temp_file.close() + saved_temp = False + try: + workbook.save(temp_path) + saved_temp = True + os.replace(temp_path, excel_path) + except Exception as exc: + if saved_temp and temp_path.exists(): + raise RuntimeError( + f"Failed to replace {excel_path}: {exc}. " + f"A complete temporary workbook remains at {temp_path}." + ) from exc + temp_path.unlink(missing_ok=True) + raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc + + +def load_github_token(token_file: str | None) -> str | None: + import shutil + import subprocess + + if token_file: + return Path(token_file).read_text(encoding="utf-8").strip() + if os.environ.get("GITHUB_TOKEN"): + return os.environ["GITHUB_TOKEN"].strip() + gh = shutil.which("gh") + if not gh: + return None + completed = subprocess.run( + [gh, "auth", "token"], + text=True, + capture_output=True, + timeout=10, + check=False, + ) + if completed.returncode == 0 and completed.stdout.strip(): + return completed.stdout.strip() + return None diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py new file mode 100644 index 0000000000000..c3e947167a23b --- /dev/null +++ b/scripts/release_notes_ai/constants.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import re +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +IMPROVEMENTS_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "improvements.md" +) +BUG_FIXES_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "bug-fixes.md" +) + +BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} +# Keep the misspelled source column name because tirelease exports it this way. 
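+# prepare_sheet_columns() fails fast with a ValueError when any of these
+# headers is missing from the worksheet.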
+REQUIRED_HEADERS = { + "pr_author", + "pr_link", + "pr_title", + "formated_release_note", + "issue_type", +} +COMPONENT_HEADERS = ("component", "components") + +GITHUB_ITEM_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/" + r"(?Pissues|pull)/(?P\d+)" +) +ISSUE_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/issues/(?P\d+)" +) +PR_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/pull/(?P\d+)" +) +AUTHOR_RE = re.compile(r"@\[([^\]]+)\]") + +TOP_LEVEL_COMPONENTS = ["TiDB", "TiKV", "PD", "TiFlash", "TiProxy"] +TOOL_COMPONENTS = [ + "Backup & Restore (BR)", + "TiCDC", + "TiDB Data Migration (DM)", + "TiDB Lightning", + "Dumpling", + "TiUP", + "TiDB Binlog", + "sync-diff-inspector", +] +COMPONENT_ALIASES = { + "tidb": "TiDB", + "tikv": "TiKV", + "pd": "PD", + "tiflash": "TiFlash", + "tiproxy": "TiProxy", + "br": "Backup & Restore (BR)", + "backup & restore": "Backup & Restore (BR)", + "backup & restore (br)": "Backup & Restore (BR)", + "cdc": "TiCDC", + "ticdc": "TiCDC", + "dm": "TiDB Data Migration (DM)", + "tidb data migration": "TiDB Data Migration (DM)", + "tidb data migration (dm)": "TiDB Data Migration (DM)", + "tidb lightning": "TiDB Lightning", + "lightning": "TiDB Lightning", + "dumpling": "Dumpling", + "tiup": "TiUP", + "tidb binlog": "TiDB Binlog", + "ng monitoring": "TiDB", + "sync_diff": "sync-diff-inspector", + "sync-diff-inspector": "sync-diff-inspector", + "sync diff inspector": "sync-diff-inspector", + "planner": "TiDB", + "execution": "TiDB", + "sql-infra": "TiDB", + "transaction": "TiDB", + "engine": "TiDB", + "observability": "TiDB", + "dxf": "TiDB", + "storage": "TiDB", + "tidb-dashboard": "TiDB", + "tidb dashboard": "TiDB", + "ddl": "TiDB", + "coprocessor": "TiDB", + "compute": "TiDB", + "scheduling": "TiDB", + "spm": "TiDB", + "ng-monitoring": "TiDB", +} diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py new file mode 100644 index 0000000000000..260b4b807d04e --- /dev/null +++ b/scripts/release_notes_ai/excel_workbook.py @@ -0,0 +1,906 @@ +from __future__ import annotations + +import copy +import re +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from collections import OrderedDict +from pathlib import Path +from typing import Any, Callable + +from openpyxl.styles import PatternFill + +from .ai_client import build_generation_prompt +from .constants import ( + AUTHOR_RE, + BOT_AUTHORS, + COMPONENT_HEADERS, + GITHUB_ITEM_URL_RE, + REQUIRED_HEADERS, + TOOL_COMPONENTS, + TOP_LEVEL_COMPONENTS, +) +from .models import ( + ExistingNote, + GitHubDataCache, + MarkdownEntry, + RowContext, + RowGenerationResult, + RowInput, +) +from .utils import ( + extract_issue_urls, + extract_pr_urls, + normalize_component, + normalize_raw_component, + normalized_release_component, + replace_author_markdown, + split_lines, + split_multi_value, + str_value, + unique_ordered, +) + + +GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") + + +def prepare_sheet_columns(sheet: Any) -> dict[str, int]: + header = get_header(sheet) + missing = sorted(REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError(f"Missing required Excel columns: {', '.join(missing)}") + get_component_col(header) + + ai_col = header.get("release_notes_written_by_ai") + formatted_col = header["formated_release_note"] + if not ai_col: + sheet.insert_cols(formatted_col + 1) + sheet.cell(row=1, column=formatted_col + 1, 
value="release_notes_written_by_ai") + header = get_header(sheet) + + if "published_release_notes" not in header: + last_col = sheet.max_column + sheet.cell(row=1, column=last_col + 1, value="published_release_notes") + header = get_header(sheet) + return header + + +def get_header(sheet: Any) -> dict[str, int]: + header: dict[str, int] = {} + for index, cell in enumerate(sheet[1], start=1): + if cell.value: + header[str(cell.value).strip()] = index + return header + + +def clear_output_columns(sheet: Any, header: dict[str, int], clear_ai: bool = True) -> None: + for row_number in range(2, sheet.max_row + 1): + if clear_ai: + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None + sheet.cell(row=row_number, column=header["published_release_notes"]).value = None + + +def sort_sheet_rows_by_component(sheet: Any) -> None: + header = get_header(sheet) + component_col = get_component_col(header) + if sheet.max_row <= 2: + return + + snapshots = [ + (row_number, component_sort_key(sheet.cell(row=row_number, column=component_col).value), snapshot_row(sheet, row_number)) + for row_number in range(2, sheet.max_row + 1) + ] + sorted_snapshots = sorted(snapshots, key=lambda item: item[1]) + if [row_number for row_number, _key, _snapshot in snapshots] == [ + row_number for row_number, _key, _snapshot in sorted_snapshots + ]: + return + + for target_row, (_source_row, _key, snapshot) in enumerate(sorted_snapshots, start=2): + restore_row(sheet, target_row, snapshot) + + print("Sorted worksheet rows by component before release-note generation", flush=True) + + +def component_sort_key(value: Any) -> tuple[int, str]: + component = normalize_raw_component(value) + if not component: + return (1, "") + return (0, component.casefold()) + + +def snapshot_row(sheet: Any, row_number: int) -> dict[str, Any]: + row_dimension = sheet.row_dimensions[row_number] + return { + "height": row_dimension.height, + "hidden": row_dimension.hidden, + "outline_level": row_dimension.outlineLevel, + "collapsed": row_dimension.collapsed, + "cells": [snapshot_cell(sheet.cell(row=row_number, column=column)) for column in range(1, sheet.max_column + 1)], + } + + +def snapshot_cell(cell: Any) -> dict[str, Any]: + return { + "value": cell.value, + "style": copy.copy(cell._style), + "number_format": cell.number_format, + "hyperlink": copy.copy(cell.hyperlink) if cell.hyperlink else None, + "comment": copy.copy(cell.comment) if cell.comment else None, + } + + +def restore_row(sheet: Any, row_number: int, snapshot: dict[str, Any]) -> None: + row_dimension = sheet.row_dimensions[row_number] + row_dimension.height = snapshot["height"] + row_dimension.hidden = snapshot["hidden"] + row_dimension.outlineLevel = snapshot["outline_level"] + row_dimension.collapsed = snapshot["collapsed"] + for column, cell_snapshot in enumerate(snapshot["cells"], start=1): + cell = sheet.cell(row=row_number, column=column) + cell.value = cell_snapshot["value"] + cell._style = copy.copy(cell_snapshot["style"]) + cell.number_format = cell_snapshot["number_format"] + cell._hyperlink = copy.copy(cell_snapshot["hyperlink"]) if cell_snapshot["hyperlink"] else None + cell.comment = copy.copy(cell_snapshot["comment"]) if cell_snapshot["comment"] else None + + +def get_component_col(header: dict[str, int]) -> int: + for name in COMPONENT_HEADERS: + if name in header: + return header[name] + raise ValueError("Missing required Excel column: component or components") + + +def issue_urls_for_row(sheet: Any, header: dict[str, int], row_number: 
int) -> list[str]: + candidates: list[str] = [] + if "issue_url" in header: + candidates.append(str_value(sheet.cell(row=row_number, column=header["issue_url"]).value)) + candidates.append(str_value(sheet.cell(row=row_number, column=header["formated_release_note"]).value)) + return unique_ordered(url for text in candidates for url in extract_issue_urls(text)) + + +def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str | None: + issue_urls = issue_urls_for_row(sheet, header, row_number) + return issue_urls[0] if issue_urls else None + + +def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]: + existing_notes: list[ExistingNote] = [] + seen: set[tuple[str, tuple[str, ...]]] = set() + target_version = parse_semver_tuple(version) + + for file_path in sorted(releases_dir.rglob("*.md")): + if should_skip_release_file(file_path, target_version): + continue + level1 = level2 = level3 = "" + with file_path.open("r", encoding="utf-8") as file: + for raw_line in file: + line = raw_line.strip() + authors = AUTHOR_RE.findall(line) + item_url = GITHUB_ITEM_URL_RE.search(line) + if item_url: + key = (item_url.group(), tuple(authors)) + if key in seen: + continue + seen.add(key) + note_level = level1 + level2 + level3 + note_type, component = classify_note_level(note_level) + existing_notes.append( + ExistingNote( + url=item_url.group(), + line=line, + file_name=file_path.name, + note_level=note_level, + authors=authors, + note_type=note_type, + component=component, + ) + ) + continue + + heading = parse_release_note_heading(raw_line) + if not heading: + continue + heading_level, label = heading + if heading_level == 1: + level1 = "> " + label + level2 = level3 = "" + elif heading_level == 2: + level2 = "> " + label + level3 = "" + elif heading_level == 3: + level3 = "> " + label + return existing_notes + + +def should_skip_release_file(file_path: Path, target_version: tuple[int, int, int]) -> bool: + if "updated-by-ai" in file_path.stem: + return True + file_version = release_file_semver_tuple(file_path) + if not file_version: + return False + return file_version >= target_version + + +def parse_semver_tuple(version: str) -> tuple[int, int, int]: + match = re.match(r"^(?P\d+)\.(?P\d+)\.(?P\d+)", version) + if not match: + raise ValueError(f"Invalid TiDB version: {version}") + return ( + int(match.group("major")), + int(match.group("minor")), + int(match.group("patch")), + ) + + +def release_file_semver_tuple(file_path: Path) -> tuple[int, int, int] | None: + match = re.match( + r"^release-(?P\d+)\.(?P\d+)\.(?P\d+)", + file_path.stem, + ) + if not match: + return None + return ( + int(match.group("major")), + int(match.group("minor")), + int(match.group("patch")), + ) + + +def parse_release_note_heading(raw_line: str) -> tuple[int, str] | None: + line = raw_line.rstrip() + section = re.match(r"^##\s+(.+?)\s*$", line) + if section: + return 1, section.group(1).strip() + + top_component = re.match(r"^[+-]\s+(.+?)\s*$", line) + if top_component: + label = top_component.group(1).strip() + if label.lower() == "tools" or normalized_release_component(label): + return 2, label + + tool_component = re.match(r"^ {4}[+-]\s+(.+?)\s*$", line) + if tool_component: + label = tool_component.group(1).strip() + if normalized_release_component(label): + return 3, label + return None + + +def update_pr_authors_and_dup_notes( + sheet: Any, + header: dict[str, int], + existing_notes: list[ExistingNote], + github: Any, + author_workers: int = 1, +) -> None: + 
apply_bot_author_replacements(sheet, header, github, author_workers) + existing_notes_by_url = index_existing_notes_by_url(existing_notes) + + for row_number in range(2, sheet.max_row + 1): + author_cell = sheet.cell(row=row_number, column=header["pr_author"]) + current_author = str_value(author_cell.value) + + issue_url = first_issue_url_for_row(sheet, header, row_number) + if not issue_url: + continue + + current_authors = split_multi_value(current_author) + dup_notes = [] + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) + + if dup_notes: + dup_col = header["published_release_notes"] + sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) + fill_row(sheet, row_number) + print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + + +def apply_bot_author_replacements( + sheet: Any, + header: dict[str, int], + github: Any, + author_workers: int, +) -> None: + requests = bot_author_requests(sheet, header) + if not requests: + return + print( + f"Resolving {len(requests)} bot-authored PR row(s) with {author_workers} worker(s)", + flush=True, + ) + + replacements = resolve_bot_author_replacements(requests, github, author_workers) + for row_number in sorted(replacements): + current_author, actual_author = replacements[row_number] + author_cell = sheet.cell(row=row_number, column=header["pr_author"]) + formatted_cell = sheet.cell(row=row_number, column=header["formated_release_note"]) + formatted_note = str_value(formatted_cell.value) + print( + f"Replacing bot author in row {row_number}: {current_author} -> {actual_author}", + flush=True, + ) + author_cell.value = actual_author + formatted_cell.value = replace_author_markdown( + formatted_note, current_author, actual_author + ) + + +def bot_author_requests(sheet: Any, header: dict[str, int]) -> list[tuple[int, str, str, str]]: + requests = [] + for row_number in range(2, sheet.max_row + 1): + current_author = str_value(sheet.cell(row=row_number, column=header["pr_author"]).value) + pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value) + if current_author not in BOT_AUTHORS or not pr_link: + continue + pr_title = str_value(sheet.cell(row=row_number, column=header["pr_title"]).value) + requests.append((row_number, pr_link, pr_title, current_author)) + return requests + + +def resolve_bot_author_replacements( + requests: list[tuple[int, str, str, str]], + github: Any, + author_workers: int, +) -> dict[int, tuple[str, str]]: + replacements: dict[int, tuple[str, str]] = {} + total = len(requests) + if author_workers == 1: + for completed, request in enumerate(requests, start=1): + row_number, pr_link, pr_title, current_author = request + actual_author = resolve_bot_author(github, request) + print_bot_author_progress(completed, total, row_number, current_author, actual_author) + if actual_author != current_author: + replacements[row_number] = (current_author, actual_author) + return replacements + + with ThreadPoolExecutor(max_workers=author_workers) as executor: + futures = { + executor.submit(resolve_bot_author, github, request): request + for request in requests + } + for completed, future in enumerate(as_completed(futures), start=1): + row_number, _pr_link, _pr_title, current_author = futures[future] + actual_author = future.result() + print_bot_author_progress(completed, total, row_number, current_author, actual_author) 
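+            # as_completed() yields futures in completion order, not row order;
+            # only rows whose resolved author differs from the bot login are recorded.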
+ if actual_author != current_author: + replacements[row_number] = (current_author, actual_author) + return replacements + + +def print_bot_author_progress( + completed: int, + total: int, + row_number: int, + current_author: str, + actual_author: str, +) -> None: + status = "unchanged" if actual_author == current_author else f"{current_author} -> {actual_author}" + print( + f"Resolved bot author {completed}/{total}: row {row_number} ({status})", + flush=True, + ) + + +def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str: + row_number, pr_link, pr_title, current_author = request + try: + return github.get_original_author_for_cherry_pick( + row_number, + pr_link, + pr_title, + current_author, + ) + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to resolve bot author for {pr_link}: {exc}", + file=sys.stderr, + flush=True, + ) + return current_author + + +def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]: + indexed: dict[str, list[ExistingNote]] = {} + for existing in existing_notes: + indexed.setdefault(existing.url, []).append(existing) + return indexed + + +def merge_rows_by_issue_and_component(sheet: Any, header: dict[str, int]) -> None: + groups: OrderedDict[tuple[str, str], list[int]] = OrderedDict() + component_col = get_component_col(header) + for row_number in range(2, sheet.max_row + 1): + issue_url = first_issue_url_for_row(sheet, header, row_number) + if not issue_url: + continue + component = normalize_raw_component(sheet.cell(row=row_number, column=component_col).value) + if not component: + continue + groups.setdefault((issue_url, component), []).append(row_number) + + rows_to_delete: list[int] = [] + for (_issue_url, _component), rows in groups.items(): + if len(rows) <= 1: + continue + keep_row = rows[0] + merge_pr_links(sheet, header, keep_row, rows) + merge_authors(sheet, header, keep_row, rows) + merge_dup_notes(sheet, header, keep_row, rows) + fill_first_empty_values(sheet, header, keep_row, rows) + if str_value(sheet.cell(row=keep_row, column=header["published_release_notes"]).value): + fill_row(sheet, keep_row) + rows_to_delete.extend(rows[1:]) + + for row_number in sorted(rows_to_delete, reverse=True): + sheet.delete_rows(row_number, 1) + + +def merge_pr_links(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + links: list[str] = [] + for row in rows: + links.extend(split_multi_value(sheet.cell(row=row, column=header["pr_link"]).value)) + sheet.cell(row=keep_row, column=header["pr_link"], value=", ".join(unique_ordered(links))) + + +def merge_authors(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + authors: list[str] = [] + for row in rows: + authors.extend(split_multi_value(sheet.cell(row=row, column=header["pr_author"]).value)) + sheet.cell(row=keep_row, column=header["pr_author"], value=", ".join(unique_ordered(authors))) + + +def merge_dup_notes(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + notes: list[str] = [] + for row in rows: + notes.extend(split_lines(sheet.cell(row=row, column=header["published_release_notes"]).value)) + if notes: + sheet.cell(row=keep_row, column=header["published_release_notes"], value="\n".join(unique_ordered(notes))) + + +def fill_first_empty_values(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None: + columns_to_skip = { + header["pr_link"], + header["pr_author"], + header["published_release_notes"], + 
header["release_notes_written_by_ai"], + } + for col in range(1, sheet.max_column + 1): + if col in columns_to_skip: + continue + keep_cell = sheet.cell(row=keep_row, column=col) + if str_value(keep_cell.value): + continue + for row in rows[1:]: + value = sheet.cell(row=row, column=col).value + if str_value(value): + keep_cell.value = value + break + + +def generate_notes_for_sheet( + sheet: Any, + header: dict[str, int], + github: Any, + ai: Any, + ai_workers: int = 1, + github_workers: int = 1, + checkpoint_callback: Callable[[int, int], None] | None = None, +) -> list[MarkdownEntry]: + entries_by_row: dict[int, list[MarkdownEntry]] = {} + row_inputs = [ + build_row_input(sheet, header, row_number) + for row_number in range(2, sheet.max_row + 1) + ] + rows_to_generate: list[RowInput] = [] + + for row_input in row_inputs: + row_number = row_input.row_number + component = row_input.component + dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value) + if dup_text: + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None + entries_by_row[row_number] = dup_entries_for_row(row_input, dup_text) + continue + + ai_cell = sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]) + expected_links = row_input.issue_urls or row_input.pr_urls + if not expected_links: + ai_cell.value = "AI_GENERATION_FAILED: missing issue URL and PR URL" + continue + + existing_note = str_value(ai_cell.value) + if is_reusable_ai_note(existing_note): + note_type = classify_note_type_from_text(existing_note, row_input.issue_type) + entries_by_row[row_number] = [ + MarkdownEntry( + note_type or "improvement", + component, + existing_note, + row_input.raw_component, + ) + ] + print(f"Row {row_number}: skipped existing AI release note", flush=True) + continue + + rows_to_generate.append(row_input) + + github_cache = prefetch_github_data(rows_to_generate, github, github_workers) + total_to_generate = len(rows_to_generate) + if total_to_generate: + print( + f"Generating AI release notes for {total_to_generate} row(s) " + f"with {ai_workers} worker(s)", + flush=True, + ) + + completed = 0 + with ThreadPoolExecutor(max_workers=ai_workers) as executor: + futures = [ + executor.submit(generate_note_for_row, row_input, github_cache, ai) + for row_input in rows_to_generate + ] + for future in as_completed(futures): + result = future.result() + apply_generation_result(sheet, header, result, entries_by_row) + completed += 1 + if checkpoint_callback: + checkpoint_callback(completed, total_to_generate) + + entries: list[MarkdownEntry] = [] + for row_input in row_inputs: + entries.extend(entries_by_row.get(row_input.row_number, [])) + return entries + + +def generate_notes_without_ai(sheet: Any, header: dict[str, int]) -> list[MarkdownEntry]: + entries: list[MarkdownEntry] = [] + for row_number in range(2, sheet.max_row + 1): + row_input = build_row_input(sheet, header, row_number) + dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value) + if dup_text: + entries.extend(dup_entries_for_row(row_input, dup_text)) + continue + + formatted_notes = split_lines(row_input.formatted_release_note) + if not formatted_notes: + print( + f"Row {row_number}: skipped non-dup row because formated_release_note is empty", + file=sys.stderr, + flush=True, + ) + continue + note_type = classify_note_type_from_text( + row_input.formatted_release_note, + row_input.issue_type, + ) + for note in formatted_notes: + entries.append( + 
MarkdownEntry( + note_type or "improvement", + row_input.component, + note, + row_input.raw_component, + ) + ) + + print( + f"AI generation is OFF; generated Markdown from formated_release_note for {len(entries)} note(s)", + flush=True, + ) + return entries + + +def dup_entries_for_row(row_input: RowInput, dup_text: str) -> list[MarkdownEntry]: + entries: list[MarkdownEntry] = [] + for dup_note in split_lines(dup_text): + note_type = classify_note_type_from_text( + dup_note, + row_input.issue_type, + ) + dup_component = parse_component_from_dup(dup_note) or row_input.component + if note_type in {"improvement", "bug_fix"}: + entries.append( + MarkdownEntry( + note_type, + normalize_component(dup_component), + dup_note, + row_input.raw_component, + ) + ) + return entries + + +def build_row_input(sheet: Any, header: dict[str, int], row_number: int) -> RowInput: + raw_component = normalize_raw_component( + sheet.cell(row=row_number, column=get_component_col(header)).value + ) + return RowInput( + row_number=row_number, + component=release_component_for_row(sheet, header, row_number), + raw_component=raw_component, + issue_type=str_value(sheet.cell(row=row_number, column=header["issue_type"]).value), + pr_title=str_value(sheet.cell(row=row_number, column=header["pr_title"]).value), + pr_authors=split_multi_value(sheet.cell(row=row_number, column=header["pr_author"]).value), + pr_urls=extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)), + issue_urls=issue_urls_for_row(sheet, header, row_number), + formatted_release_note=str_value( + sheet.cell(row=row_number, column=header["formated_release_note"]).value + ), + ) + + +def is_reusable_ai_note(note: str) -> bool: + return bool(note) and not note.startswith("AI_GENERATION_FAILED:") + + +def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers: int) -> GitHubDataCache: + issue_urls = unique_ordered(url for row_input in row_inputs for url in row_input.issue_urls) + pr_urls = unique_ordered(url for row_input in row_inputs for url in row_input.pr_urls) + issues = {} + pulls = {} + + if not issue_urls and not pr_urls: + return GitHubDataCache(issues=issues, pulls=pulls) + + print( + f"Prefetching GitHub data: {len(issue_urls)} issue(s), {len(pr_urls)} PR(s) " + f"with {github_workers} worker(s)", + flush=True, + ) + + with ThreadPoolExecutor(max_workers=github_workers) as executor: + futures = { + executor.submit(github.get_issue, issue_url): ("issue", issue_url) + for issue_url in issue_urls + } + futures.update( + { + executor.submit(github.get_pull, pr_url): ("pull", pr_url) + for pr_url in pr_urls + } + ) + for future in as_completed(futures): + item_type, url = futures[future] + try: + data = future.result() + except Exception as exc: # noqa: BLE001 + print(f"Failed to prefetch GitHub {item_type} {url}: {exc}", file=sys.stderr, flush=True) + continue + if item_type == "issue": + issues[url] = data + else: + pulls[url] = data + return GitHubDataCache(issues=issues, pulls=pulls) + + +def generate_note_for_row( + row_input: RowInput, + github_cache: GitHubDataCache, + ai: Any, +) -> RowGenerationResult: + expected_links = row_input.issue_urls or row_input.pr_urls + row_context = build_row_context_from_cache(row_input, github_cache) + contributors = unique_ordered( + [author for author in row_context.pr_authors if author not in BOT_AUTHORS] + ) + try: + prompt = build_generation_prompt(row_context, expected_links, contributors) + generated = ai.generate(prompt, expected_links, contributors) + 
return RowGenerationResult( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + note_type=generated.note_type, + note=generated.release_note, + error=None, + needs_review=generated.needs_review, + reason=generated.reason, + ) + except Exception as exc: # noqa: BLE001 + return RowGenerationResult( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + note_type=None, + note=None, + error=str(exc), + ) + + +def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCache) -> RowContext: + pr_authors = list(row_input.pr_authors) + issues = [ + github_cache.issues[issue_url] + for issue_url in row_input.issue_urls + if issue_url in github_cache.issues + ] + pulls = [] + for pr_url in row_input.pr_urls: + pull = github_cache.pulls.get(pr_url) + if not pull: + continue + pulls.append(pull) + if pull.author: + pr_authors.append(pull.author) + return RowContext( + row_number=row_input.row_number, + component=row_input.component, + raw_component=row_input.raw_component, + issue_type=row_input.issue_type, + pr_title=row_input.pr_title, + pr_authors=unique_ordered(pr_authors), + pr_urls=row_input.pr_urls, + issue_urls=row_input.issue_urls, + formatted_release_note=row_input.formatted_release_note, + issues=issues, + pulls=pulls, + ) + + +def apply_generation_result( + sheet: Any, + header: dict[str, int], + result: RowGenerationResult, + entries_by_row: dict[int, list[MarkdownEntry]], +) -> None: + ai_cell = sheet.cell(row=result.row_number, column=header["release_notes_written_by_ai"]) + if result.error: + ai_cell.value = f"AI_GENERATION_FAILED: {result.error}" + print( + f"Row {result.row_number}: AI generation failed: {result.error}", + file=sys.stderr, + flush=True, + ) + return + if not result.note or not result.note_type: + ai_cell.value = "AI_GENERATION_FAILED: empty AI generation result" + print( + f"Row {result.row_number}: AI generation failed: empty AI generation result", + file=sys.stderr, + flush=True, + ) + return + + ai_cell.value = result.note + entries_by_row[result.row_number] = [ + MarkdownEntry(result.note_type, result.component, result.note, result.raw_component) + ] + review_marker = " (needs review)" if result.needs_review else "" + print( + f"Row {result.row_number}: generated {result.note_type}{review_marker}: {result.reason}", + flush=True, + ) + + +def release_component_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str: + raw_component = normalize_raw_component( + sheet.cell(row=row_number, column=get_component_col(header)).value + ) + raw_lower = raw_component.lower() + raw_release_component = release_component_from_raw(raw_component) + if raw_release_component: + return raw_release_component + + urls = issue_urls_for_row(sheet, header, row_number) + urls.extend(extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value))) + repos = {match.group("repo").lower() for url in urls for match in [GITHUB_ITEM_URL_RE.search(url)] if match} + + if "pd" in repos: + return "PD" + if "tikv" in repos: + return "TiKV" + if "tiflash" in repos: + return "TiFlash" + if "ng-monitoring" in repos: + return "TiDB" + if "tiup" in repos: + return "TiUP" + if repos.intersection({"tiflow", "ticdc"}): + if "dm" in raw_lower and "cdc" not in raw_lower: + return "TiDB Data Migration (DM)" + return "TiCDC" + if "tidb" in repos: + if "br" in raw_lower: + return "Backup & Restore (BR)" + if "lightning" in raw_lower: + return 
"TiDB Lightning" + if "dumpling" in raw_lower: + return "Dumpling" + return "TiDB" + if "tidb-dashboard" in repos: + return "TiDB" + return normalize_component(raw_component) + + +def release_component_from_raw(raw_component: str) -> str: + normalized_raw = normalize_component(raw_component) + if normalized_raw in TOP_LEVEL_COMPONENTS or normalized_raw in TOOL_COMPONENTS: + return normalized_raw + + token_components = [ + normalize_component(token) + for token in split_multi_value(raw_component) + ] + if not token_components: + return "" + + for component in [ + "Backup & Restore (BR)", + "TiDB Lightning", + "Dumpling", + "TiUP", + "sync-diff-inspector", + ]: + if component in token_components: + return component + + for component in TOP_LEVEL_COMPONENTS: + if component in token_components: + return component + + if "TiDB Data Migration (DM)" in token_components: + return "TiDB Data Migration (DM)" + if "TiCDC" in token_components: + return "TiCDC" + + return "" + + +def classify_note_level(note_level: str) -> tuple[str | None, str | None]: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note_level)] + if not labels: + return None, None + section = labels[0].lower() + note_type = None + if "bug fixes" in section or "error fixes" in section: + note_type = "bug_fix" + elif "improvements" in section: + note_type = "improvement" + + component_labels = labels[1:] + if component_labels and component_labels[0].lower() == "tools": + component_labels = component_labels[1:] + for label in reversed(component_labels): + component = normalized_release_component(label) + if component: + return note_type, component + return note_type, None + + +def classify_note_type_from_text(note: str, issue_type: str) -> str | None: + note_lower = note.lower() + issue_type_lower = issue_type.lower() + if "> bug fixes" in note_lower or "> 错误修复" in note_lower: + return "bug_fix" + if "> improvements" in note_lower or "> 改进提升" in note_lower: + return "improvement" + if "bug" in issue_type_lower or "fix" in issue_type_lower: + return "bug_fix" + if "improvement" in issue_type_lower or "enhancement" in issue_type_lower: + return "improvement" + if note.strip().startswith("- Fix "): + return "bug_fix" + return "improvement" + + +def parse_component_from_dup(note: str) -> str | None: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note)] + cleaned: list[str] = [] + for label in labels: + if " - " in label: + label = label.split(" - ", 1)[0] + cleaned.append(label.strip()) + if len(cleaned) < 2: + return None + return normalized_release_component(cleaned[-1]) + + +def fill_row(sheet: Any, row_number: int) -> None: + for column in range(1, sheet.max_column + 1): + sheet.cell(row=row_number, column=column).fill = copy.copy(GRAY_FILL) diff --git a/scripts/release_notes_ai/github_client.py b/scripts/release_notes_ai/github_client.py new file mode 100644 index 0000000000000..f0f4d1b5e2ff2 --- /dev/null +++ b/scripts/release_notes_ai/github_client.py @@ -0,0 +1,321 @@ +from __future__ import annotations + +import re +import sys +import threading +import time +from typing import Any + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +from .constants import GITHUB_ITEM_URL_RE +from .models import IssueInfo, PullInfo +from .utils import parse_github_url + + +def create_retry_policy() -> Retry: + return Retry( + total=3, + connect=3, + read=3, + status=3, + backoff_factor=1, + status_forcelist=(500, 502, 503, 504), + 
allowed_methods=frozenset(["GET"]), + respect_retry_after_header=True, + raise_on_status=False, + ) + + +class GitHubClient: + def __init__( + self, + token: str | None, + max_rate_limit_retries: int = 3, + max_rate_limit_sleep: int = 600, + ): + self.max_rate_limit_retries = max_rate_limit_retries + self.max_rate_limit_sleep = max_rate_limit_sleep + self.headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + if token: + self.headers["Authorization"] = f"Bearer {token}" + self._thread_local = threading.local() + + def get_session(self) -> requests.Session: + session = getattr(self._thread_local, "session", None) + if session is None: + session = requests.Session() + session.headers.update(self.headers) + adapter = HTTPAdapter(max_retries=create_retry_policy()) + session.mount("https://", adapter) + self._thread_local.session = session + return session + + def get_json(self, api_path: str) -> dict[str, Any]: + data = self.get_api_json(api_path) + if not isinstance(data, dict): + raise ValueError(f"Expected object response from {api_path}") + return data + + def get_api_json(self, api_path: str, params: dict[str, Any] | None = None) -> Any: + return self.get_url_json(f"https://api.github.com{api_path}", params=params) + + def get_url_json(self, url: str, params: dict[str, Any] | None = None) -> Any: + last_response: requests.Response | None = None + for attempt in range(self.max_rate_limit_retries + 1): + response = self.get_session().get(url, params=params, timeout=30) + last_response = response + if self.is_rate_limited(response) and attempt < self.max_rate_limit_retries: + sleep_seconds = self.rate_limit_sleep_seconds(response, attempt) + print( + "GitHub API rate limit reached; retrying in " + f"{sleep_seconds} seconds: {url}", + file=sys.stderr, + flush=True, + ) + time.sleep(sleep_seconds) + continue + response.raise_for_status() + return response.json() + if last_response is not None: + last_response.raise_for_status() + raise RuntimeError(f"GitHub API request failed: {url}") + + def is_rate_limited(self, response: requests.Response) -> bool: + if response.status_code == 429: + return True + if response.status_code != 403: + return False + if response.headers.get("x-ratelimit-remaining") == "0": + return True + message = response.text.lower() + return "rate limit" in message or "abuse detection" in message + + def rate_limit_sleep_seconds(self, response: requests.Response, attempt: int) -> int: + retry_after = response.headers.get("retry-after") + if retry_after and retry_after.isdigit(): + return min(max(int(retry_after), 1), self.max_rate_limit_sleep) + reset = response.headers.get("x-ratelimit-reset") + if reset and reset.isdigit(): + wait_seconds = int(reset) - int(time.time()) + 5 + return min(max(wait_seconds, 1), self.max_rate_limit_sleep) + return min(2 ** attempt, self.max_rate_limit_sleep) + + def get_pull(self, pr_url: str) -> PullInfo: + owner, repo, number = parse_github_url(pr_url, "pull") + pull = self.get_json(f"/repos/{owner}/{repo}/pulls/{number}") + files_summary = self.get_pull_files_summary(owner, repo, number) + return PullInfo( + url=pr_url, + title=str(pull.get("title") or ""), + body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary=files_summary, + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + 
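+    # Label dicts are reduced to plain name strings so the prompt context stays compact.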
+ def get_issue(self, issue_url: str) -> IssueInfo: + owner, repo, number = parse_github_url(issue_url, "issues") + issue = self.get_json(f"/repos/{owner}/{repo}/issues/{number}") + labels = [ + str(label.get("name")) + for label in issue.get("labels", []) + if isinstance(label, dict) and label.get("name") + ] + return IssueInfo( + url=issue_url, + title=str(issue.get("title") or ""), + body=str(issue.get("body") or ""), + labels=labels, + ) + + def get_pull_files_summary( + self, + owner: str, + repo: str, + number: str, + max_files: int = 80, + max_patch_chars: int = 1200, + max_total_chars: int = 60000, + ) -> str: + lines: list[str] = [] + page = 1 + total_chars = 0 + while len(lines) < max_files: + files = self.get_api_json( + f"/repos/{owner}/{repo}/pulls/{number}/files", + params={"per_page": 100, "page": page}, + ) + if not isinstance(files, list) or not files: + break + for item in files: + if len(lines) >= max_files or total_chars >= max_total_chars: + break + if not isinstance(item, dict): + continue + patch = str(item.get("patch") or "") + if len(patch) > max_patch_chars: + patch = patch[:max_patch_chars] + "\n...[patch truncated]" + block = "\n".join( + [ + f"file: {item.get('filename', '')}", + f"status: {item.get('status', '')}", + f"additions: {item.get('additions', 0)}", + f"deletions: {item.get('deletions', 0)}", + "patch:", + patch, + ] + ) + lines.append(block) + total_chars += len(block) + page += 1 + if not lines: + return "No changed-file information is available." + if len(lines) >= max_files: + lines.append("...[file list truncated]") + return "\n\n".join(lines) + + def list_pulls_for_base( + self, + owner: str, + repo: str, + base: str, + state: str = "closed", + max_pages: int = 10, + ) -> list[PullInfo]: + pulls: list[PullInfo] = [] + for page in range(1, max_pages + 1): + data = self.get_api_json( + f"/repos/{owner}/{repo}/pulls", + params={ + "state": state, + "base": base, + "sort": "created", + "direction": "asc", + "per_page": 100, + "page": page, + }, + ) + if not isinstance(data, list) or not data: + break + for pull in data: + if not isinstance(pull, dict): + continue + pulls.append( + PullInfo( + url=str(pull.get("html_url") or ""), + title=str(pull.get("title") or ""), + body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary="", + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + ) + if len(data) < 100: + break + return pulls + + def get_original_author_for_cherry_pick( + self, row_number: int, cp_pr_link: str, cp_pr_title: str, current_author: str + ) -> str: + default_owner, default_repo, _cp_number = parse_github_url(cp_pr_link, "pull") + target_ref = find_original_pr_reference(cp_pr_title, default_owner, default_repo) + if not target_ref: + try: + cp_info = self.get_pull(cp_pr_link) + target_ref = ( + find_original_pr_reference(cp_info.head_ref, default_owner, default_repo) + or find_original_pr_reference(cp_info.title, default_owner, default_repo) + or find_original_pr_reference(cp_info.body, default_owner, default_repo) + ) + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to inspect cherry-pick PR " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + if not target_ref: + print( + f"Row {row_number}: failed to find the original PR for " + f"{cp_pr_link} created by 
{current_author}.", + file=sys.stderr, + ) + return current_author + + target_owner, target_repo, target_number = target_ref + target_pr_link = f"https://github.com/{target_owner}/{target_repo}/pull/{target_number}" + try: + return self.get_pull(target_pr_link).author or current_author + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to find the non-bot author for " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + +def find_original_pr_reference( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + text = text or "" + marker_lines = [ + line + for line in text.splitlines() + if re.search(r"\b(backport|cherry[- ]?pick|original|source|from)\b", line, re.I) + ] + for line in marker_lines: + reference = find_pr_reference_in_text(line, default_owner, default_repo) + if reference: + return reference + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text) + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + branch = re.search(r"(?:^|[/_-])cherry-pick-(?P\d+)(?:\D|$)", text) + if branch: + return default_owner, default_repo, branch.group("number") + + if "\n" not in text and len(text) <= 300: + return find_pr_reference_in_text(text, default_owner, default_repo) + + return None + + +def find_pr_reference_in_text( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + for full_url in GITHUB_ITEM_URL_RE.finditer(text or ""): + if full_url.group("kind") == "pull": + return full_url.group("owner"), full_url.group("repo"), full_url.group("number") + + cross_repo = re.search( + r"(?[\w.-]+)/(?P[\w.-]+)#(?P\d+)\b", + text or "", + ) + if cross_repo: + return cross_repo.group("owner"), cross_repo.group("repo"), cross_repo.group("number") + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text or "") + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + return None diff --git a/scripts/release_notes_ai/markdown_writer.py b/scripts/release_notes_ai/markdown_writer.py new file mode 100644 index 0000000000000..38d02cdf51950 --- /dev/null +++ b/scripts/release_notes_ai/markdown_writer.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path + +from .constants import TOOL_COMPONENTS, TOP_LEVEL_COMPONENTS +from .models import MarkdownEntry +from .utils import normalize_component, str_value + + +def write_release_file( + output_file: Path, + version: str, + release_date: str, + entries: list[MarkdownEntry], +) -> None: + major_minor = ".".join(version.split(".")[:2]) + grouped = group_markdown_entries(entries) + content: list[str] = [ + "---", + f"title: TiDB {version} Release Notes", + f"summary: Learn about the improvements and bug fixes in TiDB {version}.", + "---", + "", + f"# TiDB {version} Release Notes", + "", + f"Release date: {release_date}", + "", + f"TiDB version: {version}", + "", + "Quick access: " + f"[Quick start](https://docs.pingcap.com/tidb/v{major_minor}/quick-start-with-tidb) | " + f"[Production deployment](https://docs.pingcap.com/tidb/v{major_minor}/production-deployment-using-tiup)", + "", + ] + + content.extend(render_section("## Improvements", grouped["improvement"])) + content.append("") + content.extend(render_section("## Bug fixes", grouped["bug_fix"])) + content.append("") + while content and content[-1] == "": + content.pop() + + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text("\n".join(content) + "\n", 
encoding="utf-8") + + +def group_markdown_entries(entries: list[MarkdownEntry]) -> dict[str, dict[str, list[MarkdownEntry]]]: + grouped: dict[str, dict[str, list[MarkdownEntry]]] = { + "improvement": defaultdict(list), + "bug_fix": defaultdict(list), + } + for entry in entries: + if entry.note_type not in grouped: + continue + component = normalize_component(entry.component) or "Other" + grouped[entry.note_type][component].append(entry) + return grouped + + +def render_section(title: str, entries_by_component: dict[str, list[MarkdownEntry]]) -> list[str]: + lines = [title, ""] + top_components = [ + component + for component in TOP_LEVEL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + unknown_top_components = sorted( + component + for component in entries_by_component + if component not in TOP_LEVEL_COMPONENTS + and component not in TOOL_COMPONENTS + and entries_by_component[component] + ) + tool_components = [ + component + for component in TOOL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + + for component in top_components + unknown_top_components: + lines.append(f"+ {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + if tool_components: + lines.append("+ Tools") + lines.append("") + for component in tool_components: + lines.append(f" + {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + while lines and lines[-1] == "": + lines.pop() + return lines + + +def note_with_component_marker(entry: MarkdownEntry) -> str: + note = ensure_release_note_bullet(entry.note) + raw_component = sanitize_component_marker(entry.raw_component) + if not raw_component or "" + + +def ensure_release_note_bullet(note: str) -> str: + note = str_value(note) + if note.startswith("- "): + return note + if note.startswith(("+ ", "* ")): + return "- " + note[2:].lstrip() + return f"- {note}" + + +def sanitize_component_marker(component: str) -> str: + return " ".join(str_value(component).replace("--", "- -").split()) diff --git a/scripts/release_notes_ai/models.py b/scripts/release_notes_ai/models.py new file mode 100644 index 0000000000000..7e89853cb3202 --- /dev/null +++ b/scripts/release_notes_ai/models.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import dataclasses + + +@dataclasses.dataclass +class ExistingNote: + url: str + line: str + file_name: str + note_level: str + authors: list[str] + note_type: str | None + component: str | None + + @property + def dup_text(self) -> str: + return f"- (dup): {self.file_name} {self.note_level} {self.line}" + + +@dataclasses.dataclass +class PullInfo: + url: str + title: str + body: str + author: str + head_ref: str + base_ref: str + files_summary: str + merged_at: str = "" + created_at: str = "" + + +@dataclasses.dataclass +class IssueInfo: + url: str + title: str + body: str + labels: list[str] + + +@dataclasses.dataclass +class GeneratedNote: + note_type: str + release_note: str + needs_review: bool + reason: str + + +@dataclasses.dataclass +class RowContext: + row_number: int + component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + issues: list[IssueInfo] + pulls: list[PullInfo] + + +@dataclasses.dataclass +class RowInput: + row_number: int + 
component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + + +@dataclasses.dataclass +class GitHubDataCache: + issues: dict[str, IssueInfo] + pulls: dict[str, PullInfo] + + +@dataclasses.dataclass +class MarkdownEntry: + note_type: str + component: str + note: str + raw_component: str = "" + + +@dataclasses.dataclass +class RowGenerationResult: + row_number: int + component: str + raw_component: str + note_type: str | None + note: str | None + error: str | None + needs_review: bool = False + reason: str = "" diff --git a/scripts/release_notes_ai/requirements.txt b/scripts/release_notes_ai/requirements.txt new file mode 100644 index 0000000000000..89cfc13a2a578 --- /dev/null +++ b/scripts/release_notes_ai/requirements.txt @@ -0,0 +1,3 @@ +openpyxl>=3.1 +requests>=2.31 +urllib3>=1.26 diff --git a/scripts/release_notes_ai/scope_filter.py b/scripts/release_notes_ai/scope_filter.py new file mode 100644 index 0000000000000..019824068d6e1 --- /dev/null +++ b/scripts/release_notes_ai/scope_filter.py @@ -0,0 +1,366 @@ +from __future__ import annotations + +import copy +import re +from dataclasses import dataclass +from datetime import date, datetime +from pathlib import Path +from typing import Any + +from .excel_workbook import get_header +from .models import PullInfo +from .utils import parse_github_url, str_value + + +OUT_OF_SCOPE_SHEET = "PRs_not_in_scope" +REASON_HEADER = "Reason" +SCOPE_REQUIRED_HEADERS = {"pr_status", "pr_merge_time", "pr_link"} + + +@dataclass(frozen=True) +class Version: + major: int + minor: int + patch: int + + @property + def release_branch(self) -> str: + return f"release-{self.major}.{self.minor}" + + @property + def text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch}" + + @property + def previous_patch_text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch - 1}" + + +@dataclass(frozen=True) +class TimelineRelease: + version: Version + display_version: str + release_date: date + + +@dataclass +class ScopeContext: + version: Version + releases_dir: Path + github: Any + base_branch_start_date: date | None = None + timeline: list[TimelineRelease] | None = None + release_branch_pulls: dict[str, list[PullInfo]] | None = None + + def __post_init__(self) -> None: + if self.timeline is None: + self.timeline = parse_release_timeline(self.releases_dir / "release-timeline.md") + if self.release_branch_pulls is None: + self.release_branch_pulls = {} + + +def move_prs_not_in_scope( + workbook: Any, + sheet: Any, + version: str, + releases_dir: Path, + github: Any, + base_branch_start_date: date | None = None, + target_sheet_name: str = OUT_OF_SCOPE_SHEET, +) -> int: + header = get_header(sheet) + missing = sorted(SCOPE_REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError( + "Missing required Excel columns for scope preprocessing: " + + ", ".join(missing) + ) + + context = ScopeContext( + version=parse_version(version), + releases_dir=releases_dir, + github=github, + base_branch_start_date=base_branch_start_date, + ) + target = ensure_out_of_scope_sheet(workbook, sheet, target_sheet_name) + + rows_to_move: list[tuple[int, str]] = [] + for row_number in range(2, sheet.max_row + 1): + reason = out_of_scope_reason(sheet, header, row_number, context) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason) + + for row_number, 
_reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} before release-note generation", + flush=True, + ) + return len(rows_to_move) + + +def ensure_out_of_scope_sheet(workbook: Any, source_sheet: Any, target_sheet_name: str) -> Any: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if target.max_row == 0 or not target.cell(row=1, column=1).value: + copy_header(source_sheet, target) + else: + ensure_reason_header(source_sheet, target) + return target + + target = workbook.create_sheet(target_sheet_name) + copy_header(source_sheet, target) + return target + + +def copy_header(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + copy_cell(source_sheet.cell(row=1, column=column), target_sheet.cell(row=1, column=column)) + ensure_reason_header(source_sheet, target_sheet) + + +def ensure_reason_header(source_sheet: Any, target_sheet: Any) -> None: + target_sheet.cell(row=1, column=source_sheet.max_column + 1, value=REASON_HEADER) + + +def append_row_with_reason(source_sheet: Any, target_sheet: Any, row_number: int, reason: str) -> None: + target_row = target_sheet.max_row + 1 + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=source_sheet.max_column + 1, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if source_cell.hyperlink: + target_cell._hyperlink = copy.copy(source_cell.hyperlink) + if source_cell.comment: + target_cell.comment = copy.copy(source_cell.comment) + + +def out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + context: ScopeContext, +) -> str | None: + status = str_value(sheet.cell(row=row_number, column=header["pr_status"]).value).lower() + if status != "merged": + return f"PR status is {status or 'empty'}, not merged" + + merge_date = parse_date_value(sheet.cell(row=row_number, column=header["pr_merge_time"]).value) + if not merge_date: + return None + + if context.version.patch >= 1: + previous_date = release_date_for_version(context.timeline or [], context.version.previous_patch_text) + if not previous_date: + raise ValueError( + f"Cannot find release date for previous version {context.version.previous_patch_text} " + "in releases/release-timeline.md" + ) + if merge_date < previous_date: + return ( + f"PR merged on {merge_date.isoformat()}, before previous release " + f"{context.version.previous_patch_text} date {previous_date.isoformat()}" + ) + return None + + return major_release_out_of_scope_reason(sheet, header, row_number, merge_date, context) + + +def major_release_out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + merge_date: date, + context: ScopeContext, +) -> str | None: + latest_zero = latest_released_zero_patch(context.timeline or [], context.version.text) + if not latest_zero: + raise ValueError("Cannot find a previously released x.y.0 version in releases/release-timeline.md") + + if merge_date >= latest_zero.release_date: + return None + + branch_start = context.base_branch_start_date or 
estimated_release_branch_start_date(context, latest_zero)
+    if not branch_start:
+        return None
+    if merge_date < branch_start:
+        return (
+            f"PR merged on {merge_date.isoformat()}, before estimated {latest_zero.version.release_branch} "
+            f"branch start date {branch_start.isoformat()}"
+        )
+
+    pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)
+    cherry_pick = find_release_branch_cherry_pick(context, latest_zero, pr_link)
+    if not cherry_pick:
+        return None
+    cherry_pick_date = parse_date_value(cherry_pick.merged_at)
+    if cherry_pick_date and cherry_pick_date < latest_zero.release_date:
+        return (
+            f"Cherry-pick PR {cherry_pick.url} merged on {cherry_pick_date.isoformat()} "
+            f"before {latest_zero.display_version} release date {latest_zero.release_date.isoformat()}"
+        )
+    return None
+
+
+def estimated_release_branch_start_date(
+    context: ScopeContext,
+    latest_zero: TimelineRelease,
+) -> date | None:
+    branch_pulls = release_branch_pulls(context, latest_zero.version.release_branch)
+    created_dates = [parse_date_value(pull.created_at) for pull in branch_pulls]
+    created_dates = [value for value in created_dates if value]
+    return min(created_dates) if created_dates else None
+
+
+def find_release_branch_cherry_pick(
+    context: ScopeContext,
+    latest_zero: TimelineRelease,
+    pr_link: str,
+) -> PullInfo | None:
+    try:
+        owner, repo, number = parse_github_url(pr_link, "pull")
+    except ValueError:
+        return None
+    if (owner, repo) != ("pingcap", "tidb"):
+        return None
+
+    candidates = []
+    for pull in release_branch_pulls(context, latest_zero.version.release_branch):
+        haystack = "\n".join([pull.title, pull.body, pull.head_ref, pull.url])
+        if references_original_pr(haystack, owner, repo, number, pr_link):
+            candidates.append(pull)
+
+    merged_candidates = [
+        pull for pull in candidates if parse_date_value(pull.merged_at)
+    ]
+    if not merged_candidates:
+        return None
+    return min(
+        merged_candidates,
+        key=lambda pull: parse_date_value(pull.merged_at) or date.max,
+    )
+
+
+def references_original_pr(
+    text: str,
+    owner: str,
+    repo: str,
+    number: str,
+    pr_link: str,
+) -> bool:
+    text = text or ""
+    # Match the exact PR URL, owner/repo#number, a bare #number, or a
+    # cherry-pick branch reference.
+    patterns = [
+        re.escape(pr_link),
+        rf"(?<![\w./]){re.escape(owner)}/{re.escape(repo)}#{number}(?!\d)",
+        rf"(?<!\w)#{number}(?!\d)",
+        rf"cherry-pick-{number}(?!\d)",
+    ]
+    return any(re.search(pattern, text) for pattern in patterns)
+
+
+def release_branch_pulls(context: ScopeContext, branch: str) -> list[PullInfo]:
+    assert context.release_branch_pulls is not None
+    if branch not in context.release_branch_pulls:
+        context.release_branch_pulls[branch] = context.github.list_pulls_for_base(
+            "pingcap",
+            "tidb",
+            branch,
+            state="closed",
+        )
+    return context.release_branch_pulls[branch]
+
+
+def parse_release_timeline(path: Path) -> list[TimelineRelease]:
+    releases: list[TimelineRelease] = []
+    if not path.exists():
+        raise FileNotFoundError(f"Cannot find release timeline: {path}")
+    pattern = re.compile(
+        r"\|\s*\[(?P<version>[^\]]+)\]\([^)]+\)\s*\|\s*(?P<date>\d{4}-\d{2}-\d{2})\s*\|"
+    )
+    for line in path.read_text(encoding="utf-8").splitlines():
+        match = pattern.search(line)
+        if not match:
+            continue
+        try:
+            version = parse_version(match.group("version"))
+        except ValueError:
+            continue
+        release_date = date.fromisoformat(match.group("date"))
+        releases.append(TimelineRelease(version, match.group("version"), release_date))
+    return releases
+
+
+def release_date_for_version(timeline: list[TimelineRelease], version_text: str) -> date | None:
+    for release in timeline:
+        if release.version.text == version_text:
+            return release.release_date
+    return None
+
+
+def latest_released_zero_patch(
+    timeline: list[TimelineRelease],
+    target_version_text: str,
+) -> TimelineRelease | None:
+    zero_patch_releases = [
+        release
+        for release in timeline
+        if release.version.patch == 0 and release.version.text != target_version_text
+    ]
+    if not zero_patch_releases:
+        return None
+    return max(zero_patch_releases, key=lambda release: release.release_date)
+
+
+def parse_version(version: str) -> Version:
+    match = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", version)
+    if not match:
+        raise ValueError(f"Invalid TiDB version: {version}")
+    return Version(
+        major=int(match.group("major")),
+        minor=int(match.group("minor")),
+        patch=int(match.group("patch")),
+    )
+
+
+def parse_date_value(value: Any) -> date | None:
+    if value is None:
+        return None
+    if isinstance(value, datetime):
+        return value.date()
+    if isinstance(value, date):
+        return value
+    text = str_value(value)
+    if not text:
+        return None
+    text = text.replace("Z", "+00:00")
+    try:
+        return datetime.fromisoformat(text).date()
+    except ValueError:
+        pass
+    match = re.search(r"\d{4}-\d{2}-\d{2}", text)
+    if match:
+        return date.fromisoformat(match.group())
+    return None
diff --git a/scripts/release_notes_ai/utils.py b/scripts/release_notes_ai/utils.py
new file mode 100644
index 0000000000000..1c0641787019c
--- /dev/null
+++ b/scripts/release_notes_ai/utils.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from typing import Any, Iterable
+
+from .constants import (
+    COMPONENT_ALIASES,
+    GITHUB_ITEM_URL_RE,
+    ISSUE_URL_RE,
+    PR_URL_RE,
+    TOOL_COMPONENTS,
+    TOP_LEVEL_COMPONENTS,
+)
+
+
+def parse_github_url(url: str, expected_kind: str) -> tuple[str, str, str]:
+    match = GITHUB_ITEM_URL_RE.search(url)
+    if not match:
+        raise ValueError(f"Invalid GitHub URL: {url}")
+    if match.group("kind") != expected_kind:
+        raise ValueError(f"Expected a GitHub {expected_kind} URL, got: {url}")
+    return match.group("owner"), match.group("repo"), match.group("number")
+
+
+def extract_issue_urls(text: str) -> list[str]:
+    return unique_ordered(match.group() for match in ISSUE_URL_RE.finditer(text or ""))
+
+
+def extract_pr_urls(text: str) -> list[str]:
+    return unique_ordered(match.group() for match in PR_URL_RE.finditer(text or ""))
+
+
+def replace_author_markdown(text: str, old_author: str, new_author: str) -> str:
+    text = text or ""
+    return
text.replace( + f"[{old_author}](https://github.com/{old_author}", + f"[{new_author}](https://github.com/{new_author}", + ) + + +def normalize_component(component: str) -> str: + cleaned = " ".join(str_value(component).split()) + if not cleaned: + return "" + return COMPONENT_ALIASES.get(cleaned.lower(), cleaned) + + +def normalize_raw_component(component: Any) -> str: + return " ".join(str_value(component).split()) + + +def normalized_release_component(component: str) -> str | None: + normalized = normalize_component(component) + if normalized in TOP_LEVEL_COMPONENTS or normalized in TOOL_COMPONENTS: + return normalized + return None + + +def split_multi_value(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [item.strip() for item in text.replace("\n", ",").split(",") if item.strip()] + + +def split_lines(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [line.strip() for line in text.splitlines() if line.strip()] + + +def unique_ordered(values: Iterable[str]) -> list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + cleaned = str_value(value) + if not cleaned or cleaned in seen: + continue + seen.add(cleaned) + result.append(cleaned) + return result + + +def str_value(value: Any) -> str: + if value is None: + return "" + return str(value).strip() diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py new file mode 100644 index 0000000000000..5d1e701f56cec --- /dev/null +++ b/scripts/release_notes_generate_ai.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""CLI entry point for generating English TiDB release notes with AI.""" + +from release_notes_ai.cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) From 4583453ca945f965df345d18bbc7941e4ae045fb Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 29 Apr 2026 17:55:48 +0800 Subject: [PATCH 2/4] improve the filter logic and move the prompt to an independent file --- scripts/release_notes_ai/ai_client.py | 83 +++--- scripts/release_notes_ai/cli.py | 8 + scripts/release_notes_ai/constants.py | 3 + scripts/release_notes_ai/excel_workbook.py | 240 ++++++++++++++++-- .../release_notes_ai/prompts/generation.md | 40 +++ scripts/release_notes_generate_ai.py | 37 ++- 6 files changed, 345 insertions(+), 66 deletions(-) create mode 100644 scripts/release_notes_ai/prompts/generation.md diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py index 503e28b63023b..d2770e3fbc56c 100644 --- a/scripts/release_notes_ai/ai_client.py +++ b/scripts/release_notes_ai/ai_client.py @@ -12,7 +12,11 @@ from pathlib import Path from typing import Any -from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .constants import ( + BUG_FIXES_REFERENCE, + GENERATION_PROMPT_TEMPLATE, + IMPROVEMENTS_REFERENCE, +) from .models import GeneratedNote, RowContext @@ -134,6 +138,7 @@ def build_generation_prompt( expected_links: list[str], contributors: list[str], ) -> str: + prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE) improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) context = { @@ -148,46 +153,16 @@ def build_generation_prompt( "issues": [dataclasses.asdict(issue) for issue in row_context.issues], "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], } - return textwrap.dedent( - f""" - You write exactly one English 
TiDB release note entry. - - Return only a JSON object with exactly these keys: - - type: "improvement" or "bug_fix" - - release_note: one Markdown bullet that starts with "- " - - needs_review: true or false - - reason: a short reason for the type and wording - - Rules: - - Write from the user's perspective. - - Use the Excel issue_type as a strong signal, but decide the final type from the issue, - PR description, and code changes. - - For improvements, follow the Improvements reference below. - - For bug fixes, follow the Bug fixes reference below. - - Do not end the release note with a period. - - Include every expected link in Markdown release-note style. - - Include every contributor as @[user](https://github.com/user). - - If there is no issue URL, use the PR link as the suffix link. - - Do not expose internal function names unless they are the user-visible behavior. - - If the available context is insufficient, still draft the best note and set needs_review - to true. - - Expected links: - {json.dumps(expected_links, ensure_ascii=False, indent=2)} - - Contributors: - {json.dumps(contributors, ensure_ascii=False, indent=2)} - - Row context: - {json.dumps(context, ensure_ascii=False, indent=2)} - - Improvements reference: - {improvements_reference} - - Bug fixes reference: - {bug_fixes_reference} - """ - ).strip() + return render_prompt_template( + prompt_template, + { + "EXPECTED_LINKS": json.dumps(expected_links, ensure_ascii=False, indent=2), + "CONTRIBUTORS": json.dumps(contributors, ensure_ascii=False, indent=2), + "ROW_CONTEXT": json.dumps(context, ensure_ascii=False, indent=2), + "IMPROVEMENTS_REFERENCE": improvements_reference, + "BUG_FIXES_REFERENCE": bug_fixes_reference, + }, + ) def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: @@ -206,6 +181,32 @@ def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: ).strip() +def render_prompt_template(template: str, values: dict[str, str]) -> str: + for key, value in values.items(): + template = template.replace(f"{{{{{key}}}}}", value) + return template.strip() + + +@lru_cache(maxsize=None) +def load_prompt_template(path: Path) -> str: + try: + return strip_prompt_template_heading(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Cannot find release-note prompt template: {path}. " + "Make sure scripts/release_notes_ai/prompts/generation.md exists." 
+ ) from exc + + +def strip_prompt_template_heading(template: str) -> str: + lines = template.splitlines() + if lines and lines[0].startswith("# "): + lines = lines[1:] + if lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + @lru_cache(maxsize=None) def load_reference_file(path: Path) -> str: try: diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index ee1d79a074c4a..fdeaccfda3efb 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -13,6 +13,7 @@ generate_notes_without_ai, generate_notes_for_sheet, merge_rows_by_issue_and_component, + move_rows_with_issues_already_in_same_series, prepare_sheet_columns, sort_sheet_rows_by_component, store_existing_release_notes, @@ -164,6 +165,13 @@ def main() -> int: clear_output_columns(sheet, header, clear_ai=args.force_regenerate) existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + move_rows_with_issues_already_in_same_series( + workbook, + sheet, + header, + existing_notes, + args.version, + ) update_pr_authors_and_dup_notes( sheet, header, diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py index c3e947167a23b..89cca90e52d2b 100644 --- a/scripts/release_notes_ai/constants.py +++ b/scripts/release_notes_ai/constants.py @@ -21,6 +21,9 @@ / "references" / "bug-fixes.md" ) +GENERATION_PROMPT_TEMPLATE = ( + REPO_ROOT / "scripts" / "release_notes_ai" / "prompts" / "generation.md" +) BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} # Keep the misspelled source column name because tirelease exports it this way. diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py index 260b4b807d04e..177f28fd64c9f 100644 --- a/scripts/release_notes_ai/excel_workbook.py +++ b/scripts/release_notes_ai/excel_workbook.py @@ -43,6 +43,7 @@ GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") +SAME_SERIES_REASON_HEADER = "reason" def prepare_sheet_columns(sheet: Any) -> dict[str, int]: @@ -168,7 +169,7 @@ def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]: existing_notes: list[ExistingNote] = [] - seen: set[tuple[str, tuple[str, ...]]] = set() + seen: set[tuple[str, tuple[str, ...], str]] = set() target_version = parse_semver_tuple(version) for file_path in sorted(releases_dir.rglob("*.md")): @@ -179,25 +180,26 @@ def store_existing_release_notes(releases_dir: Path, version: str) -> list[Exist for raw_line in file: line = raw_line.strip() authors = AUTHOR_RE.findall(line) - item_url = GITHUB_ITEM_URL_RE.search(line) - if item_url: - key = (item_url.group(), tuple(authors)) - if key in seen: - continue - seen.add(key) + item_urls = [match.group() for match in GITHUB_ITEM_URL_RE.finditer(line)] + if item_urls: note_level = level1 + level2 + level3 note_type, component = classify_note_level(note_level) - existing_notes.append( - ExistingNote( - url=item_url.group(), - line=line, - file_name=file_path.name, - note_level=note_level, - authors=authors, - note_type=note_type, - component=component, + for item_url in item_urls: + key = (item_url, tuple(authors), file_path.name) + if key in seen: + continue + seen.add(key) + existing_notes.append( + ExistingNote( + url=item_url, + line=line, + file_name=file_path.name, + note_level=note_level, + authors=authors, + note_type=note_type, + component=component, + ) ) - ) continue heading = 
parse_release_note_heading(raw_line) @@ -283,22 +285,207 @@ def update_pr_authors_and_dup_notes( author_cell = sheet.cell(row=row_number, column=header["pr_author"]) current_author = str_value(author_cell.value) - issue_url = first_issue_url_for_row(sheet, header, row_number) - if not issue_url: + issue_urls = issue_urls_for_row(sheet, header, row_number) + if not issue_urls: continue current_authors = split_multi_value(current_author) dup_notes = [] - for existing in existing_notes_by_url.get(issue_url, []): - if existing.authors and not set(current_authors).intersection(existing.authors): - continue - dup_notes.append(existing.dup_text) + for issue_url in issue_urls: + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) if dup_notes: dup_col = header["published_release_notes"] sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) fill_row(sheet, row_number) - print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + print( + f"Row {row_number}: found duplicated release note for {', '.join(issue_urls)}", + flush=True, + ) + + +def move_rows_with_issues_already_in_same_series( + workbook: Any, + sheet: Any, + header: dict[str, int], + existing_notes: list[ExistingNote], + version: str, +) -> int: + files_by_issue_url = same_series_release_files_by_issue_url(existing_notes, version) + if not files_by_issue_url: + return 0 + + target_sheet_name = same_series_issues_sheet_name(version) + target, reason_col = ensure_sheet_with_reason(workbook, sheet, target_sheet_name) + rows_to_move: list[tuple[int, str]] = [] + + for row_number in range(2, sheet.max_row + 1): + issue_urls = issue_urls_for_row(sheet, header, row_number) + reason = same_series_issue_reason(issue_urls, files_by_issue_url) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason, reason_col) + + for row_number, _reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} because their issues " + "already appear in earlier release notes from the same major.minor series", + flush=True, + ) + return len(rows_to_move) + + +def same_series_release_files_by_issue_url( + existing_notes: list[ExistingNote], + version: str, +) -> dict[str, list[str]]: + target_version = parse_semver_tuple(version) + files_by_issue_url: dict[str, list[str]] = {} + + for existing in existing_notes: + match = GITHUB_ITEM_URL_RE.search(existing.url) + if not match or match.group("kind") != "issues": + continue + + file_version = release_file_semver_tuple(Path(existing.file_name)) + if not file_version: + continue + if file_version[:2] != target_version[:2] or file_version >= target_version: + continue + + files = files_by_issue_url.setdefault(existing.url, []) + if existing.file_name not in files: + files.append(existing.file_name) + + for issue_url, files in list(files_by_issue_url.items()): + files_by_issue_url[issue_url] = sorted(files, key=release_file_name_sort_key) + return files_by_issue_url + + +def same_series_issues_sheet_name(version: str) -> str: + major, minor, _patch = parse_semver_tuple(version) + return f"issues_already_in_earlier_v{major}.{minor}_notes" + + +def same_series_issue_reason( + issue_urls: list[str], + files_by_issue_url: dict[str, 
list[str]], +) -> str | None: + reasons = [] + for issue_url in issue_urls: + files = files_by_issue_url.get(issue_url) + if files: + reasons.append(f"{issue_url} appears in {', '.join(files)}") + return "; ".join(reasons) if reasons else None + + +def release_file_name_sort_key(file_name: str) -> tuple[int, int, int, str]: + version = release_file_semver_tuple(Path(file_name)) + if not version: + return (sys.maxsize, sys.maxsize, sys.maxsize, file_name) + return (*version, file_name) + + +def ensure_sheet_with_reason( + workbook: Any, + source_sheet: Any, + target_sheet_name: str, +) -> tuple[Any, int]: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if not str_value(target.cell(row=1, column=1).value): + reason_col = copy_header_with_reason(source_sheet, target) + else: + reason_col = ensure_same_series_reason_header(source_sheet, target) + return target, reason_col + + target = workbook.create_sheet(target_sheet_name) + reason_col = copy_header_with_reason(source_sheet, target) + return target, reason_col + + +def copy_header_with_reason(source_sheet: Any, target_sheet: Any) -> int: + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + return ensure_same_series_reason_header(source_sheet, target_sheet) + + +def ensure_same_series_reason_header(source_sheet: Any, target_sheet: Any) -> int: + reason_col = find_header_column(target_sheet, SAME_SERIES_REASON_HEADER) + if not reason_col: + reason_col = max(source_sheet.max_column, target_sheet.max_column) + 1 + copy_missing_header_cells(source_sheet, target_sheet) + target_sheet.cell(row=1, column=reason_col, value=SAME_SERIES_REASON_HEADER) + return reason_col + + while reason_col <= source_sheet.max_column: + target_sheet.insert_cols(reason_col) + reason_col += 1 + + copy_missing_header_cells(source_sheet, target_sheet) + return reason_col + + +def copy_missing_header_cells(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + if not str_value(target_sheet.cell(row=1, column=column).value): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + + +def find_header_column(sheet: Any, header_name: str) -> int | None: + for column in range(1, sheet.max_column + 1): + if str_value(sheet.cell(row=1, column=column).value) == header_name: + return column + return None + + +def append_row_with_reason( + source_sheet: Any, + target_sheet: Any, + row_number: int, + reason: str, + reason_col: int, +) -> None: + target_row = target_sheet.max_row + 1 + source_dimension = source_sheet.row_dimensions[row_number] + target_dimension = target_sheet.row_dimensions[target_row] + target_dimension.height = source_dimension.height + target_dimension.hidden = source_dimension.hidden + target_dimension.outlineLevel = source_dimension.outlineLevel + target_dimension.collapsed = source_dimension.collapsed + + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=reason_col, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if 
source_cell.hyperlink:
+        target_cell._hyperlink = copy.copy(source_cell.hyperlink)
+    if source_cell.comment:
+        target_cell.comment = copy.copy(source_cell.comment)
 
 
 def apply_bot_author_replacements(
@@ -407,7 +594,12 @@ def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str:
 
 def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]:
     indexed: dict[str, list[ExistingNote]] = {}
+    seen: set[tuple[str, tuple[str, ...]]] = set()
     for existing in existing_notes:
+        key = (existing.url, tuple(existing.authors))
+        if key in seen:
+            continue
+        seen.add(key)
         indexed.setdefault(existing.url, []).append(existing)
     return indexed
 
diff --git a/scripts/release_notes_ai/prompts/generation.md b/scripts/release_notes_ai/prompts/generation.md
new file mode 100644
index 0000000000000..8eb5b1e993381
--- /dev/null
+++ b/scripts/release_notes_ai/prompts/generation.md
@@ -0,0 +1,40 @@
+# Generation Prompt
+
+You are a senior technical writer with deep knowledge of TiDB.
+
+Your task is to write exactly one English release note entry for a TiDB issue or PR.
+
+Return only a JSON object with exactly these keys:
+
+- type: "improvement" or "bug_fix"
+- release_note: one Markdown bullet that starts with "- "
+- needs_review: true or false
+- reason: a short reason for the type and wording
+
+Rules:
+
+- Write from the user's perspective.
+- Use the Excel issue_type as a strong signal, but decide the final type from the issue, PR description, and code changes.
+- For improvements, follow the Improvements reference below.
+- For bug fixes, follow the Bug fixes reference below.
+- Do not end the release note with a period.
+- Include every expected link in Markdown release-note style.
+- Include every contributor as @[user](https://github.com/user).
+- If there is no issue URL, use the PR link as the suffix link.
+- Do not expose internal function names unless they are the user-visible behavior.
+- If the available context is insufficient, still draft the best note and set needs_review to true.
+
+Expected links:
+{{EXPECTED_LINKS}}
+
+Contributors:
+{{CONTRIBUTORS}}
+
+Row context:
+{{ROW_CONTEXT}}
+
+Improvements reference:
+{{IMPROVEMENTS_REFERENCE}}
+
+Bug fixes reference:
+{{BUG_FIXES_REFERENCE}}
diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py
index 5d1e701f56cec..bdcb30ba8433b 100644
--- a/scripts/release_notes_generate_ai.py
+++ b/scripts/release_notes_generate_ai.py
@@ -1,7 +1,42 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-"""CLI entry point for generating English TiDB release notes with AI."""
+"""This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release.
+
+What does this script do?
+
+  - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before the previous patch release.
+  - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet.
+  - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue.
+  - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, and the description of the issue.
+  - Map components in the workbook to the corresponding release note components.
+  - Generate the release note file for the target release according to the release note template file.
+
+Typical usage:
+
+    python3 scripts/release_notes_generate_ai.py \
+        --version 8.5.7 \
+        --excel /path/to/tirelease.xlsx \
+        --releases-dir releases \
+        --github-token-file /path/to/github-token.txt
+
+Useful options:
+
+    --involve-ai-generation OFF
+        Skip AI generation and use the source ``formated_release_note`` values
+        for non-duplicate rows.
+
+    --force-regenerate
+        Clear existing AI-generated notes in the processed workbook and generate
+        them again.
+
+    --output-release-file /path/to/release-8.5.7.md
+        Write the generated Markdown to a custom path. By default, the output is
+        ``release-<version>-updated-by-ai.md`` under ``--releases-dir``.
+
+Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option
+list.
+"""
 
 from release_notes_ai.cli import main

From 195da0b95b759c8511ec1130213d4ee50ec68224 Mon Sep 17 00:00:00 2001
From: qiancai
Date: Wed, 6 May 2026 14:59:38 +0800
Subject: [PATCH 3/4] update the naming rule of the release note file

---
 scripts/release_notes_ai/cli.py      | 14 ++++++++++++--
 scripts/release_notes_generate_ai.py |  6 ++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py
index fdeaccfda3efb..7aea9b9ee43db 100644
--- a/scripts/release_notes_ai/cli.py
+++ b/scripts/release_notes_ai/cli.py
@@ -58,7 +58,10 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--output-release-file",
-        help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.",
+        help=(
+            "Output Markdown file. Defaults to release-{version}-updated-by-ai.md "
+            "if release-{version}.md already exists, otherwise release-{version}.md."
+        ),
     )
     parser.add_argument(
         "--ai-timeout",
@@ -142,7 +145,7 @@ def main() -> int:
     output_file = (
         Path(args.output_release_file)
         if args.output_release_file
-        else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md"
+        else default_output_release_file(Path(args.releases_dir), args.version)
     )
 
     excel_path = Path(args.excel)
@@ -219,6 +222,13 @@ def parse_on_off(value: str) -> str:
     return normalized
 
 
+def default_output_release_file(releases_dir: Path, version: str) -> Path:
+    release_file = releases_dir / f"release-{version}.md"
+    if release_file.is_file():
+        return releases_dir / f"release-{version}-updated-by-ai.md"
+    return release_file
+
+
 def default_processed_excel_path(excel_path: Path) -> Path:
     return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}")
 
diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py
index bdcb30ba8433b..03d1ec7f8a59b 100644
--- a/scripts/release_notes_generate_ai.py
+++ b/scripts/release_notes_generate_ai.py
@@ -31,8 +31,10 @@
         them again.
 
     --output-release-file /path/to/release-8.5.7.md
-        Write the generated Markdown to a custom path. By default, the output is
-        ``release-<version>-updated-by-ai.md`` under ``--releases-dir``.
+        Write the generated Markdown to a custom path. By default, the output
+        under ``--releases-dir`` is ``release-<version>-updated-by-ai.md`` if
+        ``release-<version>.md`` already exists, otherwise
+        ``release-<version>.md``.
 
 Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option
 list.
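The new naming rule is easy to check in isolation. Below is a minimal sketch that reuses the `default_output_release_file` helper added in this patch; the temporary directory and version are illustrative only:

```python
import tempfile
from pathlib import Path


# Default output naming rule from this patch: reuse release-<version>.md when
# it does not exist yet, otherwise write a separate *-updated-by-ai.md file
# instead of overwriting the existing release note.
def default_output_release_file(releases_dir: Path, version: str) -> Path:
    release_file = releases_dir / f"release-{version}.md"
    if release_file.is_file():
        return releases_dir / f"release-{version}-updated-by-ai.md"
    return release_file


with tempfile.TemporaryDirectory() as tmp:
    releases_dir = Path(tmp)
    # No existing file yet: the default target is release-8.5.7.md.
    assert default_output_release_file(releases_dir, "8.5.7").name == "release-8.5.7.md"
    # Once release-8.5.7.md exists, the script avoids overwriting it.
    (releases_dir / "release-8.5.7.md").write_text("existing notes", encoding="utf-8")
    assert (
        default_output_release_file(releases_dir, "8.5.7").name
        == "release-8.5.7-updated-by-ai.md"
    )
```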
From 259870d8186173c3a079e96f33c9e28a169d75e1 Mon Sep 17 00:00:00 2001
From: qiancai
Date: Wed, 6 May 2026 16:24:10 +0800
Subject: [PATCH 4/4] add the usage descriptions for the scripts

---
 scripts/release-notes-generator-readme.md | 79 +++++++++++++++++++++++
 scripts/release_notes_ai/__init__.py      |  1 -
 scripts/release_notes_ai/cli.py           | 34 +++-------
 scripts/release_notes_ai/constants.py     |  2 +-
 scripts/release_notes_generate_ai.py      | 35 ++--------
 5 files changed, 94 insertions(+), 57 deletions(-)
 create mode 100644 scripts/release-notes-generator-readme.md
 delete mode 100644 scripts/release_notes_ai/__init__.py

diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md
new file mode 100644
index 0000000000000..d966eaf3b83c4
--- /dev/null
+++ b/scripts/release-notes-generator-readme.md
@@ -0,0 +1,79 @@
+# Release notes generator
+
+`scripts/release_notes_generate_ai.py` generates English TiDB release notes based on the PRs and issues in a specified Excel file.
+
+## What it does
+
+**Scope filtering:**
+
+- Filters out PRs and issues that are not in the target release scope. For example, it filters out PRs that were merged before the previous patch release.
+- Moves issues that already appeared in earlier notes from the same major.minor series to a separate worksheet.
+
+**Duplicate handling:**
+
+- Marks release notes that are already published in other series as `(dup)` and reuses the release notes for the same issue.
+
+**Release note generation:**
+
+- Generates English release notes using AI, based on the release note draft provided in the PR, the PR description and code changes, and the issue description.
+- Maps components in the workbook to the corresponding release note components.
+
+**File output in Markdown:**
+
+- Generates the release note file for the target release according to the release note template file.
+- Adds the improvements and bug fixes of each component to the corresponding sections of the release note file.
+
+## Prerequisites
+
+- Install Python dependencies:
+
+    ```bash
+    python3 -m pip install -r scripts/release_notes_ai/requirements.txt
+    ```
+
+- Prepare a GitHub token with access to the relevant public repositories and set it in the `GITHUB_TOKEN` environment variable:
+
+    ```bash
+    export GITHUB_TOKEN=<your-github-token>
+    ```
+
+- Install and log in to the Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m <model>`.
+
+## Typical usage
+
+```bash
+python3 scripts/release_notes_generate_ai.py \
+    --version 8.5.7 \
+    --excel /path/to/release-note-excel.xlsx \
+    --releases-dir releases
+```
+
+## Option descriptions
+
+| Option | Required | Default value | Usage example | Description |
+| --- | --- | --- | --- | --- |
+| `--version <version>` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. |
+| `--excel <path>` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `<excel-name>_processed.xlsx`. |
+| `--releases-dir <path>` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. |
+| `--sheet <name>` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. |
+| `--ai-command <command>` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. |
+| `--ai-model <model>` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name passed to `codex exec` with `-m`. |
+| `--involve-ai-generation <ON\|OFF>` | No | `ON` | `--involve-ai-generation OFF` | Whether to generate non-duplicate release notes with AI. Use `ON` to invoke AI, or `OFF` to use the source `formated_release_note` values. |
+| `--output-release-file <path>` | No | Conditional | `--output-release-file /path/to/release-8.5.7.md` | Write the generated Markdown to a custom path. By default, the output under `--releases-dir` is `release-<version>-updated-by-ai.md` if `release-<version>.md` already exists, otherwise `release-<version>.md`. |
+| `--ai-timeout <seconds>` | No | `600` | `--ai-timeout 600` | Timeout in seconds for each AI command invocation. |
+| `--ai-workers <count>` | No | `3` | `--ai-workers 3` | Number of concurrent AI command invocations. |
+| `--github-workers <count>` | No | `8` | `--github-workers 8` | Number of concurrent GitHub API prefetch workers. |
+| `--author-workers <count>` | No | `3` | `--author-workers 3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. |
+| `--checkpoint-interval <N>` | No | `1` | `--checkpoint-interval 1` | Save the processed workbook after every N completed AI rows. Use `0` to disable checkpoint saves. |
+| `--force-regenerate` | No | Disabled | `--force-regenerate` | Clear existing AI-generated notes in the processed workbook and generate all non-duplicate rows again. |
+| `--release-date <date>` | No | `TBD` | `--release-date "August 14, 2025"` | Release date text for the generated Markdown header. |
+| `--skip-scope-preprocess` | No | Disabled | `--skip-scope-preprocess` | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. |
+| `--scope-base-branch-start-date <YYYY-MM-DD>` | No | Estimated from release history | `--scope-base-branch-start-date 2025-01-01` | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. The value must use the `YYYY-MM-DD` format. |
+
+## Generated files
+
+- The source Excel file passed to `--excel` is not overwritten.
+- The processed Excel file is written to `<excel-name>_processed.xlsx` next to the source workbook.
+- The generated Markdown file is written to `--output-release-file` when that option is specified.
+- If `--output-release-file` is omitted and `release-<version>.md` already exists under `--releases-dir`, the generated Markdown file is written to `release-<version>-updated-by-ai.md`.
+- If `--output-release-file` is omitted and `release-<version>.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-<version>.md`.
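One detail worth knowing when customizing `scripts/release_notes_ai/prompts/generation.md`: the `{{KEY}}` placeholders are filled by plain string substitution, not by a templating engine. A minimal sketch of that behavior, reusing the `render_prompt_template` helper from `ai_client.py` (the sample template text below is illustrative, not the shipped prompt):

```python
# Sketch of how {{KEY}} placeholders in the prompt template are replaced.
def render_prompt_template(template: str, values: dict[str, str]) -> str:
    for key, value in values.items():
        # f"{{{{{key}}}}}" renders as "{{KEY}}": doubled braces are literal braces.
        template = template.replace(f"{{{{{key}}}}}", value)
    return template.strip()


prompt = render_prompt_template(
    "Expected links:\n{{EXPECTED_LINKS}}\n\nContributors:\n{{CONTRIBUTORS}}",
    {
        "EXPECTED_LINKS": '["https://github.com/pingcap/tidb/issues/12345"]',
        "CONTRIBUTORS": '["example-user"]',
    },
)
assert "{{EXPECTED_LINKS}}" not in prompt and "{{CONTRIBUTORS}}" not in prompt
```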
diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py
deleted file mode 100644
index 65f7e128c779b..0000000000000
--- a/scripts/release_notes_ai/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Helpers for generating TiDB release notes with AI."""
diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py
index 7aea9b9ee43db..b1e913a7bca0d 100644
--- a/scripts/release_notes_ai/cli.py
+++ b/scripts/release_notes_ai/cli.py
@@ -26,7 +26,7 @@
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Generate English release notes with AI from a tirelease workbook."
+        description="Generate English release notes with AI based on the PRs and issues in a specified Excel file."
     )
     parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.")
     parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.")
@@ -36,7 +36,6 @@ def parse_args() -> argparse.Namespace:
         help="Path to the existing English release notes directory.",
     )
     parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.")
-    parser.add_argument("--github-token-file", help="Path to a GitHub token file.")
     parser.add_argument(
         "--ai-command",
@@ -137,7 +136,10 @@ def main() -> int:
         if not base_branch_start_date:
             raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format")
 
-    token = load_github_token(args.github_token_file)
+    try:
+        token = load_github_token()
+    except ValueError as exc:
+        raise SystemExit(f"error: {exc}") from None
     github = GitHubClient(token)
     involve_ai_generation = args.involve_ai_generation == "ON"
     ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None
@@ -278,24 +280,8 @@ def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None:
         raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc
 
 
-def load_github_token(token_file: str | None) -> str | None:
-    import shutil
-    import subprocess
-
-    if token_file:
-        return Path(token_file).read_text(encoding="utf-8").strip()
-    if os.environ.get("GITHUB_TOKEN"):
-        return os.environ["GITHUB_TOKEN"].strip()
-    gh = shutil.which("gh")
-    if not gh:
-        return None
-    completed = subprocess.run(
-        [gh, "auth", "token"],
-        text=True,
-        capture_output=True,
-        timeout=10,
-        check=False,
-    )
-    if completed.returncode == 0 and completed.stdout.strip():
-        return completed.stdout.strip()
-    return None
+def load_github_token() -> str:
+    token = os.environ.get("GITHUB_TOKEN", "").strip()
+    if not token:
+        raise ValueError("GITHUB_TOKEN environment variable is required")
+    return token
diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py
index 89cca90e52d2b..12d0d9ca8dcee 100644
--- a/scripts/release_notes_ai/constants.py
+++ b/scripts/release_notes_ai/constants.py
@@ -26,7 +26,7 @@
 )
 BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"}
 
-# Keep the misspelled source column name because tirelease exports it this way.
+# Keep the misspelled source column name because the release note Excel file exports it this way.
 REQUIRED_HEADERS = {
     "pr_author",
     "pr_link",
diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py
index 03d1ec7f8a59b..0a6d3eb761268 100644
--- a/scripts/release_notes_generate_ai.py
+++ b/scripts/release_notes_generate_ai.py
@@ -1,43 +1,16 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-"""This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release.
-
-What does this script do?
-
-  - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before the previous patch release.
-  - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet.
-  - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue.
-  - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, and the description of the issue.
-  - Map components in the workbook to the corresponding release note components.
-  - Generate the release note file for the target release according to the release note template file.
+"""Generate TiDB improvements and bug fixes for release notes based on the PRs and issues in a specified Excel file.
 
 Typical usage:
 
     python3 scripts/release_notes_generate_ai.py \
         --version 8.5.7 \
-        --excel /path/to/tirelease.xlsx \
-        --releases-dir releases \
-        --github-token-file /path/to/github-token.txt
+        --excel /path/to/release-note-excel.xlsx \
+        --releases-dir releases
 
-Useful options:
-
-    --involve-ai-generation OFF
-        Skip AI generation and use the source ``formated_release_note`` values
-        for non-duplicate rows.
-
-    --force-regenerate
-        Clear existing AI-generated notes in the processed workbook and generate
-        them again.
-
-    --output-release-file /path/to/release-8.5.7.md
-        Write the generated Markdown to a custom path. By default, the output
-        under ``--releases-dir`` is ``release-<version>-updated-by-ai.md`` if
-        ``release-<version>.md`` already exists, otherwise
-        ``release-<version>.md``.
-
-Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option
-list.
+For detailed usage and options, see scripts/release-notes-generator-readme.md.
 """
 
 from release_notes_ai.cli import main
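For reference, an illustrative AI response that satisfies the JSON contract enforced by `ai_output_schema()` and described in `prompts/generation.md` might look like the sketch below; the issue link and wording are invented for the example:

```python
import json

# Illustrative response (not taken from the source): exactly the four keys
# required by ai_output_schema(), with "type" restricted to "improvement"
# or "bug_fix".
example_response = {
    "type": "bug_fix",
    "release_note": (
        "- Fix the issue that an example query returns wrong results in some cases "
        "[#12345](https://github.com/pingcap/tidb/issues/12345)"
    ),
    "needs_review": False,
    "reason": "The linked issue reports a user-visible wrong-result bug",
}
print(json.dumps(example_response, indent=2))
```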