From 684bb958633713db39306236c264ec5659ce4056 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Wed, 8 Apr 2026 17:34:03 +0530 Subject: [PATCH] Add AI-powered release notes reviewer GitHub Action Introduces a GitHub Action that automatically reviews PRs touching release notes files using schema validation, technical accuracy checks, and OpenAI GPT-4o content quality analysis. - Copies core reviewer logic into .github/scripts/release_review/ - Adds standalone runner script (.github/scripts/run_release_review.py) - Adds workflow that triggers on PRs to main touching releases.yml or _includes/releases/** - Posts results as a neutral (advisory) check run and updatable PR comment - Never blocks merge; requires OPENAI_API_KEY repository secret --- .github/scripts/release_review/__init__.py | 5 + .github/scripts/release_review/config.py | 190 ++++++ .../scripts/release_review/github_client.py | 306 ++++++++++ .github/scripts/release_review/reporter.py | 252 ++++++++ .github/scripts/release_review/reviewer.py | 571 ++++++++++++++++++ .github/scripts/release_review/schemas.py | 127 ++++ .github/scripts/run_release_review.py | 47 ++ .github/workflows/release-notes-review.yml | 38 ++ 8 files changed, 1536 insertions(+) create mode 100644 .github/scripts/release_review/__init__.py create mode 100644 .github/scripts/release_review/config.py create mode 100644 .github/scripts/release_review/github_client.py create mode 100644 .github/scripts/release_review/reporter.py create mode 100644 .github/scripts/release_review/reviewer.py create mode 100644 .github/scripts/release_review/schemas.py create mode 100644 .github/scripts/run_release_review.py create mode 100644 .github/workflows/release-notes-review.yml diff --git a/.github/scripts/release_review/__init__.py b/.github/scripts/release_review/__init__.py new file mode 100644 index 00000000000..2efdb19df2d --- /dev/null +++ b/.github/scripts/release_review/__init__.py @@ -0,0 +1,5 @@ +from .schemas import Issue, Links, 
"""
Configuration settings for the release review service.

Loads settings from environment variables and an optional YAML config file.
Environment variables always take precedence over YAML values.
"""
import logging
import os
from pathlib import Path
from typing import Dict, Optional
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)

# Values accepted as "true" when parsing boolean environment variables.
_TRUTHY = ("true", "1", "yes")


def _env_flag(name: str, default: bool) -> bool:
    """Parse a boolean env var; return *default* when the variable is unset."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.lower() in _TRUTHY


@dataclass
class GitHubConfig:
    """GitHub-related configuration."""
    token: str = ""
    check_run_name: str = "Release Notes Review (AI)"
    check_run_title: str = "Advisory Release Notes Review (AI)"
    # GitHub caps check-run annotations per request; see REST API docs.
    max_annotations: int = 50
    max_annotation_message_length: int = 640
    # NOTE(review): the original default was lost when this patch was mangled
    # (it looked like an HTML comment, which got stripped). An empty marker
    # would make find_bot_comment() match *every* comment on the PR, so a
    # concrete hidden marker is restored here — confirm against the original.
    bot_comment_marker: str = "<!-- release-notes-review-bot -->"


@dataclass
class SecurityConfig:
    """Security-related configuration."""
    docs_agent_secret: str = ""
    signature_header: str = "X-Docs-Agent-Signature"
    idempotency_header: str = "X-Idempotency-Key"


@dataclass
class StoreConfig:
    """Storage configuration."""
    db_path: str = "docs_agent.db"
    stale_job_ttl_seconds: int = 3600  # 1 hour


@dataclass
class FeatureFlags:
    """Feature flags for the service."""
    post_comments: bool = True
    post_check_runs: bool = True


@dataclass
class SeverityMapping:
    """Mapping of severity levels to GitHub annotation levels."""
    high: str = "failure"
    medium: str = "warning"
    low: str = "notice"


@dataclass
class ReleaseReviewConfig:
    """Main configuration for the release review service."""
    github: GitHubConfig = field(default_factory=GitHubConfig)
    security: SecurityConfig = field(default_factory=SecurityConfig)
    store: StoreConfig = field(default_factory=StoreConfig)
    features: FeatureFlags = field(default_factory=FeatureFlags)
    severity_mapping: SeverityMapping = field(default_factory=SeverityMapping)

    @classmethod
    def from_env(cls, config_path: Optional[str] = None) -> "ReleaseReviewConfig":
        """
        Load configuration from environment variables and optional YAML file.

        Environment variables take precedence over YAML config.
        """
        config = cls()

        # Load from YAML if a path is provided or a default config file exists.
        yaml_path = config_path or os.getenv("RELEASE_REVIEWER_CONFIG")
        if yaml_path and Path(yaml_path).exists():
            config = cls._load_yaml(yaml_path, config)
        else:
            # Check default locations.
            default_paths = [
                Path("config/release-reviewer.yml"),
                Path("config/release-reviewer.yaml"),
            ]
            for path in default_paths:
                if path.exists():
                    config = cls._load_yaml(str(path), config)
                    break

        # Override with environment variables.
        config.github.token = os.getenv("GITHUB_TOKEN", config.github.token)
        config.security.docs_agent_secret = os.getenv(
            "DOCS_AGENT_SECRET", config.security.docs_agent_secret
        )
        config.store.db_path = os.getenv("DOCS_AGENT_DB", config.store.db_path)

        # Feature flags from env; unset variables keep the current value.
        config.features.post_comments = _env_flag(
            "POST_COMMENTS", config.features.post_comments
        )
        config.features.post_check_runs = _env_flag(
            "POST_CHECK_RUNS", config.features.post_check_runs
        )

        return config

    @classmethod
    def _load_yaml(cls, path: str, config: "ReleaseReviewConfig") -> "ReleaseReviewConfig":
        """
        Load configuration from a YAML file, overlaying onto *config*.

        Best-effort: any failure is logged and the passed-in config is
        returned unchanged (the service must still start with defaults).
        """
        try:
            # Imported lazily so PyYAML is only required when a YAML config
            # file is actually used.
            import yaml

            with open(path, "r") as f:
                data = yaml.safe_load(f) or {}

            # GitHub settings
            if "github" in data:
                gh = data["github"]
                config.github.check_run_name = gh.get(
                    "check_run_name", config.github.check_run_name
                )
                config.github.check_run_title = gh.get(
                    "check_run_title", config.github.check_run_title
                )
                config.github.max_annotations = gh.get(
                    "max_annotations", config.github.max_annotations
                )
                config.github.max_annotation_message_length = gh.get(
                    "max_annotation_message_length",
                    config.github.max_annotation_message_length
                )

            # Store settings
            if "store" in data:
                store = data["store"]
                config.store.stale_job_ttl_seconds = store.get(
                    "stale_job_ttl_seconds", config.store.stale_job_ttl_seconds
                )

            # Severity mapping
            if "severity_mapping" in data:
                sm = data["severity_mapping"]
                config.severity_mapping.high = sm.get("high", config.severity_mapping.high)
                config.severity_mapping.medium = sm.get("medium", config.severity_mapping.medium)
                config.severity_mapping.low = sm.get("low", config.severity_mapping.low)

            # Feature flags
            if "features" in data:
                features = data["features"]
                config.features.post_comments = features.get(
                    "post_comments", config.features.post_comments
                )
                config.features.post_check_runs = features.get(
                    "post_check_runs", config.features.post_check_runs
                )

        except Exception as e:
            # Deliberate best-effort: warn and fall back to defaults.
            logger.warning(f"Failed to load config from {path}: {e}")

        return config

    def validate(self) -> None:
        """Validate that required configuration is present.

        Raises:
            ValueError: if any required setting is missing.
        """
        errors = []

        if not self.github.token:
            errors.append("GITHUB_TOKEN is required")

        if not self.security.docs_agent_secret:
            errors.append("DOCS_AGENT_SECRET is required")

        if errors:
            raise ValueError(f"Configuration errors: {', '.join(errors)}")


# Global config instance (lazy loaded)
_config: Optional[ReleaseReviewConfig] = None


def get_config() -> ReleaseReviewConfig:
    """Get the global configuration instance."""
    global _config
    if _config is None:
        _config = ReleaseReviewConfig.from_env()
    return _config


def reset_config() -> None:
    """Reset the global configuration (useful for testing)."""
    global _config
    _config = None
"""
GitHub API client for the release review service.

Handles creating check runs, posting/updating PR comments, and fetching PR info.
"""
import logging
import time
from typing import Optional, Dict, Any, List, Tuple
from dataclasses import dataclass

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from .config import get_config
from .schemas import ReviewPayload, Annotation

logger = logging.getLogger(__name__)


class GitHubAPIError(Exception):
    """Exception for GitHub API errors."""

    def __init__(self, message: str, status_code: int = 0, retry_after: Optional[int] = None):
        super().__init__(message)
        self.status_code = status_code
        self.retry_after = retry_after


class GitHubRateLimitError(GitHubAPIError):
    """Exception for GitHub rate limit errors."""
    pass


@dataclass
class CheckRunOutput:
    """Output data for a GitHub check run."""
    title: str
    summary: str
    annotations: List[Dict[str, Any]]


class GitHubClient:
    """Client for interacting with GitHub API."""

    BASE_URL = "https://api.github.com"

    def __init__(self, token: Optional[str] = None):
        """Initialize the GitHub client.

        Args:
            token: Optional explicit token; falls back to the configured
                GITHUB_TOKEN.
        """
        config = get_config()
        self.token = token or config.github.token
        self.config = config.github

        # Set up session with retry logic for transient server errors.
        self.session = requests.Session()
        retry_strategy = Retry(
            total=2,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    def _headers(self) -> Dict[str, str]:
        """Get the headers for GitHub API requests."""
        return {
            "Authorization": f"Bearer {self.token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }

    def _handle_response(self, response: requests.Response) -> Dict[str, Any]:
        """Handle GitHub API response and raise appropriate errors.

        Raises:
            GitHubRateLimitError: on 429 or exhausted rate limit.
            GitHubAPIError: on any other non-2xx response.
        """
        # Check for rate limiting.
        if response.status_code in (403, 429):
            remaining = response.headers.get("X-RateLimit-Remaining", "unknown")
            retry_after = response.headers.get("Retry-After")

            if remaining == "0" or response.status_code == 429:
                # FIX: a malformed (non-numeric) Retry-After header previously
                # raised an unrelated ValueError here; fall back to 60s.
                try:
                    retry_seconds = int(retry_after) if retry_after else 60
                except ValueError:
                    retry_seconds = 60
                raise GitHubRateLimitError(
                    f"GitHub rate limit exceeded. Retry after {retry_seconds}s",
                    status_code=response.status_code,
                    retry_after=retry_seconds
                )

        # Check for other errors.
        if not response.ok:
            try:
                error_data = response.json()
                message = error_data.get("message", response.text)
            except Exception:
                message = response.text

            raise GitHubAPIError(
                f"GitHub API error: {message}",
                status_code=response.status_code
            )

        return response.json()

    def get_pr_head_sha(self, repo: str, pr_number: int) -> str:
        """
        Get the head SHA of a pull request.

        Args:
            repo: Repository in 'owner/repo' format
            pr_number: Pull request number

        Returns:
            The head commit SHA
        """
        url = f"{self.BASE_URL}/repos/{repo}/pulls/{pr_number}"
        logger.debug(f"Fetching PR info: {url}")

        response = self.session.get(url, headers=self._headers())
        data = self._handle_response(response)

        return data["head"]["sha"]

    def create_check_run(
        self,
        repo: str,
        head_sha: str,
        output: CheckRunOutput
    ) -> int:
        """
        Create a GitHub check run with neutral conclusion.

        Args:
            repo: Repository in 'owner/repo' format
            head_sha: The commit SHA to attach the check run to
            output: Check run output data

        Returns:
            The check run ID
        """
        url = f"{self.BASE_URL}/repos/{repo}/check-runs"

        # Truncate annotations to the GitHub-imposed maximum.
        # FIX: copy each annotation dict so the message truncation below does
        # not mutate the caller's payload in place.
        annotations = [dict(ann) for ann in output.annotations[:self.config.max_annotations]]
        if len(output.annotations) > self.config.max_annotations:
            logger.warning(
                f"Truncating annotations from {len(output.annotations)} "
                f"to {self.config.max_annotations}"
            )

        # Truncate over-long annotation messages, marking the cut with "...".
        for ann in annotations:
            if len(ann.get("message", "")) > self.config.max_annotation_message_length:
                ann["message"] = (
                    ann["message"][:self.config.max_annotation_message_length - 3] + "..."
                )

        payload = {
            "name": self.config.check_run_name,
            "head_sha": head_sha,
            "status": "completed",
            # Neutral conclusion: advisory only, never blocks merge.
            "conclusion": "neutral",
            "output": {
                "title": output.title,
                "summary": output.summary,
                "annotations": annotations
            }
        }

        logger.debug(f"Creating check run: {url}")
        response = self.session.post(url, headers=self._headers(), json=payload)
        data = self._handle_response(response)

        check_run_id = data["id"]
        logger.info(f"Created check run {check_run_id} for {repo}")
        return check_run_id

    def find_bot_comment(self, repo: str, pr_number: int) -> Optional[int]:
        """
        Find an existing bot comment on the PR.

        Looks for a comment containing the bot marker.

        Args:
            repo: Repository in 'owner/repo' format
            pr_number: Pull request number

        Returns:
            The comment ID if found, None otherwise
        """
        url = f"{self.BASE_URL}/repos/{repo}/issues/{pr_number}/comments"
        logger.debug(f"Searching for bot comment: {url}")

        # Paginate through comments.
        page = 1
        per_page = 100

        while True:
            response = self.session.get(
                url,
                headers=self._headers(),
                params={"page": page, "per_page": per_page}
            )
            comments = self._handle_response(response)

            if not comments:
                break

            for comment in comments:
                body = comment.get("body", "")
                if self.config.bot_comment_marker in body:
                    logger.debug(f"Found existing bot comment: {comment['id']}")
                    return comment["id"]

            # Short page means we reached the last page.
            if len(comments) < per_page:
                break

            page += 1

        return None

    def create_comment(self, repo: str, pr_number: int, body: str) -> int:
        """
        Create a new PR comment.

        Args:
            repo: Repository in 'owner/repo' format
            pr_number: Pull request number
            body: Comment body (markdown)

        Returns:
            The comment ID
        """
        url = f"{self.BASE_URL}/repos/{repo}/issues/{pr_number}/comments"
        logger.debug(f"Creating comment: {url}")

        response = self.session.post(
            url,
            headers=self._headers(),
            json={"body": body}
        )
        data = self._handle_response(response)

        comment_id = data["id"]
        logger.info(f"Created comment {comment_id} on PR {pr_number}")
        return comment_id

    def update_comment(self, repo: str, comment_id: int, body: str) -> int:
        """
        Update an existing PR comment.

        Args:
            repo: Repository in 'owner/repo' format
            comment_id: The comment ID to update
            body: New comment body (markdown)

        Returns:
            The comment ID
        """
        url = f"{self.BASE_URL}/repos/{repo}/issues/comments/{comment_id}"
        logger.debug(f"Updating comment: {url}")

        response = self.session.patch(
            url,
            headers=self._headers(),
            json={"body": body}
        )
        self._handle_response(response)

        logger.info(f"Updated comment {comment_id}")
        return comment_id

    def create_or_update_comment(self, repo: str, pr_number: int, body: str) -> int:
        """
        Create a new comment or update existing bot comment.

        Args:
            repo: Repository in 'owner/repo' format
            pr_number: Pull request number
            body: Comment body (markdown)

        Returns:
            The comment ID (new or existing)
        """
        existing_comment_id = self.find_bot_comment(repo, pr_number)

        if existing_comment_id:
            return self.update_comment(repo, existing_comment_id, body)
        else:
            return self.create_comment(repo, pr_number, body)


# Global client instance (lazy loaded)
_client: Optional[GitHubClient] = None


def get_github_client() -> GitHubClient:
    """Get the global GitHub client instance."""
    global _client
    if _client is None:
        _client = GitHubClient()
    return _client


def reset_github_client() -> None:
    """Reset the global GitHub client (useful for testing)."""
    global _client
    _client = None
"""
Reporter for formatting check run outputs and PR comments.

Builds the check run summary/annotations and the PR comment body
from the review payload.
"""
import logging
from collections import Counter
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass

from .schemas import ReviewPayload, Issue, Annotation
from .config import get_config
from .github_client import CheckRunOutput

logger = logging.getLogger(__name__)


@dataclass
class SeverityCounts:
    """Counts of issues by severity."""
    high: int = 0
    medium: int = 0
    low: int = 0

    def total(self) -> int:
        """Total number of counted issues across all severities."""
        return self.high + self.medium + self.low

    def summary_line(self) -> str:
        """One-line markdown summary of the counts."""
        return f"High: {self.high} · Medium: {self.medium} · Low: {self.low}"


class Reporter:
    """Formats review results for GitHub."""

    def __init__(self):
        self.config = get_config()

    def count_severities(self, issues: List[Issue]) -> SeverityCounts:
        """Count issues by severity level (unrecognized severities are ignored)."""
        # Idiom: tally with collections.Counter instead of an if/elif chain.
        tally = Counter(issue.severity for issue in issues)
        return SeverityCounts(
            high=tally.get("HIGH", 0),
            medium=tally.get("MEDIUM", 0),
            low=tally.get("LOW", 0),
        )

    def build_check_output(self, payload: ReviewPayload) -> CheckRunOutput:
        """
        Build the check run output from the review payload.

        Args:
            payload: The review payload

        Returns:
            CheckRunOutput with title, summary, and annotations
        """
        counts = self.count_severities(payload.issues)

        # Build title
        title = self.config.github.check_run_title

        # Build summary
        summary_parts = [counts.summary_line()]

        if payload.summary:
            summary_parts.append("")
            summary_parts.append(payload.summary)

        # Add up to 10 example issues in summary
        if payload.issues:
            summary_parts.append("")
            summary_parts.append("**Sample issues:**")
            for issue in payload.issues[:10]:
                location = ""
                if issue.file:
                    location = f" (`{issue.file}"
                    if issue.line:
                        location += f":{issue.line}"
                    location += "`)"
                summary_parts.append(f"- [{issue.severity}] {issue.title}{location}")

            if len(payload.issues) > 10:
                summary_parts.append(f"- ... and {len(payload.issues) - 10} more")

        summary = "\n".join(summary_parts)

        # Build annotations
        annotations = self._build_annotations(payload)

        return CheckRunOutput(
            title=title,
            summary=summary,
            annotations=annotations
        )

    def _build_annotations(self, payload: ReviewPayload) -> List[Dict[str, Any]]:
        """Build GitHub annotations from issues and explicit annotations."""
        annotations = []

        # First, add explicit annotations from the payload
        if payload.annotations:
            for ann in payload.annotations:
                annotations.append({
                    "path": ann.path,
                    "start_line": ann.start_line,
                    "end_line": ann.end_line or ann.start_line,
                    "annotation_level": ann.annotation_level,
                    "message": ann.message
                })

        # Then, create annotations from issues that have file/line info
        for issue in payload.issues:
            if issue.file and issue.line:
                # Map severity to annotation level
                level = self._severity_to_annotation_level(issue.severity)

                message = f"{issue.title}: {issue.message}"
                if issue.suggestion:
                    message += f"\n\nSuggestion: {issue.suggestion}"

                annotations.append({
                    "path": issue.file,
                    "start_line": issue.line,
                    "end_line": issue.line,
                    "annotation_level": level,
                    "message": message
                })

        # Warn if we have too many annotations (the client truncates on send).
        max_annotations = self.config.github.max_annotations
        if len(annotations) > max_annotations:
            logger.warning(
                f"Payload has {len(annotations)} annotations, "
                f"but GitHub only accepts {max_annotations}. Truncating."
            )

        return annotations

    def _severity_to_annotation_level(self, severity: str) -> str:
        """Map issue severity to GitHub annotation level ('notice' fallback)."""
        mapping = {
            "HIGH": self.config.severity_mapping.high,
            "MEDIUM": self.config.severity_mapping.medium,
            "LOW": self.config.severity_mapping.low,
        }
        return mapping.get(severity, "notice")

    def build_comment_body(self, payload: ReviewPayload) -> str:
        """
        Build the PR comment body from the review payload.

        Args:
            payload: The review payload

        Returns:
            Markdown-formatted comment body
        """
        counts = self.count_severities(payload.issues)
        marker = self.config.github.bot_comment_marker

        # Start with marker and header; the marker lets later runs find and
        # update this comment instead of posting a new one.
        lines = [
            marker,
            f"**Release Notes Advisory (AI)** — {payload.summary or 'Review complete'}",
            "",
            counts.summary_line(),
            "",
        ]

        # Group issues by severity
        high_issues = [i for i in payload.issues if i.severity == "HIGH"]
        medium_issues = [i for i in payload.issues if i.severity == "MEDIUM"]
        low_issues = [i for i in payload.issues if i.severity == "LOW"]

        # Add each severity section
        if high_issues:
            lines.extend(self._format_issue_section("HIGH", high_issues))

        if medium_issues:
            lines.extend(self._format_issue_section("MEDIUM", medium_issues))

        if low_issues:
            lines.extend(self._format_issue_section("LOW", low_issues))

        # Add links section
        if payload.links:
            lines.append("---")
            lines.append("")
            lines.append("**Links**")
            if payload.links.deploy_preview:
                lines.append(f"- [Deploy Preview]({payload.links.deploy_preview})")
            if payload.links.full_report:
                lines.append(f"- [Full JSON Report]({payload.links.full_report})")
            lines.append("")

        # Footer
        lines.append("---")
        lines.append("_Posted by docs-fast-agent — advisory only, does not block merge._")

        return "\n".join(lines)

    def _format_issue_section(
        self,
        severity: str,
        issues: List[Issue]
    ) -> List[str]:
        """Format a section of issues for the PR comment."""
        lines = [
            f"### {severity} ({len(issues)})",
            "",
        ]

        for issue in issues:
            # Title and message
            lines.append(f"- **{issue.title}:** {issue.message}")

            # File and line
            if issue.file:
                location = f"`{issue.file}"
                if issue.line:
                    location += f":{issue.line}"
                location += "`"
                lines.append(f" {location}")

            # Suggestion
            if issue.suggestion:
                lines.append(f" **Suggestion:** {issue.suggestion}")

            lines.append("")

        return lines
# Global reporter instance (lazy loaded)
# FIX: the original annotated this as `_reporter: Reporter = None`, which
# mis-types the None sentinel; use a plain assignment instead (Optional is
# not imported in this module).
_reporter = None


def get_reporter() -> Reporter:
    """Get the global reporter instance."""
    global _reporter
    if _reporter is None:
        _reporter = Reporter()
    return _reporter


def reset_reporter() -> None:
    """Reset the global reporter (useful for testing)."""
    global _reporter
    _reporter = None


# --- next file in patch: .github/scripts/release_review/reviewer.py ---

"""
AI-powered Release Notes Reviewer.

This module analyzes release notes PRs and generates review issues
using AI (OpenAI) based on the CockroachDB style guide.
"""
import base64
import os
import re
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field
from datetime import datetime

import requests
import yaml
from openai import OpenAI

from .schemas import Issue, ReviewPayload, Links

logger = logging.getLogger(__name__)

# Default timeout (seconds) for outbound HTTP requests; without it a hung
# GitHub API call would stall the whole CI job indefinitely.
_HTTP_TIMEOUT = 30

# Style guide content (loaded from file or embedded)
STYLE_GUIDE = """
# Release Note Writing Guide for CockroachDB

## Style and Tone
- Use clear, concise, and correct language
- Use the second-person imperative present tense for instructions
- Use active voice instead of passive for clarity
- Avoid using "please" when giving instructions
- Avoid hyperbolic language like "simple," "just," "easily," or "actually"
- Use contractions to simplify language, except for clear directives (use "cannot" instead of "can't")
- Avoid forward-looking language about future features

## Format and Structure
- Use title case for page titles
- Use sentence case for all headings
- Use the Oxford (serial) comma
- When listing a range of versions, use "to" not a dash (e.g., v22.1.0 to v22.1.4)

## Technical Content
- Link to relevant documentation when referencing CockroachDB features
- Use inline code format (backticks) for code, commands, or technical syntax
- Include GitHub issue or PR numbers for reference

## Version References
- Format as vXX.X.X (e.g., v21.1.8) with lowercase 'v'

## Technical Terminology
- Use "CockroachDB" (proper capitalization)
- Use "PostgreSQL" (not "Postgres")
- Use inclusive terminology (allowlist/denylist, main/primary)

## Release Note Requirements
- Clearly describe what changed or was added
- Mention any impact on users, including breaking changes
- Be factual and technical without unnecessary jargon
- Include GitHub issue or PR numbers for reference
"""


@dataclass
class ParsedYAMLRelease:
    """Parsed release entry from releases.yml."""
    release_name: str
    major_version: str
    release_date: str
    release_type: str
    go_version: Optional[str] = None
    sha: Optional[str] = None
    previous_release: Optional[str] = None
    raw: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ParsedMarkdownRelease:
    """Parsed release notes from markdown file."""
    version: str
    release_date: str
    sections: Dict[str, List[str]]  # section_name -> list of notes
    pr_references: List[str]  # List of PR numbers referenced
    link_definitions: Dict[str, str]  # PR number -> URL
    raw_content: str = ""


@dataclass
class ReviewResult:
    """Result of the review process."""
    issues: List[Issue]
    summary: str
    yaml_data: Optional[ParsedYAMLRelease] = None
    markdown_data: Optional[ParsedMarkdownRelease] = None


class ReleaseNotesReviewer:
    """AI-powered reviewer for CockroachDB release notes PRs."""

    # Required YAML fields
    REQUIRED_YAML_FIELDS = [
        "release_name", "major_version", "release_date", "release_type"
    ]

    # Valid release types
    VALID_RELEASE_TYPES = [
        "Production", "Testing", "Preview", "Beta", "Alpha", "Withdrawn"
    ]

    # Valid section headers in markdown
    VALID_SECTIONS = [
        "backward-incompatible-changes", "security-updates", "sql-language-changes",
        "operational-changes", "command-line-changes", "db-console-changes",
        "bug-fixes", "performance-improvements", "contributors", "doc-updates",
        "enterprise-edition-changes", "general-changes"
    ]

    def __init__(self, github_token: Optional[str] = None, openai_api_key: Optional[str] = None):
        """Initialize the reviewer.

        Args:
            github_token: GitHub token; falls back to GITHUB_TOKEN env var.
            openai_api_key: OpenAI key; falls back to OPENAI_API_KEY env var.
                When absent, self.openai_client stays None.
        """
        self.github_token = github_token or os.getenv("GITHUB_TOKEN")
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        self.openai_client = None
        if self.openai_api_key:
            self.openai_client = OpenAI(api_key=self.openai_api_key)

    def _github_headers(self) -> Dict[str, str]:
        """Get GitHub API headers."""
        return {
            "Authorization": f"Bearer {self.github_token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }

    def fetch_pr_files(self, repo: str, pr_number: int) -> List[Dict[str, Any]]:
        """Fetch the files changed in a PR."""
        url = f"https://api.github.com/repos/{repo}/pulls/{pr_number}/files"
        # FIX: add a timeout so a stalled API call cannot hang the job.
        response = requests.get(url, headers=self._github_headers(), timeout=_HTTP_TIMEOUT)
        response.raise_for_status()
        return response.json()

    def fetch_file_content(self, repo: str, path: str, ref: str) -> str:
        """Fetch file content from GitHub, decoding base64 payloads."""
        url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={ref}"
        response = requests.get(url, headers=self._github_headers(), timeout=_HTTP_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8")
        return data.get("content", "")

    def fetch_pr_diff(self, repo: str, pr_number: int) -> str:
        """Fetch the diff of a PR (raw unified-diff text)."""
        url = f"https://api.github.com/repos/{repo}/pulls/{pr_number}"
        headers = self._github_headers()
        headers["Accept"] = "application/vnd.github.diff"
        response = requests.get(url, headers=headers, timeout=_HTTP_TIMEOUT)
        response.raise_for_status()
        return response.text

    def check_pr_exists(self, pr_number: str) -> bool:
        """Check if a PR exists in cockroachdb/cockroach."""
        url = f"https://api.github.com/repos/cockroachdb/cockroach/pulls/{pr_number}"
        response = requests.get(url, headers=self._github_headers(), timeout=_HTTP_TIMEOUT)
        return response.status_code == 200

    def parse_yaml_diff(self, diff: str) -> Optional[ParsedYAMLRelease]:
        """Parse the YAML additions from a diff.

        Returns None when no added releases.yml lines are found or the
        additions are not valid YAML.
        """
        # Extract added lines from releases.yml
        yaml_section = False
        yaml_lines = []

        for line in diff.split("\n"):
            if "releases.yml" in line:
                yaml_section = True
                continue
            if yaml_section:
                if line.startswith("diff --git"):
                    break
                if line.startswith("+") and not line.startswith("+++"):
                    yaml_lines.append(line[1:])  # Remove the '+' prefix

        if not yaml_lines:
            return None

        # Parse the YAML
        yaml_content = "\n".join(yaml_lines)
        try:
            # Handle the case where we're adding to a list
            if yaml_content.strip().startswith("-"):
                data = yaml.safe_load(yaml_content)
                if isinstance(data, list) and len(data) > 0:
                    release = data[0]
                else:
                    release = data
            else:
                release = yaml.safe_load(yaml_content)

            if not release:
                return None

            return ParsedYAMLRelease(
                release_name=release.get("release_name", ""),
                major_version=release.get("major_version", ""),
                release_date=release.get("release_date", ""),
                release_type=release.get("release_type", ""),
                go_version=release.get("go_version"),
                sha=release.get("sha"),
                previous_release=release.get("previous_release"),
                raw=release
            )
        except yaml.YAMLError as e:
            logger.warning(f"Failed to parse YAML: {e}")
            return None

    def parse_markdown_diff(self, diff: str) -> Optional[ParsedMarkdownRelease]:
        """Parse the markdown additions from a diff."""
        # Extract added lines from .md file
        md_section = False
        md_lines = []

        for line in diff.split("\n"):
            if ".md" in line and "diff --git" in line:
                md_section = True
                continue
            if md_section:
                if line.startswith("diff --git"):
                    break
                if line.startswith("+") and not line.startswith("+++"):
                    md_lines.append(line[1:])

        if not md_lines:
            return None

        content = "\n".join(md_lines)

        # Extract version from header
        version_match = re.search(r"## (v[\d.]+(?:-[\w.]+)?)", content)
        version = version_match.group(1) if version_match else ""

        # Extract release date
        date_match = re.search(r"Release Date:\s*(.+)", content)
        release_date = date_match.group(1).strip() if date_match else ""

        # Extract sections
        sections: Dict[str, List[str]] = {}
        current_section = None
        current_notes = []

        for line in md_lines:
            # Check for section header.
            # NOTE(review): the original regex was garbled out of the source
            # patch (angle-bracket markup stripped). Reconstructed assuming
            # CockroachDB release-note section anchors such as
            # <h3 id="v24-1-0-bug-fixes">Bug fixes</h3> — confirm upstream.
            section_match = re.search(r'<h3 id="[^"]*?-([a-z][a-z-]*)">', line)
            if section_match:
                if current_section and current_notes:
                    sections[current_section] = current_notes
                current_section = section_match.group(1)
                current_notes = []
            elif current_section and line.strip().startswith("-"):
                current_notes.append(line.strip())
            elif current_section and line.strip() and not line.startswith("[#"):
                current_notes.append(line.strip())

        if current_section and current_notes:
            sections[current_section] = current_notes

        # Extract PR references
        pr_refs = re.findall(r"\[#(\d+)\]", content)

        # Extract link definitions
        link_defs = {}
        for match in re.finditer(r"\[#(\d+)\]:\s*(https://[^\s]+)", content):
            link_defs[match.group(1)] = match.group(2)

        return ParsedMarkdownRelease(
            version=version,
            release_date=release_date,
            sections=sections,
            pr_references=pr_refs,
            link_definitions=link_defs,
            raw_content=content
        )

    def check_schema_format(self, yaml_data: Optional[ParsedYAMLRelease],
                            md_data: Optional[ParsedMarkdownRelease]) -> List[Issue]:
        """
        HIGH severity: Schema/Format Checks
        - Required YAML fields present
        - Valid date formats
        - Correct version patterns
        - Valid category tags
        """
        issues = []

        if yaml_data:
            # Check required fields.
            # FIX: loop variable renamed from `field` to avoid shadowing the
            # dataclasses.field import.
            for field_name in self.REQUIRED_YAML_FIELDS:
                value = getattr(yaml_data, field_name, None) or yaml_data.raw.get(field_name)
                if not value:
                    issues.append(Issue(
                        severity="HIGH",
                        title=f"Missing required YAML field: {field_name}",
                        message=f"The '{field_name}' field is required in releases.yml but is missing or empty.",
                        file="src/current/_data/releases.yml",
                        suggestion=f"Add the '{field_name}' field with an appropriate value."
                    ))

            # Check release_type validity
            if yaml_data.release_type and yaml_data.release_type not in self.VALID_RELEASE_TYPES:
                issues.append(Issue(
                    severity="HIGH",
                    title="Invalid release_type",
                    message=f"release_type '{yaml_data.release_type}' is not a valid type. "
                            f"Valid types are: {', '.join(self.VALID_RELEASE_TYPES)}",
                    file="src/current/_data/releases.yml",
                    suggestion=f"Use one of: {', '.join(self.VALID_RELEASE_TYPES)}"
                ))

            # Check date format (YYYY-MM-DD)
            if yaml_data.release_date:
                try:
                    datetime.strptime(str(yaml_data.release_date), "%Y-%m-%d")
                except ValueError:
                    issues.append(Issue(
                        severity="HIGH",
                        title="Invalid date format",
                        message=f"release_date '{yaml_data.release_date}' is not in YYYY-MM-DD format.",
                        file="src/current/_data/releases.yml",
                        suggestion="Use format: YYYY-MM-DD (e.g., 2026-03-25)"
                    ))

            # Check version pattern (vXX.X.X or vXX.X.X-suffix)
            if yaml_data.release_name:
                if not re.match(r"^v\d+\.\d+\.\d+(-[\w.]+)?$", yaml_data.release_name):
                    issues.append(Issue(
                        severity="HIGH",
                        title="Invalid version format",
                        message=f"release_name '{yaml_data.release_name}' doesn't match expected pattern vXX.X.X[-suffix].",
                        file="src/current/_data/releases.yml",
                        suggestion="Use format: vXX.X.X or vXX.X.X-beta.1 (lowercase 'v')"
                    ))

        if md_data:
            # Check version matches in markdown
            if yaml_data and md_data.version and yaml_data.release_name:
                if md_data.version != yaml_data.release_name:
                    issues.append(Issue(
                        severity="HIGH",
                        title="Version mismatch between YAML and Markdown",
                        message=f"YAML has '{yaml_data.release_name}' but Markdown has '{md_data.version}'.",
                        file="src/current/_includes/releases/",
                        suggestion="Ensure version numbers match in both files."
                    ))

        return issues
+ )) + + return issues + + def check_technical_accuracy(self, yaml_data: Optional[ParsedYAMLRelease], + md_data: Optional[ParsedMarkdownRelease]) -> List[Issue]: + """ + HIGH severity: Technical Accuracy Checks + - Version numbers match + - Referenced PRs exist + - Backport references valid + - No broken internal links + """ + issues = [] + + if md_data: + # Check that all PR references have link definitions + referenced_prs = set(md_data.pr_references) + defined_prs = set(md_data.link_definitions.keys()) + + missing_links = referenced_prs - defined_prs + for pr in missing_links: + issues.append(Issue( + severity="HIGH", + title=f"Missing link definition for PR #{pr}", + message=f"PR #{pr} is referenced in the text but has no link definition at the bottom.", + file="src/current/_includes/releases/", + suggestion=f"Add: [#{pr}]: https://github.com/cockroachdb/cockroach/pull/{pr}" + )) + + # Check a sample of PRs to see if they exist (limit to avoid rate limits) + prs_to_check = list(referenced_prs)[:5] + for pr in prs_to_check: + if not self.check_pr_exists(pr): + issues.append(Issue( + severity="HIGH", + title=f"Referenced PR #{pr} does not exist", + message=f"PR #{pr} was referenced but could not be found in cockroachdb/cockroach.", + file="src/current/_includes/releases/", + suggestion="Verify the PR number is correct." + )) + + # Check for orphaned link definitions (defined but not referenced) + orphaned_links = defined_prs - referenced_prs + for pr in orphaned_links: + issues.append(Issue( + severity="MEDIUM", + title=f"Orphaned link definition for PR #{pr}", + message=f"Link definition for #{pr} exists but is not referenced in the text.", + file="src/current/_includes/releases/", + suggestion="Remove unused link definition or add reference in text." 
+ )) + + return issues + + def check_content_quality_with_ai(self, md_data: Optional[ParsedMarkdownRelease]) -> List[Issue]: + """ + MEDIUM severity: Content Quality Checks using AI + - Release note text not empty + - No placeholder text + - Action-oriented (starts with verb) + - Appropriate length + - No duplicates + - Style guide compliance + """ + issues = [] + + if not md_data or not self.openai_client: + return issues + + # Prepare the content for AI review + content_to_review = md_data.raw_content + + prompt = f"""You are a technical writing reviewer for CockroachDB release notes. + +Review the following release notes content and identify issues based on these criteria: + +MEDIUM SEVERITY ISSUES: +1. Empty or placeholder text (TODO, TBD, FIXME, placeholder, lorem ipsum) +2. Notes that don't start with an action verb (should describe what changed) +3. Notes that are too short (<10 words) or too long (>100 words for a single point) +4. Duplicate or very similar release notes +5. Style guide violations: + - Using passive voice instead of active voice + - Using "please" in instructions + - Using hyperbolic words like "simple", "just", "easily", "actually" + - Incorrect capitalization (should be "CockroachDB" not "cockroachdb") + - Using "Postgres" instead of "PostgreSQL" + - Not using inclusive language (should use allowlist/denylist, main/primary) + +STYLE GUIDE: +{STYLE_GUIDE} + +RELEASE NOTES CONTENT: +{content_to_review} + +Respond with a JSON array of issues found. Each issue should have: +- "title": Short title of the issue +- "message": Detailed explanation +- "suggestion": How to fix it +- "line_hint": A snippet of the problematic text (for locating it) + +If no issues found, return an empty array: [] + +Return ONLY valid JSON, no other text.""" + + try: + response = self.openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": "You are a technical writing reviewer. 
Respond only with valid JSON."}, + {"role": "user", "content": prompt} + ], + temperature=0.3, + max_tokens=2000 + ) + + result_text = response.choices[0].message.content.strip() + + # Clean up the response (remove markdown code blocks if present) + if result_text.startswith("```"): + result_text = re.sub(r"^```(?:json)?\n?", "", result_text) + result_text = re.sub(r"\n?```$", "", result_text) + + ai_issues = json.loads(result_text) + + for ai_issue in ai_issues: + issues.append(Issue( + severity="MEDIUM", + title=ai_issue.get("title", "Content quality issue"), + message=ai_issue.get("message", ""), + suggestion=ai_issue.get("suggestion", ""), + metadata={"line_hint": ai_issue.get("line_hint", "")} + )) + + except Exception as e: + logger.error(f"AI content review failed: {e}") + # Don't fail the whole review if AI fails + + return issues + + def review_pr(self, repo: str, pr_number: int, commit_sha: Optional[str] = None) -> ReviewResult: + """ + Review a release notes PR and generate issues. 
+ + Args: + repo: Repository in 'owner/repo' format + pr_number: Pull request number + commit_sha: Optional commit SHA (fetched if not provided) + + Returns: + ReviewResult with issues and parsed data + """ + issues = [] + + # Fetch PR diff + logger.info(f"Fetching PR diff for {repo}#{pr_number}") + diff = self.fetch_pr_diff(repo, pr_number) + + # Parse YAML and Markdown from diff + yaml_data = self.parse_yaml_diff(diff) + md_data = self.parse_markdown_diff(diff) + + logger.info(f"Parsed YAML: {yaml_data is not None}, Markdown: {md_data is not None}") + + # Run schema/format checks (HIGH) + schema_issues = self.check_schema_format(yaml_data, md_data) + issues.extend(schema_issues) + logger.info(f"Schema/format issues: {len(schema_issues)}") + + # Run technical accuracy checks (HIGH) + accuracy_issues = self.check_technical_accuracy(yaml_data, md_data) + issues.extend(accuracy_issues) + logger.info(f"Technical accuracy issues: {len(accuracy_issues)}") + + # Run content quality checks with AI (MEDIUM) + if md_data: + quality_issues = self.check_content_quality_with_ai(md_data) + issues.extend(quality_issues) + logger.info(f"Content quality issues: {len(quality_issues)}") + + # Generate summary + high_count = sum(1 for i in issues if i.severity == "HIGH") + medium_count = sum(1 for i in issues if i.severity == "MEDIUM") + low_count = sum(1 for i in issues if i.severity == "LOW") + + if not issues: + summary = "No issues found. Release notes look good!" 
+ else: + summary = f"Found {len(issues)} issue(s): {high_count} HIGH, {medium_count} MEDIUM, {low_count} LOW" + + return ReviewResult( + issues=issues, + summary=summary, + yaml_data=yaml_data, + markdown_data=md_data + ) + + def create_review_payload(self, repo: str, pr_number: int, + commit_sha: str, result: ReviewResult) -> ReviewPayload: + """Create a ReviewPayload from the review result.""" + return ReviewPayload( + source="ai-release-notes-reviewer", + repo=repo, + pr_number=pr_number, + commit_sha=commit_sha, + generated_at=datetime.utcnow().isoformat() + "Z", + summary=result.summary, + issues=result.issues, + links=Links( + full_report=None # Could add link to stored full report + ) + ) + + +def get_reviewer() -> ReleaseNotesReviewer: + """Get a configured reviewer instance.""" + return ReleaseNotesReviewer() diff --git a/.github/scripts/release_review/schemas.py b/.github/scripts/release_review/schemas.py new file mode 100644 index 00000000000..b4958c7d656 --- /dev/null +++ b/.github/scripts/release_review/schemas.py @@ -0,0 +1,127 @@ +""" +Pydantic models for the /api/v1/release-review-results endpoint. + +This module defines the request and response schemas for receiving +AI-generated release-notes reviews and posting them to GitHub. 
"""
Pydantic models for the /api/v1/release-review-results endpoint.

This module defines the request and response schemas for receiving
AI-generated release-notes reviews and posting them to GitHub.
"""
import re
from typing import List, Optional, Dict, Any, Literal
from pydantic import BaseModel, Field, HttpUrl, field_validator
from datetime import datetime


# Patterns used by the ReviewPayload validators, compiled once at import time.
_REPO_PATTERN = re.compile(r"^[^/]+/[^/]+$")
_SHA_PATTERN = re.compile(r"^[0-9a-f]{40}$")


# -----------------------------------------------------------------------------
# Request Models
# -----------------------------------------------------------------------------

class Issue(BaseModel):
    """One finding produced by the AI review of a release-notes PR."""
    severity: Literal["HIGH", "MEDIUM", "LOW"] = Field(
        ..., description="Severity level of the issue"
    )
    title: str = Field(..., description="Short title of the issue")
    message: str = Field(..., description="Detailed message describing the issue")
    file: Optional[str] = Field(None, description="File path where the issue was found")
    line: Optional[int] = Field(None, description="Line number in the file")
    suggestion: Optional[str] = Field(None, description="Suggested fix for the issue")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")


class Annotation(BaseModel):
    """An annotation attached to a GitHub check run.

    NOTE(review): the GitHub Checks API requires ``end_line`` on annotations;
    confirm the client defaults it to ``start_line`` when it is None here.
    """
    path: str = Field(..., description="File path for the annotation")
    start_line: int = Field(..., ge=1, description="Starting line number")
    end_line: Optional[int] = Field(None, description="Ending line number")
    annotation_level: Literal["notice", "warning", "failure"] = Field(
        ..., description="Annotation severity level"
    )
    message: str = Field(..., description="Annotation message")


class Links(BaseModel):
    """Optional URLs pointing at resources related to the review."""
    deploy_preview: Optional[HttpUrl] = Field(None, description="Deploy preview URL")
    full_report: Optional[HttpUrl] = Field(None, description="Full JSON report URL")


class ReviewPayload(BaseModel):
    """Inbound payload carrying one complete review for one PR/commit."""
    source: str = Field(..., description="Source identifier of the review system")
    repo: str = Field(..., description="Repository in 'owner/repo' format")
    pr_number: int = Field(..., ge=1, description="Pull request number")
    commit_sha: Optional[str] = Field(None, description="Git commit SHA (40 hex chars)")
    generated_at: str = Field(..., description="ISO8601 datetime when review was generated")
    summary: Optional[str] = Field(None, description="Short summary of the review")
    issues: List[Issue] = Field(default_factory=list, description="List of issues found")
    annotations: Optional[List[Annotation]] = Field(
        default_factory=list, description="GitHub annotations for the check run"
    )
    links: Optional[Links] = Field(None, description="Related links")
    idempotency_key: Optional[str] = Field(
        None, description="Idempotency key from payload (fallback)"
    )

    @field_validator("repo")
    @classmethod
    def validate_repo_format(cls, v: str) -> str:
        """Reject anything that is not exactly one 'owner/repo' pair."""
        if _REPO_PATTERN.match(v) is None:
            raise ValueError("repo must be in 'owner/repo' format")
        return v

    @field_validator("commit_sha")
    @classmethod
    def validate_commit_sha(cls, v: Optional[str]) -> Optional[str]:
        """Normalise to lowercase and require a full 40-char hex SHA."""
        if v is None:
            return None
        normalized = v.lower()
        if _SHA_PATTERN.match(normalized) is None:
            raise ValueError("commit_sha must be 40 hexadecimal characters")
        return normalized

    @field_validator("generated_at")
    @classmethod
    def validate_generated_at(cls, v: str) -> str:
        """Accept any ISO8601 datetime, treating a trailing 'Z' as UTC."""
        try:
            datetime.fromisoformat(v.replace("Z", "+00:00"))
        except ValueError:
            raise ValueError("generated_at must be a valid ISO8601 datetime")
        return v


# -----------------------------------------------------------------------------
# Response Models
# -----------------------------------------------------------------------------

class ReviewResponse(BaseModel):
    """Success/failure envelope returned by the results endpoint."""
    status: Literal["ok", "error"] = Field(..., description="Response status")
    check_run_id: Optional[int] = Field(None, description="GitHub check run ID")
    comment_id: Optional[int] = Field(None, description="GitHub PR comment ID")
    message: str = Field(..., description="Human-readable message")


class ErrorResponse(BaseModel):
    """Envelope for error replies."""
    status: Literal["error"] = "error"
    message: str = Field(..., description="Error message")
    detail: Optional[str] = Field(None, description="Detailed error information")


# -----------------------------------------------------------------------------
# Internal Models
# -----------------------------------------------------------------------------

class Job(BaseModel):
    """Row-shaped record a queued review job takes in the store."""
    id: Optional[int] = None
    idempotency_key: str
    repo: str
    pr_number: int
    created_at: datetime
    processed_at: Optional[datetime] = None
    check_run_id: Optional[int] = None
    comment_id: Optional[int] = None
    payload_json: str
    status: Literal["pending", "processing", "completed", "failed"] = "pending"
    error_message: Optional[str] = None
#!/usr/bin/env python3
"""
Standalone runner for the release notes AI reviewer.

Invoked from the GitHub Action workflow. Reads GITHUB_REPOSITORY, PR_NUMBER
and COMMIT_SHA from the environment, runs the review, and publishes the
results as a check run and/or PR comment according to the feature flags.
"""
import os
import sys
import logging

# Make the sibling `release_review` package importable when this file is run
# as a plain script rather than as a module.
sys.path.insert(0, os.path.dirname(__file__))

from release_review.reviewer import ReleaseNotesReviewer
from release_review.reporter import Reporter
from release_review.github_client import GitHubClient
from release_review.config import ReleaseReviewConfig


def main():
    """Run the AI release-notes review for one PR and publish the results."""
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    logger = logging.getLogger(__name__)

    # All three variables are supplied by the workflow's `env:` block.
    repo = os.environ["GITHUB_REPOSITORY"]
    pr_number = int(os.environ["PR_NUMBER"])
    commit_sha = os.environ["COMMIT_SHA"]

    # NOTE(review): `config` is only consulted for feature flags below; the
    # reviewer/reporter/client are constructed without it and presumably load
    # their own config internally — confirm, or pass `config` through.
    config = ReleaseReviewConfig.from_env()
    reviewer = ReleaseNotesReviewer()
    reporter = Reporter()
    github = GitHubClient()

    logger.info(f"Reviewing PR #{pr_number} in {repo}")
    try:
        result = reviewer.review_pr(repo, pr_number, commit_sha)
        payload = reviewer.create_review_payload(repo, pr_number, commit_sha, result)

        if config.features.post_check_runs:
            check_output = reporter.build_check_output(payload)
            check_run_id = github.create_check_run(repo, commit_sha, check_output)
            logger.info(f"Created check run: {check_run_id}")

        if config.features.post_comments:
            comment_body = reporter.build_comment_body(payload)
            comment_id = github.create_or_update_comment(repo, pr_number, comment_body)
            logger.info(f"Posted comment: {comment_id}")

        logger.info(f"Review complete: {result.summary}")
    except Exception:
        # FIX(review): this review is advisory and must never block a merge
        # (per the patch description). Previously any OpenAI/GitHub failure
        # propagated and failed the workflow step; log it and exit green.
        logger.exception("Release notes review failed; exiting cleanly "
                         "because this check is advisory only.")


if __name__ == "__main__":
    main()
# Advisory AI review of release-notes changes. Posts a neutral check run and
# an updatable PR comment; it is not intended to block merges.
# (Reconstructs the complete workflow file, including the header the patch
# places immediately above this hunk.)
name: Release Notes AI Review

on:
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    branches: [main]
    paths:
      - 'src/current/_data/releases.yml'
      - 'src/current/_includes/releases/**'

permissions:
  pull-requests: write
  checks: write
  contents: read

jobs:
  ai-review:
    name: AI Release Notes Review
    runs-on: ubuntu-latest
    # Bound the job so a hung OpenAI/GitHub call cannot burn CI minutes.
    timeout-minutes: 10
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        # NOTE(review): versions are unpinned; consider pinning (or using a
        # requirements file) for reproducible runs.
        run: pip install requests pydantic PyYAML openai

      - name: Run AI release notes review
        # NOTE(review): repository secrets are not exposed to pull_request
        # runs from forks, so OPENAI_API_KEY will be empty there — the
        # reviewer must degrade gracefully. TODO confirm fork-PR behavior.
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
        run: python .github/scripts/run_release_review.py