From 0d6a449dd141d14d88e56caa74ed9f2e6932ee90 Mon Sep 17 00:00:00 2001
From: Sora Morimoto
Date: Mon, 9 Feb 2026 11:38:38 +0900
Subject: [PATCH] Add AI-powered duplicate issue detection system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a three-part system for detecting and managing duplicate GitHub issues:

1. Detection workflow (dedupe-issues.yml): Triggers on newly opened issues or
   manual dispatch. Uses actions/ai-inference with GitHub MCP to find up to 3
   duplicates via structured JSON Schema output, then posts a comment via
   comment-on-duplicates.sh announcing a 3-day grace period.

2. Auto-close workflow (auto-close-duplicates.yml): Runs daily and closes
   issues that were flagged as duplicates more than 3 days ago, provided there
   has been no human activity since the bot comment and the issue author has
   not opted out with a 👎 reaction.

3. Backfill workflow (backfill-duplicate-detection.yml): Manually dispatched
   to trigger duplicate detection on existing open issues that have not been
   checked yet.

Shared GitHub API helpers (pagination, typed interfaces, authentication) live
in scripts/lib/github.ts. Bot comments are recognized by an HTML comment
sentinel (<!-- dedupe-bot-comment -->) rather than fragile login heuristics
alone. All actions are pinned to commit SHAs.
---
 .github/prompts/dedupe.prompt.yml            |  35 ++++
 .github/workflows/auto-close-duplicates.yml  |  29 +++
 .../backfill-duplicate-detection.yml         |  43 +++++
 .github/workflows/dedupe-issues.yml          |  51 ++++++
 scripts/auto-close-duplicates.ts             | 167 ++++++++++++++++++
 scripts/backfill-duplicate-detection.ts      | 107 +++++++++++
 scripts/comment-on-duplicates.sh             | 103 +++++++++++
 scripts/lib/github.ts                        | 104 +++++++++++
 8 files changed, 639 insertions(+)
 create mode 100644 .github/prompts/dedupe.prompt.yml
 create mode 100644 .github/workflows/auto-close-duplicates.yml
 create mode 100644 .github/workflows/backfill-duplicate-detection.yml
 create mode 100644 .github/workflows/dedupe-issues.yml
 create mode 100755 scripts/auto-close-duplicates.ts
 create mode 100755 scripts/backfill-duplicate-detection.ts
 create mode 100755 scripts/comment-on-duplicates.sh
 create mode 100644 scripts/lib/github.ts

diff --git a/.github/prompts/dedupe.prompt.yml b/.github/prompts/dedupe.prompt.yml
new file mode 100644
index 00000000..c37db53c
--- /dev/null
+++ b/.github/prompts/dedupe.prompt.yml
@@ -0,0 +1,35 @@
+messages:
+  - role: system
+    content: You are a duplicate issue detector. You have access to GitHub MCP tools to read and search issues.
+  - role: user
+    content: |
+      Find up to 3 likely duplicate issues for issue #{{issue_number}} in the acacode/swagger-typescript-api repository.
+
+      To do this, follow these steps precisely:
+
+      1. Read issue #{{issue_number}} including its comments. Check if the issue (a) is closed, (b) does not need to be deduped (e.g. because it is broad product feedback without a specific solution, or positive feedback), or (c) already has a duplicates comment containing `<!-- dedupe-bot-comment -->`. If so, do not proceed — return an empty duplicates array.
+      2. Summarize the issue: the core problem, its symptoms, and the affected features.
+      3. Search for duplicates of this issue using diverse keywords and search approaches based on the summary. Try at least 5 different search queries to maximize coverage.
+      4. Filter out false positives that are likely not actually duplicates of the original issue. If there are no duplicates remaining, return an empty duplicates array.
+      5. Return the remaining duplicate issue numbers (up to 3), ranked by confidence (highest first).
+model: openai/gpt-4o +responseFormat: json_schema +jsonSchema: |- + { + "name": "duplicate_detection_result", + "strict": true, + "schema": { + "type": "object", + "properties": { + "duplicates": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Issue numbers of potential duplicates, ranked by confidence (highest first). Empty array if no duplicates found or issue should be skipped." + } + }, + "additionalProperties": false, + "required": ["duplicates"] + } + } diff --git a/.github/workflows/auto-close-duplicates.yml b/.github/workflows/auto-close-duplicates.yml new file mode 100644 index 00000000..ccd99e39 --- /dev/null +++ b/.github/workflows/auto-close-duplicates.yml @@ -0,0 +1,29 @@ +name: Auto-Close Duplicate Issues + +on: + schedule: + - cron: 0 9 * * * + workflow_dispatch: + +permissions: + contents: read + issues: write + +jobs: + auto-close: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout tree + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set-up Mise + uses: jdx/mise-action@6d1e696aa24c1aa1bcc1adea0212707c71ab78a8 # v3.6.1 + with: + cache: false + - name: Run auto-close script + run: bun run scripts/auto-close-duplicates.ts + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }} + GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }} diff --git a/.github/workflows/backfill-duplicate-detection.yml b/.github/workflows/backfill-duplicate-detection.yml new file mode 100644 index 00000000..2f2f76e8 --- /dev/null +++ b/.github/workflows/backfill-duplicate-detection.yml @@ -0,0 +1,43 @@ +name: Backfill Duplicate Detection + +on: + workflow_dispatch: + inputs: + days_back: + description: Number of days to look back for issues + required: false + default: "90" + dry_run: + description: Run in dry-run mode (only log, do not trigger workflows) + required: false + default: "true" + type: choice + options: + - "true" + - "false" + +permissions: + contents: read + issues: read + actions: write + +jobs: + backfill: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout tree + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set-up Mise + uses: jdx/mise-action@6d1e696aa24c1aa1bcc1adea0212707c71ab78a8 # v3.6.1 + with: + cache: false + - name: Run backfill script + run: ./scripts/backfill-duplicate-detection.ts + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }} + GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }} + DAYS_BACK: ${{ github.event.inputs.days_back }} + DRY_RUN: ${{ github.event.inputs.dry_run }} diff --git a/.github/workflows/dedupe-issues.yml b/.github/workflows/dedupe-issues.yml new file mode 100644 index 00000000..ef6b41a4 --- /dev/null +++ b/.github/workflows/dedupe-issues.yml @@ -0,0 +1,51 @@ +name: Issue Dedupe + +on: + issues: + types: + - opened + workflow_dispatch: + inputs: + issue_number: + description: Issue number to check for duplicates + required: true + type: number + +permissions: + contents: read + issues: write + models: read + +jobs: + dedupe: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout tree + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: AI duplicate detection + uses: actions/ai-inference@a380166897b5408b8fb7dddd148142794cb5624a # v2.0.6 + id: ai + with: + prompt-file: .github/prompts/dedupe.prompt.yml + input: | + issue_number: ${{ 
github.event.issue.number || inputs.issue_number }}
+          enable-github-mcp: true
+
+      - name: Post comment if duplicates found
+        run: |
+          DUPLICATES=$(echo "$AI_RESPONSE" | jq -r '.duplicates | map(tostring) | join(" ")')
+
+          if [ -z "$DUPLICATES" ] || [ "$DUPLICATES" = "null" ]; then
+            echo "No duplicates found"
+            exit 0
+          fi
+
+          echo "Duplicates found: $DUPLICATES"
+          ./scripts/comment-on-duplicates.sh --base-issue "$ISSUE_NUMBER" --potential-duplicates $DUPLICATES
+        env:
+          AI_RESPONSE: ${{ steps.ai.outputs.response }}
+          GH_TOKEN: ${{ github.token }}
+          ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }}
diff --git a/scripts/auto-close-duplicates.ts b/scripts/auto-close-duplicates.ts
new file mode 100755
index 00000000..17d6ba53
--- /dev/null
+++ b/scripts/auto-close-duplicates.ts
@@ -0,0 +1,167 @@
+#!/usr/bin/env bun
+
+import { consola } from "consola";
+import type {
+  GitHubComment,
+  GitHubIssue,
+  GitHubReaction,
+} from "./lib/github.js";
+import {
+  API_BASE,
+  fetchAllPages,
+  fetchGitHub,
+  GITHUB_REPOSITORY_NAME,
+  GITHUB_REPOSITORY_OWNER,
+  getIssueComments,
+} from "./lib/github.js";
+
+const THREE_DAYS_MS = 3 * 24 * 60 * 60 * 1000;
+
+async function getOpenIssuesOlderThan3Days(): Promise<GitHubIssue[]> {
+  const threeDaysAgo = new Date(Date.now() - THREE_DAYS_MS);
+  const url = `${API_BASE}/issues?state=open&per_page=100&sort=created&direction=asc`;
+
+  const issues = await fetchAllPages<GitHubIssue>(url);
+
+  return issues.filter((issue) => {
+    if (issue.pull_request) return false;
+    return new Date(issue.created_at) < threeDaysAgo;
+  });
+}
+
+async function getCommentReactions(
+  commentId: number,
+): Promise<GitHubReaction[]> {
+  const url = `${API_BASE}/issues/comments/${commentId}/reactions?per_page=100`;
+  return fetchAllPages<GitHubReaction>(url);
+}
+
+async function closeIssue(issueNumber: number, reason: string): Promise<void> {
+  await fetchGitHub(`${API_BASE}/issues/${issueNumber}/labels`, {
+    method: "POST",
+    body: JSON.stringify({ labels: ["duplicate"] }),
+  });
+
+  await fetchGitHub(`${API_BASE}/issues/${issueNumber}`, {
+    method: "PATCH",
+    body: JSON.stringify({
+      state: "closed",
+      state_reason: "not_planned",
+    }),
+  });
+
+  await fetchGitHub(`${API_BASE}/issues/${issueNumber}/comments`, {
+    method: "POST",
+    body: JSON.stringify({
+      body: reason,
+    }),
+  });
+}
+
+async function hasActivityAfterComment(
+  issue: GitHubIssue,
+  botCommentDate: Date,
+): Promise<boolean> {
+  const comments = await getIssueComments(issue.number);
+
+  const laterComments = comments.filter((comment) => {
+    if (comment.user?.login.endsWith("[bot]")) return false;
+    const commentDate = new Date(comment.created_at);
+    return commentDate > botCommentDate;
+  });
+
+  return laterComments.length > 0;
+}
+
+async function hasCreatorThumbsDown(
+  issue: GitHubIssue,
+  botComment: GitHubComment,
+): Promise<boolean> {
+  if (!issue.user) {
+    return false;
+  }
+
+  const reactions = await getCommentReactions(botComment.id);
+
+  return reactions.some(
+    (reaction) =>
+      reaction.content === "-1" && reaction.user?.login === issue.user?.login,
+  );
+}
+
+async function main(): Promise<void> {
+  consola.info("Starting auto-close duplicates script...");
+  consola.info(
+    `Repository: ${GITHUB_REPOSITORY_OWNER}/${GITHUB_REPOSITORY_NAME}`,
+  );
+
+  const issues = await getOpenIssuesOlderThan3Days();
+  consola.info(`Found ${issues.length} open issues older than 3 days`);
+
+  let processedCount = 0;
+  let closedCount = 0;
+
+  for (const issue of issues) {
+    processedCount++;
+    consola.info(`Processing issue #${issue.number}: ${issue.title}`);
+
+    const comments = await getIssueComments(issue.number);
+
+    const botComment = comments.find(
+      (comment) =>
+        comment.user?.login === "github-actions[bot]" &&
+        comment.body.includes("<!-- dedupe-bot-comment -->"),
+    );
+
+    if (!botComment) {
+      consola.info(`  No duplicate bot comment found, skipping`);
+      await new Promise((resolve) => setTimeout(resolve, 1000));
+      continue;
+    }
+
+    const botCommentDate = new Date(botComment.created_at);
+    const now = new Date();
+    const timeSinceComment = now.getTime() - botCommentDate.getTime();
+
+    if (timeSinceComment < THREE_DAYS_MS) {
+      consola.info(`  Bot comment is less than 3 days old, skipping`);
+      await new Promise((resolve) => setTimeout(resolve, 1000));
+      continue;
+    }
+
+    const hasActivity = await hasActivityAfterComment(issue, botCommentDate);
+    if (hasActivity) {
+      consola.info(`  Has activity after bot comment, skipping`);
+      await new Promise((resolve) => setTimeout(resolve, 1000));
+      continue;
+    }
+
+    const hasThumbsDown = await hasCreatorThumbsDown(issue, botComment);
+    if (hasThumbsDown) {
+      consola.info(`  Creator reacted with thumbs down, skipping`);
+      await new Promise((resolve) => setTimeout(resolve, 1000));
+      continue;
+    }
+
+    consola.info(`  Closing issue #${issue.number} as duplicate`);
+    await closeIssue(
+      issue.number,
+      "This issue has been automatically closed as a duplicate. It was marked as a duplicate over 3 days ago with no further activity. If you believe this was closed in error, please comment and we'll re-evaluate.",
+    );
+
+    closedCount++;
+
+    await new Promise((resolve) => setTimeout(resolve, 1000));
+  }
+
+  consola.info("\n=== Summary ===");
+  consola.info(`Processed issues: ${processedCount}`);
+  consola.info(`Closed issues: ${closedCount}`);
+}
+
+try {
+  await main();
+} catch (error) {
+  consola.error("Error running auto-close script:", error);
+  process.exit(1);
+}
diff --git a/scripts/backfill-duplicate-detection.ts b/scripts/backfill-duplicate-detection.ts
new file mode 100755
index 00000000..d1ae9eb9
--- /dev/null
+++ b/scripts/backfill-duplicate-detection.ts
@@ -0,0 +1,107 @@
+#!/usr/bin/env bun
+
+import { consola } from "consola";
+import type { GitHubIssue } from "./lib/github.js";
+import {
+  API_BASE,
+  fetchAllPages,
+  fetchGitHub,
+  GITHUB_REPOSITORY_NAME,
+  GITHUB_REPOSITORY_OWNER,
+  getIssueComments,
+} from "./lib/github.js";
+
+const DAYS_BACK = Number.parseInt(process.env.DAYS_BACK ?? "90", 10);
"90", 10); +const DRY_RUN = process.env.DRY_RUN !== "false"; + +async function getOpenIssuesInRange(daysBack: number): Promise { + const startDate = new Date(); + startDate.setDate(startDate.getDate() - daysBack); + const since = startDate.toISOString(); + + const url = `${API_BASE}/issues?state=open&since=${since}&per_page=100&sort=created&direction=asc`; + const issues = await fetchAllPages(url); + return issues.filter((issue) => !issue.pull_request); +} + +async function hasDuplicateDetectionComment( + issueNumber: number, +): Promise { + const comments = await getIssueComments(issueNumber); + + return comments.some( + (comment) => + comment.user?.login === "github-actions[bot]" && + comment.body.includes(""), + ); +} + +async function triggerWorkflow(issueNumber: number): Promise { + const url = `${API_BASE}/actions/workflows/dedupe-issues.yml/dispatches`; + + await fetchGitHub(url, { + method: "POST", + body: JSON.stringify({ + ref: "main", + inputs: { + issue_number: issueNumber.toString(), + }, + }), + }); +} + +async function main(): Promise { + consola.info("Starting backfill duplicate detection script..."); + consola.info( + `Repository: ${GITHUB_REPOSITORY_OWNER}/${GITHUB_REPOSITORY_NAME}`, + ); + consola.info(`Days back: ${DAYS_BACK}`); + consola.info(`Dry run: ${DRY_RUN}`); + + const issues = await getOpenIssuesInRange(DAYS_BACK); + consola.info( + `Found ${issues.length} open issues in the past ${DAYS_BACK} days`, + ); + + let processedCount = 0; + let triggeredCount = 0; + + for (const issue of issues) { + processedCount++; + consola.info(`Processing issue #${issue.number}: ${issue.title}`); + + const hasComment = await hasDuplicateDetectionComment(issue.number); + + if (hasComment) { + consola.info(` Already has duplicate detection comment, skipping`); + await new Promise((resolve) => setTimeout(resolve, 1000)); + continue; + } + + if (DRY_RUN) { + consola.info( + ` [DRY RUN] Would trigger workflow for issue #${issue.number}`, + ); + triggeredCount++; + } else { + consola.info(` Triggering workflow for issue #${issue.number}`); + await triggerWorkflow(issue.number); + triggeredCount++; + } + + await new Promise((resolve) => setTimeout(resolve, 1000)); + } + + consola.info("\n=== Summary ==="); + consola.info(`Processed issues: ${processedCount}`); + consola.info( + `${DRY_RUN ? "Would trigger" : "Triggered"} workflows: ${triggeredCount}`, + ); +} + +try { + await main(); +} catch (error) { + consola.error("Error running backfill script:", error); + process.exit(1); +} diff --git a/scripts/comment-on-duplicates.sh b/scripts/comment-on-duplicates.sh new file mode 100755 index 00000000..1333ee92 --- /dev/null +++ b/scripts/comment-on-duplicates.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# +# Comments on a GitHub issue with a list of potential duplicates. +# Usage: ./comment-on-duplicates.sh --base-issue 123 --potential-duplicates 456 789 101 +# + +set -euo pipefail + +if [[ -z "${GITHUB_REPOSITORY:-}" ]]; then + echo "Error: GITHUB_REPOSITORY environment variable is required" >&2 + exit 1 +fi +REPO="$GITHUB_REPOSITORY" +BASE_ISSUE="" +DUPLICATES=() + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --base-issue) + BASE_ISSUE="$2" + shift 2 + ;; + --potential-duplicates) + shift + while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do + DUPLICATES+=("$1") + shift + done + ;; + *) + echo "Unknown option: $1" >&2 + exit 1 + ;; + esac +done + +# Validate base issue +if [[ -z "$BASE_ISSUE" ]]; then + echo "Error: --base-issue is required" >&2 + exit 1 +fi + +if ! 
[[ "$BASE_ISSUE" =~ ^[0-9]+$ ]]; then + echo "Error: --base-issue must be a number, got: $BASE_ISSUE" >&2 + exit 1 +fi + +# Validate duplicates +if [[ ${#DUPLICATES[@]} -eq 0 ]]; then + echo "Error: --potential-duplicates requires at least one issue number" >&2 + exit 1 +fi + +if [[ ${#DUPLICATES[@]} -gt 3 ]]; then + echo "Error: --potential-duplicates accepts at most 3 issues" >&2 + exit 1 +fi + +for dup in "${DUPLICATES[@]}"; do + if ! [[ "$dup" =~ ^[0-9]+$ ]]; then + echo "Error: duplicate issue must be a number, got: $dup" >&2 + exit 1 + fi +done + +# Validate that base issue exists +if ! gh issue view "$BASE_ISSUE" --repo "$REPO" &>/dev/null; then + echo "Error: issue #$BASE_ISSUE does not exist in $REPO" >&2 + exit 1 +fi + +# Validate that all duplicate issues exist +for dup in "${DUPLICATES[@]}"; do + if ! gh issue view "$dup" --repo "$REPO" &>/dev/null; then + echo "Error: issue #$dup does not exist in $REPO" >&2 + exit 1 + fi +done + +# Build comment body +BODY=''$'\n\n' +COUNT=${#DUPLICATES[@]} +if [[ $COUNT -eq 1 ]]; then + BODY+="Found 1 possible duplicate issue:"$'\n\n' +else + BODY+="Found $COUNT possible duplicate issues:"$'\n\n' +fi + +INDEX=1 +for dup in "${DUPLICATES[@]}"; do + BODY+="$INDEX. https://github.com/$REPO/issues/$dup"$'\n' + ((INDEX++)) +done + +BODY+=$'\n'"This issue will be automatically closed as a duplicate in 3 days."$'\n\n' +BODY+="- If your issue is a duplicate, please close it and 👍 the existing issue instead"$'\n' +BODY+="- To prevent auto-closure, add a comment or 👎 this comment (only the issue author's reaction is checked)"$'\n' + +# Post the comment +gh issue comment "$BASE_ISSUE" --repo "$REPO" --body "$BODY" + +echo "Posted duplicate comment on issue #$BASE_ISSUE" diff --git a/scripts/lib/github.ts b/scripts/lib/github.ts new file mode 100644 index 00000000..7db834aa --- /dev/null +++ b/scripts/lib/github.ts @@ -0,0 +1,104 @@ +import { consola } from "consola"; + +export interface GitHubIssue { + number: number; + title: string; + created_at: string; + user: { login: string } | null; + state: string; + pull_request?: unknown; +} + +export interface GitHubComment { + id: number; + user: { login: string } | null; + body: string; + created_at: string; +} + +export interface GitHubReaction { + id: number; + user: { login: string } | null; + content: string; +} + +const GITHUB_TOKEN = process.env.GITHUB_TOKEN; + +export const GITHUB_REPOSITORY_OWNER = process.env.GITHUB_REPOSITORY_OWNER; +export const GITHUB_REPOSITORY_NAME = process.env.GITHUB_REPOSITORY_NAME; + +if (!GITHUB_TOKEN) { + consola.error("Error: GITHUB_TOKEN environment variable is required"); + process.exit(1); +} + +if (!GITHUB_REPOSITORY_OWNER || !GITHUB_REPOSITORY_NAME) { + consola.error( + "Error: GITHUB_REPOSITORY_OWNER and GITHUB_REPOSITORY_NAME environment variables are required", + ); + process.exit(1); +} + +export const API_BASE = `https://api.github.com/repos/${GITHUB_REPOSITORY_OWNER}/${GITHUB_REPOSITORY_NAME}`; + +const HEADERS = { + Authorization: `Bearer ${GITHUB_TOKEN}`, + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", +}; + +export async function fetchGitHub( + url: string, + options: RequestInit = {}, +): Promise { + const response = await fetch(url, { + ...options, + headers: { ...HEADERS, ...options.headers }, + }); + + if (!response.ok) { + throw new Error( + `GitHub API error: ${response.status} ${response.statusText}`, + ); + } + + if (response.status === 204) { + return undefined as T; + } + + return (await response.json()) as T; +} 
+
+function getNextPageUrl(linkHeader: string | null): string | null {
+  if (!linkHeader) return null;
+  const match = linkHeader.match(/<([^>]+)>;\s*rel="next"/);
+  return match?.[1] ?? null;
+}
+
+export async function fetchAllPages<T>(url: string): Promise<T[]> {
+  const results: T[] = [];
+  let nextUrl: string | null = url;
+
+  while (nextUrl) {
+    const response = await fetch(nextUrl, { headers: HEADERS });
+
+    if (!response.ok) {
+      throw new Error(
+        `GitHub API error: ${response.status} ${response.statusText}`,
+      );
+    }
+
+    const page = (await response.json()) as T[];
+    results.push(...page);
+    nextUrl = getNextPageUrl(response.headers.get("link"));
+  }
+
+  return results;
+}
+
+export async function getIssueComments(
+  issueNumber: number,
+): Promise<GitHubComment[]> {
+  const url = `${API_BASE}/issues/${issueNumber}/comments?per_page=100`;
+  return fetchAllPages<GitHubComment>(url);
+}
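
Verification sketch (not part of the diff; assumes Bun and an authenticated gh
CLI are available locally): the scripts can be exercised directly with the same
environment variables the workflows set. The repository slug below is taken
from the prompt file, the issue numbers mirror the usage line in
comment-on-duplicates.sh, and the token value is a placeholder. The HTML
comment sentinel shown in comment-on-duplicates.sh is an arbitrary marker; any
stable string works as long as the prompt, the shell script, and both
TypeScript scripts agree on it.

  # Dry-run the backfill (DRY_RUN defaults to true, so no workflows are dispatched).
  GITHUB_TOKEN="<token>" \
  GITHUB_REPOSITORY_OWNER=acacode \
  GITHUB_REPOSITORY_NAME=swagger-typescript-api \
  DAYS_BACK=30 \
  bun run scripts/backfill-duplicate-detection.ts

  # Post a duplicates comment by hand (placeholder issue numbers).
  GITHUB_REPOSITORY=acacode/swagger-typescript-api \
  ./scripts/comment-on-duplicates.sh --base-issue 123 --potential-duplicates 456 789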