From 6df84209820718099728d96a140b7a8c1891e355 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Thu, 5 Feb 2026 17:41:20 +0500 Subject: [PATCH 01/28] chore(UI): update html report --- codeclone/templates.py | 1304 ++++++++++++++++++++++++++-------------- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 870 insertions(+), 438 deletions(-) diff --git a/codeclone/templates.py b/codeclone/templates.py index d8870e0..c0c1306 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -6,161 +6,127 @@ Licensed under the MIT License. """ +from __future__ import annotations + from string import Template FONT_CSS_URL = ( - "https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700" - "&family=JetBrains+Mono:wght@400;500;600&display=swap" + "https://fonts.googleapis.com/css2?" + "family=Inter:wght@400;500;600;700&" + "family=JetBrains+Mono:wght@400;500&" + "display=swap" ) -REPORT_TEMPLATE = Template(r""" - +REPORT_TEMPLATE = Template( + r""" ${title} - - @@ -962,6 +1029,25 @@
+ +
+
+
+
+ +
+
+ +
+
+
+
@@ -970,8 +1056,21 @@
v${version}
+ '; - const container = $('.toast-container'); + const container = $$('.toast-container'); container.appendChild(toast); setTimeout(() => toast.classList.add('toast-show'), 10); @@ -1192,9 +1192,9 @@ // ========== Command Palette ========== function initCommandPalette() { - const palette = $('#command-palette'); - const input = $('#command-input'); - const results = $('#command-results'); + const palette = $$('#command-palette'); + const input = $$('#command-input'); + const results = $$('#command-results'); const commands = [ { @@ -1239,7 +1239,7 @@ icon: '🔍', label: 'Focus Search', shortcut: '/', - action: () => $('.search')?.focus() + action: () => $$('.search')?.focus() }, { icon: '📜', @@ -1334,8 +1334,8 @@ input.value = ''; } - $('#command-btn')?.addEventListener('click', openCommandPalette); - $('.command-backdrop')?.addEventListener('click', closeCommandPalette); + $$('#command-btn')?.addEventListener('click', openCommandPalette); + $$('.command-backdrop')?.addEventListener('click', closeCommandPalette); input?.addEventListener('input', (e) => { renderCommands(e.target.value); @@ -1345,7 +1345,7 @@ if (e.key === 'Escape') { closeCommandPalette(); } else if (e.key === 'Enter') { - const selected = $('.command-item.selected'); + const selected = $$('.command-item.selected'); if (selected) selected.click(); } }); @@ -1355,8 +1355,8 @@ // ========== Statistics ========== function calculateStats() { - const groups = $$('.group'); - const items = $$('.item'); + const groups = $$$$('.group'); + const items = $$$$('.item'); state.stats.totalGroups = groups.length; state.stats.totalItems = items.length; @@ -1373,7 +1373,7 @@ } function showStats() { - const dashboard = $('#stats-dashboard'); + const dashboard = $$('#stats-dashboard'); if (dashboard.children.length > 0) { dashboard.style.display = dashboard.style.display === 'none' ? 'grid' : 'none'; return; @@ -1428,7 +1428,7 @@ // ========== Charts ========== function showCharts() { - const container = $('#chart-container'); + const container = $$('#chart-container'); container.style.display = container.style.display === 'none' ? 'block' : 'none'; if (container.style.display === 'block') { @@ -1443,7 +1443,7 @@ const data = { generated: new Date().toISOString(), stats: state.stats, - groups: Array.from($$('.group')).map(g => ({ + groups: Array.from($$$$('.group')).map(g => ({ title: g.querySelector('.group-title')?.textContent, items: g.querySelectorAll('.item').length })) @@ -1467,14 +1467,14 @@ // ========== Group Controls ========== function expandAll() { - $$('.items').forEach(b => b.style.display = ''); - $$('[data-toggle-group]').forEach(c => c.style.transform = 'rotate(0deg)'); + $$$$('.items').forEach(b => b.style.display = ''); + $$$$('[data-toggle-group]').forEach(c => c.style.transform = 'rotate(0deg)'); showToast('All groups expanded', 'info'); } function collapseAll() { - $$('.items').forEach(b => b.style.display = 'none'); - $$('[data-toggle-group]').forEach(c => c.style.transform = 'rotate(-90deg)'); + $$$$('.items').forEach(b => b.style.display = 'none'); + $$$$('[data-toggle-group]').forEach(c => c.style.transform = 'rotate(-90deg)'); showToast('All groups collapsed', 'info'); } @@ -1493,7 +1493,7 @@ // / - Focus search if (e.key === '/') { e.preventDefault(); - $('.search')?.focus(); + $$('.search')?.focus(); } // T - Toggle theme @@ -1517,11 +1517,11 @@ // Escape - Close modals if (e.key === 'Escape') { if (state.commandPaletteOpen) { - const palette = $('#command-palette'); + const palette = $$('#command-palette'); palette?.classList.remove('show'); state.commandPaletteOpen = false; } - const search = $('.search'); + const search = $$('.search'); if (search && search.value) { search.value = ''; search.dispatchEvent(new Event('input', { bubbles: true })); @@ -1530,7 +1530,7 @@ }); // ========== Group Toggle ========== - $$('.group-head').forEach((head) => { + $$$$('.group-head').forEach((head) => { head.addEventListener('click', (e) => { if (e.target.closest('button')) return; const btn = head.querySelector('[data-toggle-group]'); @@ -1538,11 +1538,11 @@ }); }); - $$('[data-toggle-group]').forEach((btn) => { + $$$$('[data-toggle-group]').forEach((btn) => { btn.addEventListener('click', (e) => { e.stopPropagation(); const id = btn.getAttribute('data-toggle-group'); - const body = $('#group-body-' + id); + const body = $$('#group-body-' + id); if (!body) return; const isHidden = body.style.display === 'none'; @@ -1553,19 +1553,19 @@ // ========== Section Management ========== function initSection(sectionId) { - const section = $('section[data-section="' + sectionId + '"]'); + const section = $$('section[data-section="' + sectionId + '"]'); if (!section) return; - const groups = Array.from($$('.group[data-group="' + sectionId + '"]')); - const searchInput = $('#search-' + sectionId); - const btnPrev = $('[data-prev="' + sectionId + '"]'); - const btnNext = $('[data-next="' + sectionId + '"]'); - const meta = $('[data-page-meta="' + sectionId + '"]'); - const selPageSize = $('[data-pagesize="' + sectionId + '"]'); - const btnClear = $('[data-clear="' + sectionId + '"]'); - const btnCollapseAll = $('[data-collapse-all="' + sectionId + '"]'); - const btnExpandAll = $('[data-expand-all="' + sectionId + '"]'); - const pill = $('[data-count-pill="' + sectionId + '"]'); + const groups = Array.from($$$$('.group[data-group="' + sectionId + '"]')); + const searchInput = $$('#search-' + sectionId); + const btnPrev = $$('[data-prev="' + sectionId + '"]'); + const btnNext = $$('[data-next="' + sectionId + '"]'); + const meta = $$('[data-page-meta="' + sectionId + '"]'); + const selPageSize = $$('[data-pagesize="' + sectionId + '"]'); + const btnClear = $$('[data-clear="' + sectionId + '"]'); + const btnCollapseAll = $$('[data-collapse-all="' + sectionId + '"]'); + const btnExpandAll = $$('[data-expand-all="' + sectionId + '"]'); + const pill = $$('[data-count-pill="' + sectionId + '"]'); const sectionState = { q: '', @@ -1669,8 +1669,8 @@ } // ========== Event Listeners ========== - $('#theme-toggle')?.addEventListener('click', toggleTheme); - $('#export-btn')?.addEventListener('click', () => exportReport('json')); + $$('#theme-toggle')?.addEventListener('click', toggleTheme); + $$('#export-btn')?.addEventListener('click', () => exportReport('json')); // ========== Initialize ========== initTheme(); @@ -1681,7 +1681,7 @@ // Welcome message setTimeout(() => { - const groupCount = $$('.group').length; + const groupCount = $$$$('.group').length; if (groupCount > 0) { showToast(groupCount + ' clone groups loaded', 'success'); } From d66a3b76cd69c341cf8c794d98bfb0b205662339 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Thu, 5 Feb 2026 17:52:11 +0500 Subject: [PATCH 03/28] chore(UI): update html report --- codeclone/templates.py | 191 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 172 insertions(+), 19 deletions(-) diff --git a/codeclone/templates.py b/codeclone/templates.py index 6a17fdb..de6993d 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -1125,6 +1125,159 @@ // ========== Utilities ========== const $$ = (sel) => document.querySelector(sel); const $$$$ = (sel) => document.querySelectorAll(sel); + const svg = (parts) => parts.join(''); + const ICONS = { + info: svg([ + '' + ]), + success: svg([ + '' + ]), + warning: svg([ + '' + ]), + error: svg([ + '' + ]), + exportJson: svg([ + '' + ]), + exportPdf: svg([ + '' + ]), + stats: svg([ + '' + ]), + charts: svg([ + '' + ]), + refresh: svg([ + '' + ]), + search: svg([ + '' + ]), + scrollTop: svg([ + '' + ]), + scrollBottom: svg([ + '' + ]), + theme: svg([ + '' + ]), + expand: svg([ + '' + ]), + collapse: svg([ + '' + ]), + cloneGroups: svg([ + '' + ]), + totalClones: svg([ + '' + ]), + avgGroup: svg([ + '' + ]), + largestGroup: svg([ + '' + ]) + }; // ========== State Management ========== const state = { @@ -1159,10 +1312,10 @@ // ========== Toast Notifications ========== function showToast(message, type = 'info') { const icons = { - info: 'i', - success: '✅', - warning: '⚠️', - error: '❌' + info: ICONS.info, + success: ICONS.success, + warning: ICONS.warning, + error: ICONS.error }; const toast = document.createElement('div'); @@ -1201,31 +1354,31 @@ section: 'Actions', items: [ { - icon: '📥', + icon: ICONS.exportJson, label: 'Export as JSON', shortcut: '⌘E', action: () => exportReport('json') }, { - icon: '📄', + icon: ICONS.exportPdf, label: 'Export as PDF', shortcut: null, action: () => exportReport('pdf') }, { - icon: '📊', + icon: ICONS.stats, label: 'Show Statistics', shortcut: '⌘S', action: () => showStats() }, { - icon: '📈', + icon: ICONS.charts, label: 'Show Charts', shortcut: null, action: () => showCharts() }, { - icon: '🔄', + icon: ICONS.refresh, label: 'Refresh View', shortcut: '⌘R', action: () => location.reload() @@ -1236,19 +1389,19 @@ section: 'Navigation', items: [ { - icon: '🔍', + icon: ICONS.search, label: 'Focus Search', shortcut: '/', action: () => $$('.search')?.focus() }, { - icon: '📜', + icon: ICONS.scrollTop, label: 'Scroll to Top', shortcut: null, action: () => window.scrollTo(0, 0) }, { - icon: '⬇️', + icon: ICONS.scrollBottom, label: 'Scroll to Bottom', shortcut: null, action: () => window.scrollTo(0, document.body.scrollHeight) @@ -1259,19 +1412,19 @@ section: 'View', items: [ { - icon: '🌓', + icon: ICONS.theme, label: 'Toggle Theme', shortcut: 'T', action: () => toggleTheme() }, { - icon: '📖', + icon: ICONS.expand, label: 'Expand All', shortcut: null, action: () => expandAll() }, { - icon: '📕', + icon: ICONS.collapse, label: 'Collapse All', shortcut: null, action: () => collapseAll() @@ -1383,25 +1536,25 @@ const stats = [ { - icon: '📊', + icon: ICONS.cloneGroups, value: state.stats.totalGroups, label: 'Clone Groups', trend: null }, { - icon: '📦', + icon: ICONS.totalClones, value: state.stats.totalItems, label: 'Total Clones', trend: null }, { - icon: '📈', + icon: ICONS.avgGroup, value: state.stats.avgGroupSize, label: 'Avg Group Size', trend: null }, { - icon: '🔝', + icon: ICONS.largestGroup, value: state.stats.largestGroup, label: 'Largest Group', trend: null From c5aea24df742e90d65fbc94f522c16224d22c294 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Thu, 5 Feb 2026 18:57:08 +0500 Subject: [PATCH 04/28] feat(core): improves clone-detection precision and explainability with deterministic normalization and CFG upgrades, adds segment-level internal clone reporting, refreshes the HTML report UI, and introduces baseline versioning. This is a breaking change for CI workflows that rely on existing baselines. --- CHANGELOG.md | 91 ++++++++++++++++++++++ CONTRIBUTING.md | 12 +++ README.md | 17 +++- SECURITY.md | 8 +- codeclone.baseline.json | 4 +- codeclone/baseline.py | 41 +++++++++- codeclone/blocks.py | 57 ++++++++++++++ codeclone/cache.py | 23 +++++- codeclone/cfg.py | 80 ++++++++++++++++--- codeclone/cli.py | 58 ++++++++++++-- codeclone/extractor.py | 19 ++++- codeclone/html_report.py | 13 +++- codeclone/normalize.py | 72 +++++++++++++++++ codeclone/report.py | 37 ++++++++- codeclone/templates.py | 9 +++ docs/architecture.md | 34 +++++++- docs/cfg.md | 14 +++- tests/test_baseline.py | 18 ++++- tests/test_cache.py | 21 ++++- tests/test_cfg.py | 149 ++++++++++++++++++++++++++++++++++-- tests/test_cli_inprocess.py | 143 ++++++++++++++++++++++++++++++++-- tests/test_cli_unit.py | 10 +++ tests/test_extractor.py | 18 +++-- tests/test_html_report.py | 74 +++++++++++++++++- tests/test_normalize.py | 94 +++++++++++++++++++++++ tests/test_report.py | 79 ++++++++++++++++++- tests/test_security.py | 36 +++++++++ tests/test_segments.py | 105 +++++++++++++++++++++++++ 28 files changed, 1270 insertions(+), 66 deletions(-) create mode 100644 tests/test_segments.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e86fef2..92c7c8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,96 @@ # Changelog +## [1.3.0] - 2026-02-05 + +### Overview + +This release improves clone-detection precision and explainability with deterministic +normalization and CFG upgrades, adds segment-level internal clone reporting, refreshes +the HTML report UI, and introduces baseline versioning. This is a breaking change for CI +workflows that rely on existing baselines. + +### Clone Detection Accuracy + +- **Commutative normalization** + Canonicalized operand order for `+`, `*`, `|`, `&`, `^` when operands are free of side + effects, enabling safe detection of reordered expressions. + +- **Local logical equivalence** + Normalized `not (x in y)` to `x not in y` and `not (x is y)` to `x is not y` without + De Morgan transformations or broader boolean rewrites. + +### CFG Precision + +- **Short‑circuit modeling** + Represented `and`/`or` as micro‑CFGs with explicit branch splits after each operand. + +- **Exception linking** + Linked `try/except` only to statements that may raise (calls, attribute access, indexing, + `await`, `yield from`, `raise`) instead of blanket connections. + +### Segment‑Level Detection + +- **Window fingerprints** + Added deterministic segment windows inside functions for internal clone discovery. + +- **Candidate generation** + Used an order‑insensitive signature for candidate grouping and a strict segment hash for + final confirmation; segment matches do not affect baseline or CI failure logic. + +### Baseline & CI + +- Baselines are now **versioned** and include a schema version. +- Mismatched baseline versions **fail fast** and require regeneration. + +**Breaking (CI):** baseline version mismatch now fails hard; CI requires baseline regeneration on upgrade. + +Update the baseline: + +```bash +codeclone . --update-baseline +``` + +### HTML Report UI + +- **Visual refresh** + Introduced a redesigned, modern HTML report layout with a sticky top bar and improved + typography and spacing. + +- **Interactive tooling** + Added a command palette, keyboard shortcuts, toast notifications, and quick actions for + common tasks (export, stats, charts, navigation). + +- **Reporting widgets** + Added a stats dashboard and chart container to surface high‑level clone metrics directly + in the report. + +- **Icon system** + Replaced emoji glyphs with inline SVG icons for consistent rendering and a fully + self‑contained UI. + +- **Segment reporting** + Added a dedicated “Segment clones” section and summary metric in HTML/TXT/JSON outputs. + +### Cache & Internals + +- Extended cache schema to store segment fingerprints (cache version bump). + +### Packaging + +- Removed an invalid PyPI classifier from the package metadata. + +### Documentation + +- Updated architecture and CFG documentation to reflect new normalization, CFG, and + segment‑level detection behavior. +- Updated README, SECURITY, and CONTRIBUTING guidance for 1.3.0. + +### Testing & Security + +- Expanded security tests (HTML escaping and safety checks). + +--- + ## [1.2.1] - 2026-02-02 ### Overview diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7987e01..8ed5d91 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -30,6 +30,7 @@ We especially welcome contributions in the following areas: - Control Flow Graph (CFG) construction and semantics - AST normalization improvements +- Segment-level clone detection and reporting - False-positive reduction - HTML report UX improvements - Performance optimizations @@ -83,6 +84,14 @@ Such changes often require design-level discussion and may be staged across vers --- +## Security & Safety Expectations + +- Assume **untrusted input** (paths and source code). +- Add **negative tests** for any normalization or CFG change. +- Changes must preserve determinism and avoid new false positives. + +--- + ## Development Setup ```bash @@ -128,6 +137,9 @@ CodeClone follows **semantic versioning**: - **MINOR**: new detection capabilities (for example, CFG improvements) - **PATCH**: bug fixes, performance improvements, and UI/UX polish +Baselines are versioned. Any change to detection behavior must include documentation +and tests, and may require baseline regeneration. + --- ## License diff --git a/README.md b/README.md index 21b7cc9..65d2815 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,12 @@ Typical use cases: - no `__init__` noise, - size and statement-count thresholds. +### Segment-level internal clone detection + +- Detects repeated **segment windows** inside the same function. +- Uses a two‑step deterministic match (candidate signature → strict hash). +- Included in reports for explainability, **not** in baseline/CI failure logic. + ### Control-Flow Awareness (CFG v1) - Each function is converted into a **Control Flow Graph**. @@ -74,6 +80,8 @@ Typical use cases: - `with` / `async with` - `match` / `case` (Python 3.10+) - Current CFG semantics (v1): + - `and` / `or` are modeled as short‑circuit micro‑CFG branches, + - `try/except` links only from statements that may raise, - `break` and `continue` are treated as statements (no jump targets), - after-blocks are explicit and always present, - focus is on **structural similarity**, not precise runtime semantics. @@ -86,6 +94,7 @@ This design keeps clone detection **stable, deterministic, and low-noise**. - Conservative defaults tuned for real-world Python projects. - Explicit thresholds for size and statement count. - No probabilistic scoring or heuristic similarity thresholds. +- Safe commutative normalization and local logical equivalences only. - Focus on *architectural duplication*, not micro-similarities. ### CI-friendly baseline mode @@ -149,6 +158,9 @@ codeclone . --update-baseline Commit the generated baseline file to the repository. +Baselines are **versioned**. If CodeClone is upgraded, regenerate the baseline to keep +CI deterministic and explainable. + ### 2. Use in CI ```bash @@ -215,8 +227,9 @@ repos: 2. Normalize AST (names, constants, attributes, annotations). 3. Build a **Control Flow Graph (CFG)** per function. 4. Compute stable CFG fingerprints. -5. Detect function-level and block-level clones. -6. Apply conservative filters to suppress noise. +5. Extract segment windows for internal clone discovery. +6. Detect function-level, block-level, and segment-level clones. +7. Apply conservative filters to suppress noise. See the architectural overview: diff --git a/SECURITY.md b/SECURITY.md index 533843a..5e84183 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -9,7 +9,8 @@ The following versions currently receive security updates: | Version | Supported | |---------|-----------| -| 1.2.x | Yes | +| 1.3.x | Yes | +| 1.2.x | No | | 1.1.x | No | | 1.0.x | No | @@ -33,6 +34,11 @@ Potential risk areas include: These areas are explicitly tested and hardened, but are still the primary focus of ongoing security review. +Additional safeguards: + +- HTML report content is escaped to prevent script injection. +- Reports are static and do not execute analyzed code. + --- ## Reporting a Vulnerability diff --git a/codeclone.baseline.json b/codeclone.baseline.json index ce9a169..dcc2fc8 100644 --- a/codeclone.baseline.json +++ b/codeclone.baseline.json @@ -4,5 +4,7 @@ "7d573fa56fb11050f1642f18ca4bb3225e11e194|0-19" ], "blocks": [], - "python_version": "3.13" + "python_version": "3.13", + "baseline_version": "1.3.0", + "schema_version": 1 } \ No newline at end of file diff --git a/codeclone/baseline.py b/codeclone/baseline.py index 74f2030..e73ff7f 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline.py @@ -13,15 +13,28 @@ from pathlib import Path from typing import Any +from . import __version__ + +BASELINE_SCHEMA_VERSION = 1 + class Baseline: - __slots__ = ("blocks", "functions", "path", "python_version") + __slots__ = ( + "baseline_version", + "blocks", + "functions", + "path", + "python_version", + "schema_version", + ) def __init__(self, path: str | Path): self.path = Path(path) self.functions: set[str] = set() self.blocks: set[str] = set() self.python_version: str | None = None + self.baseline_version: str | None = None + self.schema_version: int | None = None def load(self) -> None: if not self.path.exists(): @@ -35,6 +48,14 @@ def load(self) -> None: self.python_version = ( python_version if isinstance(python_version, str) else None ) + baseline_version = data.get("baseline_version") + self.baseline_version = ( + baseline_version if isinstance(baseline_version, str) else None + ) + schema_version = data.get("schema_version") + self.schema_version = ( + schema_version if isinstance(schema_version, int) else None + ) except json.JSONDecodeError as e: raise ValueError(f"Corrupted baseline file at {self.path}: {e}") from e @@ -42,7 +63,13 @@ def save(self) -> None: self.path.parent.mkdir(parents=True, exist_ok=True) self.path.write_text( json.dumps( - _baseline_payload(self.functions, self.blocks, self.python_version), + _baseline_payload( + self.functions, + self.blocks, + self.python_version, + self.baseline_version, + self.schema_version, + ), indent=2, ensure_ascii=False, ), @@ -55,11 +82,15 @@ def from_groups( block_groups: Mapping[str, object], path: str | Path = "", python_version: str | None = None, + baseline_version: str | None = None, + schema_version: int | None = None, ) -> Baseline: bl = Baseline(path) bl.functions = set(func_groups.keys()) bl.blocks = set(block_groups.keys()) bl.python_version = python_version + bl.baseline_version = baseline_version + bl.schema_version = schema_version return bl def diff( @@ -74,6 +105,8 @@ def _baseline_payload( functions: set[str], blocks: set[str], python_version: str | None, + baseline_version: str | None, + schema_version: int | None, ) -> dict[str, Any]: payload: dict[str, Any] = { "functions": sorted(functions), @@ -81,4 +114,8 @@ def _baseline_payload( } if python_version: payload["python_version"] = python_version + payload["baseline_version"] = baseline_version or __version__ + payload["schema_version"] = ( + schema_version if schema_version is not None else BASELINE_SCHEMA_VERSION + ) return payload diff --git a/codeclone/blocks.py b/codeclone/blocks.py index 551d243..3469361 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks.py @@ -12,6 +12,7 @@ from dataclasses import dataclass from .blockhash import stmt_hash +from .fingerprint import sha1 from .normalize import NormalizationConfig @@ -25,6 +26,17 @@ class BlockUnit: size: int +@dataclass(frozen=True, slots=True) +class SegmentUnit: + segment_hash: str + segment_sig: str + filepath: str + qualname: str + start_line: int + end_line: int + size: int + + def extract_blocks( func_node: ast.AST, *, @@ -72,3 +84,48 @@ def extract_blocks( break return blocks + + +def extract_segments( + func_node: ast.AST, + *, + filepath: str, + qualname: str, + cfg: NormalizationConfig, + window_size: int, + max_segments: int, +) -> list[SegmentUnit]: + body = getattr(func_node, "body", None) + if not isinstance(body, list) or len(body) < window_size: + return [] + + stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] + + segments: list[SegmentUnit] = [] + + for i in range(len(stmt_hashes) - window_size + 1): + start = getattr(body[i], "lineno", None) + end = getattr(body[i + window_size - 1], "end_lineno", None) + if not start or not end: + continue + + window = stmt_hashes[i : i + window_size] + segment_hash = sha1("|".join(window)) + segment_sig = sha1("|".join(sorted(window))) + + segments.append( + SegmentUnit( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=filepath, + qualname=qualname, + start_line=start, + end_line=end, + size=window_size, + ) + ) + + if len(segments) >= max_segments: + break + + return segments diff --git a/codeclone/cache.py b/codeclone/cache.py index f652d17..8045596 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -19,7 +19,7 @@ from typing import TYPE_CHECKING, Any, TypedDict, cast if TYPE_CHECKING: - from .blocks import BlockUnit + from .blocks import BlockUnit, SegmentUnit from .extractor import Unit from .errors import CacheError @@ -50,10 +50,21 @@ class BlockDict(TypedDict): size: int +class SegmentDict(TypedDict): + segment_hash: str + segment_sig: str + filepath: str + qualname: str + start_line: int + end_line: int + size: int + + class CacheEntry(TypedDict): stat: FileStat units: list[UnitDict] blocks: list[BlockDict] + segments: list[SegmentDict] class CacheData(TypedDict): @@ -63,7 +74,7 @@ class CacheData(TypedDict): class Cache: __slots__ = ("data", "load_warning", "path", "secret") - CACHE_VERSION = "1.0" + CACHE_VERSION = "1.1" def __init__(self, path: str | Path): self.path = Path(path) @@ -129,7 +140,7 @@ def load(self) -> None: self.data = {"version": self.CACHE_VERSION, "files": {}} return - self.data = cast(CacheData, data) + self.data = cast(CacheData, cast(object, data)) self.load_warning = None except (json.JSONDecodeError, ValueError): @@ -159,7 +170,7 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: if not isinstance(entry, dict): return None - required = {"stat", "units", "blocks"} + required = {"stat", "units", "blocks", "segments"} if not required.issubset(entry.keys()): return None @@ -171,11 +182,15 @@ def put_file_entry( stat_sig: FileStat, units: list[Unit], blocks: list[BlockUnit], + segments: list[SegmentUnit], ) -> None: self.data["files"][filepath] = { "stat": stat_sig, "units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])), "blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])), + "segments": cast( + list[SegmentDict], cast(object, [asdict(s) for s in segments]) + ), } diff --git a/codeclone/cfg.py b/codeclone/cfg.py index 9235a7f..ffa7aa8 100644 --- a/codeclone/cfg.py +++ b/codeclone/cfg.py @@ -102,14 +102,11 @@ def _visit(self, stmt: ast.stmt) -> None: # ---------- Control Flow ---------- def _visit_if(self, stmt: ast.If) -> None: - self.current.statements.append(ast.Expr(value=stmt.test)) - then_block = self.cfg.create_block() else_block = self.cfg.create_block() after_block = self.cfg.create_block() - self.current.add_successor(then_block) - self.current.add_successor(else_block) + self._emit_condition(stmt.test, then_block, else_block) self.current = then_block self._visit_statements(stmt.body) @@ -131,9 +128,7 @@ def _visit_while(self, stmt: ast.While) -> None: self.current.add_successor(cond_block) self.current = cond_block - self.current.statements.append(ast.Expr(value=stmt.test)) - self.current.add_successor(body_block) - self.current.add_successor(after_block) + self._emit_condition(stmt.test, body_block, after_block) self.current = body_block self._visit_statements(stmt.body) @@ -198,14 +193,14 @@ def _visit_try(self, stmt: _TryLike) -> None: final_block = self.cfg.create_block() # Process each statement in try body - # Link each to exception handlers + # Link only statements that can raise to exception handlers for stmt_node in stmt.body: if self.current.is_terminated: break - # Current statement could raise exception - for h_block in handlers_blocks: - self.current.add_successor(h_block) + if _stmt_can_raise(stmt_node): + for h_block in handlers_blocks: + self.current.add_successor(h_block) self._visit(stmt_node) @@ -261,3 +256,66 @@ def _visit_match(self, stmt: ast.Match) -> None: self.current.add_successor(after_block) self.current = after_block + + def _emit_condition( + self, test: ast.expr, true_block: Block, false_block: Block + ) -> None: + if isinstance(test, ast.BoolOp) and isinstance(test.op, (ast.And, ast.Or)): + self._emit_boolop(test, true_block, false_block) + return + + self.current.statements.append(ast.Expr(value=test)) + self.current.add_successor(true_block) + self.current.add_successor(false_block) + + def _emit_boolop( + self, test: ast.BoolOp, true_block: Block, false_block: Block + ) -> None: + values = test.values + op = test.op + current = self.current + + for idx, value in enumerate(values): + current.statements.append(ast.Expr(value=value)) + is_last = idx == len(values) - 1 + + if isinstance(op, ast.And): + if is_last: + current.add_successor(true_block) + current.add_successor(false_block) + else: + next_block = self.cfg.create_block() + current.add_successor(next_block) + current.add_successor(false_block) + current = next_block + else: + if is_last: + current.add_successor(true_block) + current.add_successor(false_block) + else: + next_block = self.cfg.create_block() + current.add_successor(true_block) + current.add_successor(next_block) + current = next_block + + self.current = current + + +def _stmt_can_raise(stmt: ast.stmt) -> bool: + if isinstance(stmt, ast.Raise): + return True + + for node in ast.walk(stmt): + if isinstance( + node, + ( + ast.Call, + ast.Attribute, + ast.Subscript, + ast.Await, + ast.YieldFrom, + ), + ): + return True + + return False diff --git a/codeclone/cli.py b/codeclone/cli.py index 677dbf8..6e52791 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -20,13 +20,20 @@ from rich.table import Table from rich.theme import Theme -from .baseline import Baseline +from . import __version__ +from .baseline import BASELINE_SCHEMA_VERSION, Baseline from .cache import Cache, CacheEntry, FileStat, file_stat_signature from .errors import CacheError from .extractor import extract_units_from_source from .html_report import build_html_report from .normalize import NormalizationConfig -from .report import build_block_groups, build_groups, to_json_report, to_text +from .report import ( + build_block_groups, + build_groups, + build_segment_groups, + to_json_report, + to_text, +) from .scanner import iter_py_files, module_name_from_path # Custom theme for Rich @@ -54,6 +61,7 @@ class ProcessingResult: error: str | None = None units: list[Any] | None = None blocks: list[Any] | None = None + segments: list[Any] | None = None stat: FileStat | None = None @@ -108,7 +116,7 @@ def process_file( stat = file_stat_signature(filepath) module_name = module_name_from_path(root, filepath) - units, blocks = extract_units_from_source( + units, blocks, segments = extract_units_from_source( source=source, filepath=filepath, module_name=module_name, @@ -122,6 +130,7 @@ def process_file( success=True, units=units, blocks=blocks, + segments=segments, stat=stat, ) @@ -262,6 +271,7 @@ def main() -> None: all_units: list[dict[str, Any]] = [] all_blocks: list[dict[str, Any]] = [] + all_segments: list[dict[str, Any]] = [] changed_files_count = 0 files_to_process: list[str] = [] @@ -315,6 +325,12 @@ def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | Non cast(object, cached.get("blocks", [])), ) ) + all_segments.extend( + cast( + list[dict[str, Any]], + cast(object, cached.get("segments", [])), + ) + ) else: files_to_process.append(fp) except Exception as e: @@ -335,12 +351,15 @@ def handle_result(result: ProcessingResult) -> None: result.stat, result.units or [], result.blocks or [], + result.segments or [], ) changed_files_count += 1 if result.units: all_units.extend([asdict(u) for u in result.units]) if result.blocks: all_blocks.extend([asdict(b) for b in result.blocks]) + if result.segments: + all_segments.extend([asdict(s) for s in result.segments]) else: failed_files.append(f"{result.filepath}: {result.error}") @@ -460,6 +479,7 @@ def process_sequential(with_progress: bool) -> None: with console.status("[bold green]Grouping clones...", spinner="dots"): func_groups = build_groups(all_units) block_groups = build_block_groups(all_blocks) + segment_groups = build_segment_groups(all_segments) try: cache.save() except CacheError as e: @@ -468,6 +488,7 @@ def process_sequential(with_progress: bool) -> None: # Reporting func_clones_count = len(func_groups) block_clones_count = len(block_groups) + segment_clones_count = len(segment_groups) # Baseline Logic baseline_path = Path(args.baseline).expanduser().resolve() @@ -480,6 +501,27 @@ def process_sequential(with_progress: bool) -> None: if baseline_exists: baseline.load() + if not args.update_baseline: + if baseline.baseline_version != __version__: + console.print( + "[error]Baseline version mismatch.[/error]\n" + "Baseline was generated with CodeClone " + f"{baseline.baseline_version or 'unknown'}.\n" + f"Current version: {__version__}.\n" + "Please regenerate the baseline with --update-baseline." + ) + sys.exit(2) + if ( + baseline.schema_version is not None + and baseline.schema_version != BASELINE_SCHEMA_VERSION + ): + console.print( + "[error]Baseline schema version mismatch.[/error]\n" + f"Baseline schema: {baseline.schema_version}. " + f"Current schema: {BASELINE_SCHEMA_VERSION}.\n" + "Please regenerate the baseline with --update-baseline." + ) + sys.exit(2) if not args.update_baseline and baseline.python_version: current_version = f"{sys.version_info.major}.{sys.version_info.minor}" if baseline.python_version != current_version: @@ -511,6 +553,8 @@ def process_sequential(with_progress: bool) -> None: block_groups, path=baseline_path, python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + baseline_version=__version__, + schema_version=BASELINE_SCHEMA_VERSION, ) new_baseline.save() console.print(f"[success]✔ Baseline updated:[/success] {baseline_path}") @@ -529,6 +573,7 @@ def process_sequential(with_progress: bool) -> None: table.add_row("Files Processed", str(changed_files_count)) table.add_row("Total Function Clones", str(func_clones_count)) table.add_row("Total Block Clones", str(block_clones_count)) + table.add_row("Total Segment Clones", str(segment_clones_count)) if baseline_exists: style = "error" if new_clones_count > 0 else "success" @@ -546,6 +591,7 @@ def process_sequential(with_progress: bool) -> None: build_html_report( func_groups=func_groups, block_groups=block_groups, + segment_groups=segment_groups, title="CodeClone Report", context_lines=3, max_snippet_lines=220, @@ -558,7 +604,7 @@ def process_sequential(with_progress: bool) -> None: out = Path(args.json_out).expanduser().resolve() out.parent.mkdir(parents=True, exist_ok=True) out.write_text( - to_json_report(func_groups, block_groups), + to_json_report(func_groups, block_groups, segment_groups), "utf-8", ) console.print(f"[info]JSON report saved:[/info] {out}") @@ -570,7 +616,9 @@ def process_sequential(with_progress: bool) -> None: "FUNCTION CLONES\n" + to_text(func_groups) + "\nBLOCK CLONES\n" - + to_text(block_groups), + + to_text(block_groups) + + "\nSEGMENT CLONES\n" + + to_text(segment_groups), "utf-8", ) console.print(f"[info]Text report saved:[/info] {out}") diff --git a/codeclone/extractor.py b/codeclone/extractor.py index 02f9730..8ffe0e3 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -15,7 +15,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from .blocks import BlockUnit, extract_blocks +from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments from .cfg import CFGBuilder from .errors import ParseError from .fingerprint import bucket_loc, sha1 @@ -189,7 +189,7 @@ def extract_units_from_source( cfg: NormalizationConfig, min_loc: int, min_stmt: int, -) -> tuple[list[Unit], list[BlockUnit]]: +) -> tuple[list[Unit], list[BlockUnit], list[SegmentUnit]]: try: tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) except SyntaxError as e: @@ -200,6 +200,7 @@ def extract_units_from_source( units: list[Unit] = [] block_units: list[BlockUnit] = [] + segment_units: list[SegmentUnit] = [] for local_name, node in qb.units: start = getattr(node, "lineno", None) @@ -243,4 +244,16 @@ def extract_units_from_source( ) block_units.extend(blocks) - return units, block_units + # Segment-level units (windows within functions, for internal clones) + if loc >= 30 and stmt_count >= 12: + segments = extract_segments( + node, + filepath=filepath, + qualname=qualname, + cfg=cfg, + window_size=6, + max_segments=60, + ) + segment_units.extend(segments) + + return units, block_units, segment_units diff --git a/codeclone/html_report.py b/codeclone/html_report.py index eb23f87..0d8c9c1 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -239,6 +239,7 @@ def build_html_report( *, func_groups: dict[str, list[dict[str, Any]]], block_groups: dict[str, list[dict[str, Any]]], + segment_groups: dict[str, list[dict[str, Any]]], title: str = "CodeClone Report", context_lines: int = 3, max_snippet_lines: int = 220, @@ -247,8 +248,11 @@ def build_html_report( func_sorted = sorted(func_groups.items(), key=lambda kv: _group_sort_key(kv[1])) block_sorted = sorted(block_groups.items(), key=lambda kv: _group_sort_key(kv[1])) + segment_sorted = sorted( + segment_groups.items(), key=lambda kv: _group_sort_key(kv[1]) + ) - has_any = bool(func_sorted) or bool(block_sorted) + has_any = bool(func_sorted) or bool(block_sorted) or bool(segment_sorted) # Pygments CSS (scoped). Use modern GitHub-like styles when available. # We scope per theme to support toggle without reloading. @@ -467,7 +471,8 @@ def render_section(
{ICON_CHECK}

No code clones detected

- No structural or block-level duplication was found above configured thresholds. + No structural, block-level, or segment-level duplication was found above + configured thresholds.

This usually indicates healthy abstraction boundaries.

@@ -478,6 +483,9 @@ def render_section( "functions", "Function clones", func_sorted, "pill-func" ) block_section = render_section("blocks", "Block clones", block_sorted, "pill-block") + segment_section = render_section( + "segments", "Segment clones", segment_sorted, "pill-segment" + ) return REPORT_TEMPLATE.substitute( title=_escape(title), @@ -487,6 +495,7 @@ def render_section( empty_state_html=empty_state_html, func_section=func_section, block_section=block_section, + segment_section=segment_section, icon_theme=ICON_THEME, font_css_url=FONT_CSS_URL, ) diff --git a/codeclone/normalize.py b/codeclone/normalize.py index d78f0e0..8dbcd99 100644 --- a/codeclone/normalize.py +++ b/codeclone/normalize.py @@ -109,6 +109,78 @@ def visit_AugAssign(self, node: ast.AugAssign) -> AST: ) return self.generic_visit(new_node) + def visit_UnaryOp(self, node: ast.UnaryOp) -> ast.AST: + new_node = self.generic_visit(node) + assert isinstance(new_node, ast.UnaryOp) + + if isinstance(new_node.op, ast.Not): + operand = new_node.operand + if ( + isinstance(operand, ast.Compare) + and len(operand.ops) == 1 + and len(operand.comparators) == 1 + ): + op = operand.ops[0] + if isinstance(op, ast.In): + cmp = ast.Compare( + left=operand.left, + ops=[ast.NotIn()], + comparators=operand.comparators, + ) + return ast.copy_location(cmp, new_node) + if isinstance(op, ast.Is): + cmp = ast.Compare( + left=operand.left, + ops=[ast.IsNot()], + comparators=operand.comparators, + ) + return ast.copy_location(cmp, new_node) + return new_node + + def visit_BinOp(self, node: ast.BinOp) -> ast.AST: + new_node = self.generic_visit(node) + assert isinstance(new_node, ast.BinOp) + + if not isinstance( + new_node.op, (ast.Add, ast.Mult, ast.BitOr, ast.BitAnd, ast.BitXor) + ): + return new_node + + if not ( + _is_safe_commutative_operand(new_node.left) + and _is_safe_commutative_operand(new_node.right) + ): + return new_node + + left_key = _expr_sort_key(new_node.left) + right_key = _expr_sort_key(new_node.right) + if right_key < left_key: + new_node.left, new_node.right = new_node.right, new_node.left + return new_node + + +def _expr_sort_key(node: ast.AST) -> str: + return ast.dump(node, annotate_fields=True, include_attributes=False) + + +def _is_safe_commutative_operand(node: ast.AST) -> bool: + disallowed = ( + ast.Call, + ast.Attribute, + ast.Subscript, + ast.Await, + ast.Yield, + ast.YieldFrom, + ast.Lambda, + ast.NamedExpr, + ast.ListComp, + ast.SetComp, + ast.DictComp, + ast.GeneratorExp, + ) + + return all(not isinstance(child, disallowed) for child in ast.walk(node)) + def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str: """ diff --git a/codeclone/report.py b/codeclone/report.py index 04a83bd..8189dbd 100644 --- a/codeclone/report.py +++ b/codeclone/report.py @@ -37,6 +37,37 @@ def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> Group return filtered +def build_segment_groups( + segments: list[GroupItem], min_occurrences: int = 2 +) -> GroupMap: + sig_groups: GroupMap = {} + for s in segments: + sig_groups.setdefault(s["segment_sig"], []).append(s) + + confirmed: GroupMap = {} + for items in sig_groups.values(): + if len(items) < min_occurrences: + continue + + hash_groups: GroupMap = {} + for item in items: + hash_groups.setdefault(item["segment_hash"], []).append(item) + + for segment_hash, hash_items in hash_groups.items(): + if len(hash_items) < min_occurrences: + continue + + by_func: GroupMap = {} + for it in hash_items: + by_func.setdefault(it["qualname"], []).append(it) + + for qualname, q_items in by_func.items(): + if len(q_items) >= min_occurrences: + confirmed[f"{segment_hash}|{qualname}"] = q_items + + return confirmed + + def to_json(groups: GroupMap) -> str: return json.dumps( { @@ -53,9 +84,11 @@ def to_json(groups: GroupMap) -> str: ) -def to_json_report(func_groups: GroupMap, block_groups: GroupMap) -> str: +def to_json_report( + func_groups: GroupMap, block_groups: GroupMap, segment_groups: GroupMap +) -> str: return json.dumps( - {"functions": func_groups, "blocks": block_groups}, + {"functions": func_groups, "blocks": block_groups, "segments": segment_groups}, ensure_ascii=False, indent=2, ) diff --git a/codeclone/templates.py b/codeclone/templates.py index de6993d..4d1227d 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -458,6 +458,13 @@ opacity: 0.9; } +.pill-segment { + color: var(--warning); + background: var(--warning-subtle); + border: 1px solid var(--warning); + opacity: 0.9; +} + /* Groups */ .group { margin-bottom: 16px; @@ -1109,6 +1116,7 @@ ${func_section} ${block_section} +${segment_section}