diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..614346f --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,316 @@ +# AGENTS.md — CodeClone (AI Agent Playbook) + +This document is the **source of truth** for how AI agents should work in this repository. +It is optimized for **determinism**, **CI stability**, and **reproducible changes**. + +> Repository goal: maximize **honesty**, **reproducibility**, **determinism**, and **precision** for real‑world CI usage. + +--- + +## 1) Operating principles (non‑negotiable) + +1. **Do not break CI contracts.** + - Treat baseline, cache, and report formats as **public APIs**. + - Any contract change must be **versioned**, documented, and accompanied by tests. + +2. **Determinism > cleverness.** + - Outputs must be stable across runs given identical inputs (same repo, tool version, python tag). + +3. **Evidence-based explainability.** + - The core engine produces **facts/metrics**. + - HTML/UI **renders facts**, it must not invent interpretations. + +4. **Safety first.** + - Never delete or overwrite user files outside repo. + - Any write must be atomic where relevant (e.g., baseline `.tmp` + `os.replace`). + +--- + +## 2) Quick orientation + +CodeClone is an AST/CFG-informed clone detector for Python. It supports: +- **function clones** (strongest signal) +- **block clones** (sliding window of statements, may be noisy on boilerplate) +- **segment clones** (report-only unless explicitly gated) + +Key artifacts: +- `codeclone.baseline.json` — trusted baseline snapshot (for CI comparisons) +- `.cache/codeclone/cache.json` — analysis cache (integrity-checked) +- `.cache/codeclone/report.html|report.json|report.txt` — reports + +--- + +## 3) One command to validate your change + +Run these locally before proposing changes: + +```bash +uv run ruff check . +uv run mypy . +uv run pytest -q +``` + +If you touched baseline/cache/report contracts, also run the repo’s audit runner (or the scenario script if present). + +--- + +## 4) Baseline contract (v1, stable) + +### Baseline file structure (canonical) + +```json +{ + "meta": { + "generator": { "name": "codeclone", "version": "X.Y.Z" }, + "schema_version": "1.0", + "fingerprint_version": "1", + "python_tag": "cp313", + "created_at": "2026-02-08T14:20:15Z", + "payload_sha256": "…" + }, + "clones": { + "functions": [], + "blocks": [] + } +} +``` + +### Rules + +- `schema_version` is **baseline schema**, not package version. +- Compatibility is tied to: + - `fingerprint_version` + - `python_tag` + - `generator.name == "codeclone"` +- `payload_sha256` is computed from a **canonical payload**: + - stable key order + - clone id lists are **sorted and unique** + - integrity check uses constant‑time compare (e.g., `hmac.compare_digest`) + +### Trust model + +- A baseline is either **trusted** (`baseline_status = ok`) or **untrusted**. +- **Normal mode**: + - warn + - ignore untrusted baseline + - compare vs empty baseline +- **CI gating mode** (`--ci` / `--fail-on-new`): + - fail‑fast if baseline untrusted + - exit code **2** for untrusted baseline + +### Legacy behavior + +- Legacy baselines (<= 1.3.x layout) must be treated as **untrusted** with explicit messaging and tests. + +--- + +## 5) Cache contract (integrity + size guards) + +- Cache is an **optimization**, never a source of truth. +- If cache is invalid or too large: + - warn + - proceed without cache + - ensure report meta reflects `cache_used=false` + +Never “fix” cache by silently mutating it; prefer regenerate. 
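To make the integrity and atomicity rules above concrete, here is a minimal sketch (illustrative only — the helper names `canonical_payload_sha256`, `payload_hash_matches`, and `atomic_write_json` are assumptions, and the exact canonical encoding is owned by the core, not this document):

```python
import hashlib
import hmac
import json
import os
from pathlib import Path


def canonical_payload_sha256(
    functions: list[str],
    blocks: list[str],
    fingerprint_version: str,
    python_tag: str,
) -> str:
    # Canonical payload: sorted, de-duplicated clone id lists and a stable key order.
    payload = {
        "blocks": sorted(set(blocks)),
        "fingerprint_version": fingerprint_version,
        "functions": sorted(set(functions)),
        "python_tag": python_tag,
    }
    encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()


def payload_hash_matches(expected: str, actual: str) -> bool:
    # Constant-time compare, as the baseline contract requires.
    return hmac.compare_digest(expected, actual)


def atomic_write_json(path: Path, document: dict[str, object]) -> None:
    # Atomic write: sibling .tmp file, then os.replace (same filesystem).
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(json.dumps(document, sort_keys=True, indent=2), encoding="utf-8")
    os.replace(tmp, path)
```

The point of the sketch: integrity is a property of a canonical byte encoding, and atomicity comes from writing a sibling temp file and renaming it — never from in-place mutation.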
+ +--- + +## 6) Reports and explainability + +Reports come in: +- HTML (`--html`) +- JSON (`--json`) +- Text (`--text`) + +### Report invariants + +- Ordering must be deterministic (stable sort keys). +- All provenance fields must be consistent across formats: + - baseline loaded / status + - baseline fingerprint + schema versions + - baseline generator version + - cache path / cache used + +### Explainability contract (core owns facts) + +For each clone group (especially block clones), the **core** should be able to provide factual fields such as: + +- `match_rule` +- `signature_kind` +- `window_size` (block size) / `segment_size` +- `merged_regions` flag and counts +- `stmt_type_sequence` (normalized) +- `stmt_type_histogram` +- `has_control_flow` (if/for/while/try/match) +- ratios (assert / assign / call) +- `max_consecutive_` (e.g., consecutive asserts) + +UI can show **hints** only when the predicate is **formal & exact** (100% confidence), e.g.: +- `assert_only_block` (assert_ratio == 1.0 and consecutive_asserts == block_len) +- `repeated_stmt_hash` (single stmt hash repeated across window) + +No UI-only heuristics that affect gating. + +--- + +## 7) Noise policy (what is and isn’t a “fix”) + +### Acceptable fixes +- Merge/report-layer improvements (e.g., merge sliding windows into maximal regions) **without changing gating**. +- Better evidence surfaced in HTML to explain matches. + +### Not acceptable as a “quick fix” +- Weakening detection rules to hide noisy test patterns, unless: + - it is configurable + - default remains honest + - the change is justified by real-world repos + - it includes tests for false-negative risk + +### Preferred remediation for test-only FPs +- Refactor tests to avoid long repetitive statement sequences: + - replace chains of `assert "... in html"` with loops or aggregated checks. + +--- + +## 8) How to propose changes (agent workflow) + +When you implement something: + +1. **State the intent** (what user-visible issue does it solve?) +2. **List files touched** and why. +3. **Call out contracts affected**: + - baseline / cache / report schema + - CLI exit codes / messages +4. **Add/adjust tests** for: + - normal-mode behavior + - CI gating behavior + - determinism (identical output on rerun) + - legacy/untrusted scenarios where applicable +5. Run: + - `ruff`, `mypy`, `pytest` + +Avoid changing unrelated files (locks, roadmap) unless required. + +--- + +## 9) CLI behavior and exit codes + +Agents must preserve these semantics: + +- **0** — success (including “new clones detected” in non-gating mode) +- **2** — baseline gating failure (untrusted/missing baseline when CI requires trusted baseline; invalid output extension, etc.) +- **3** — analysis gating failure (e.g., `--fail-threshold` exceeded or new clones in `--ci` as designed) + +If you introduce a new exit reason, document it and add tests. + +--- + +## 10) Release hygiene (for agent-assisted releases) + +Before cutting a release: + +- Confirm baseline schema compatibility is unchanged, or properly versioned. +- Ensure changelog has: + - user-facing changes + - migration notes if any +- Validate `twine check dist/*` for built artifacts. +- Smoke test install in a clean venv: + - `pip install dist/*.whl` + - `codeclone --version` + - `codeclone . --ci` in a sample repo with baseline. + +--- + +## 11) “Don’t do this” list + +- Don’t add hidden behavior differences between report formats. +- Don’t make baseline compatibility depend on package patch/minor version. 
+- Don’t add project-root hashes or unstable machine-local fields to baseline.
+- Don’t embed suppressions into baseline unless explicitly designed as a versioned contract.
+- Don’t introduce nondeterministic ordering (dict iteration, set ordering, filesystem traversal without sort).
+
+---
+
+## 12) Where to put new code
+
+Prefer these rules:
+
+- **Domain / contracts / enums** live near the domain owner (baseline statuses in baseline domain).
+- **Core logic** should not depend on HTML.
+- **Render** depends on report model, never the other way around.
+- If a module becomes a “god module”, split by:
+  - model (types)
+  - io/serialization
+  - rules/validation
+  - ui rendering
+
+Avoid deep package hierarchies unless they clearly reduce coupling.
+
+---
+
+## 13) Python language + typing rules (3.10 → 3.14)
+
+These rules are **repo policy**. If you need to violate one, you must explain why in the PR.
+
+### Supported Python versions
+- **Must run on Python 3.10, 3.11, 3.12, 3.13, 3.14**.
+- Do not rely on behavior that is new to only the latest version unless you provide a fallback.
+- Prefer **standard library** features that exist in 3.10+.
+
+### Modern syntax (allowed / preferred)
+Use modern syntax when it stays compatible with 3.10+:
+- `X | Y` unions, `list[str]` / `dict[str, int]` generics (PEP 604 / PEP 585)
+- `from __future__ import annotations` is allowed, but keep behavior consistent across 3.10–3.14.
+- `match/case` (PEP 634) is allowed, but only if it keeps determinism/readability.
+- Avoid `typing.Self` (3.11+) in public APIs unless you gate it with `typing_extensions`.
+- Prefer `pathlib.Path` over `os.path` for new code (but keep hot paths pragmatic).
+
+### Typing standards
+- **Type hints are required** for all public functions, core pipeline surfaces, and any code that touches:
+  baseline, cache, fingerprints, report models, serialization, CLI exit behavior.
+- Keep **`Any` to an absolute minimum**:
+  - `Any` is allowed only at IO boundaries (JSON parsing, `argparse`, `subprocess`) and must be
+    *narrowed immediately* into typed structures (dataclasses / TypedDict / Protocol / enums).
+  - If `Any` appears in “core/domain” code, add a comment: `# Any: ` and a TODO to remove.
+- Prefer **`Literal` / enums** for finite sets (e.g., status codes, kinds).
+- Prefer **`dataclasses`** (frozen where reasonable) for data models; keep models JSON-serializable.
+- Use `collections.abc` types (`Iterable`, `Sequence`, `Mapping`) for inputs where appropriate.
+- Avoid `cast()` unless you also add an invariant check nearby.
+
+### Dataclasses / models
+- Models that cross module boundaries should be:
+  - explicitly typed
+  - immutable when possible (`frozen=True`)
+  - validated at construction (or via a dedicated `validate_*` function) if they are user-provided.
+
+### Error handling
+- Prefer explicit, typed error types over stringly-typed errors.
+- Exit codes are part of the public contract; do not change them without updating tests + docs.
+
+### Determinism requirements (language-level)
+- Never iterate over unordered containers (`set`, `dict`) without sorting first when it affects:
+  hashes, IDs, report ordering, baseline payloads, or UI output.
+- Use stable formatting (sorted keys, stable ordering) in JSON output.
+
+### Key PEPs to keep in mind
+- PEP 8, PEP 484 (typing), PEP 526 (variable annotations)
+- PEP 563 / PEP 649 (annotation evaluation changes across versions) — avoid relying on evaluation timing
+- PEP 585 (built-in generics), PEP 604 (X | Y unions)
+- PEP 634 (structural pattern matching)
+- PEP 612 (ParamSpec) / PEP 646 (TypeVarTuple) — use them only if they clearly help; don’t overcomplicate.
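A minimal sketch of the `Any`-narrowing rule above (illustrative; `GeneratorMeta` and `parse_generator` are hypothetical names, not repo APIs):

```python
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class GeneratorMeta:
    name: str
    version: str


def parse_generator(raw: Any) -> GeneratorMeta:
    # Any: raw JSON from an IO boundary; narrow immediately into a typed model.
    if not isinstance(raw, dict):
        raise ValueError("generator must be a JSON object")
    name = raw.get("name")
    version = raw.get("version")
    if not isinstance(name, str) or not isinstance(version, str):
        raise ValueError("generator.name and generator.version must be strings")
    return GeneratorMeta(name=name, version=version)


meta = parse_generator(json.loads('{"name": "codeclone", "version": "1.4.0"}'))
assert meta.name == "codeclone"
```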
+---
+
+## 14) Minimal checklist for PRs (agents)
+
+- [ ] Change is deterministic.
+- [ ] Contracts preserved or versioned.
+- [ ] Tests added for new behavior.
+- [ ] `ruff`, `mypy`, `pytest` green.
+- [ ] CLI messages remain helpful and stable (don’t break scripts).
+- [ ] Reports contain provenance fields and reflect trust model correctly.
+
+---
+
+If you are an AI agent and something here conflicts with an instruction from a maintainer in the PR/issue thread, **ask for clarification in the thread** and default to this document until resolved.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48213a3..efab51c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,159 @@
 # Changelog
 
+## [1.4.0] - 2026-02-12
+
+### Overview
+
+This release stabilizes the baseline contract for long-term CI reuse without changing clone-detection semantics. Key
+improvements include baseline schema standardization, enhanced cache efficiency, and hardened IO/contract behavior for
+CI environments.
+
+---
+
+### Baseline Schema & Compatibility
+
+**Stable v1 Schema**
+
+- Baseline now uses stable v1 schema with strict top-level `meta` + `clones` objects
+- Compatibility gated by `schema_version`, `fingerprint_version`, and `python_tag` (independent of package patch/minor
+  version)
+- Trust validation requires `meta.generator.name` to be `codeclone`
+- Legacy 1.3 baseline layouts treated as untrusted with explicit regeneration guidance
+
+**Integrity & Hash Calculation**
+
+- Baseline integrity uses canonical `payload_sha256` over semantic payload (`functions`, `blocks`,
+  `fingerprint_version`, `python_tag`)
+- Intentionally excluded from `payload_sha256`:
+  - `schema_version` (compatibility gate only)
+  - `meta.generator.name` (trust gate only)
+  - `meta.generator.version` and `meta.created_at` (informational only)
+- Hash inputs remain stable across future 1.x patch/minor releases
+- Baseline regeneration required only when `fingerprint_version` or `python_tag` changes
+
+**Migration Notes**
+
+- Early 1.4.0 development snapshots (before integrity canonicalization fix) may require one-time
+  `codeclone . 
--update-baseline` +- After this one-time update, baselines are stable for long-term CI use + +--- + +### File System & Storage + +**Atomic Operations** + +- Baseline writes use atomic `*.tmp` + `os.replace` pattern (same filesystem requirement) +- Configurable size guards: + - `--max-baseline-size-mb` + - `--max-cache-size-mb` + +**Baseline Trust Model** + +- **Normal mode**: Untrusted baseline triggers warning and comparison against empty baseline +- **CI preset** (`--ci`): Untrusted baseline causes fast-fail with exit code `2` +- Deterministic behavior ensures predictable CI outcomes + +--- + +### CLI & Exit Codes + +**Exit Code Contract** (explicit and stable) + +- `0` - Success +- `2` - Contract error (unreadable files, untrusted baseline, integrity failures) +- `3` - Gating failure (new clones, threshold violations) +- `5` - Internal error + +**Exit Code Priority** + +- Contract errors (exit `2`) override gating failures (exit `3`) when both conditions present + +**CI/Gating Modes** + +- In CI/gating modes (`--ci`, `--fail-on-new`, `--fail-threshold`): + - Unreadable or decode-failed source files treated as contract errors (exit `2`) + - Prevents incomplete analysis from passing CI checks + +**Error Handling** + +- Standardized internal error UX: `INTERNAL ERROR` with reason and actionable next steps +- New `--debug` flag (also `CODECLONE_DEBUG=1`) includes traceback + runtime environment details +- CLI help now includes canonical exit-code descriptions plus `Repository` / `Issues` / `Docs` links + +--- + +### Reporting Enhancements + +**JSON Report (v1.1 Schema)** + +- Compact deterministic layout with top-level `meta` + `files` + `groups` +- Explicit `group_item_layout` for array-based group records +- New `groups_split` structure with `new`/`known` keys per section +- Deterministic `meta.groups_counts` aggregates +- Legacy alias sections removed (`function_clones`, `block_clones`, `segment_clones`) + +**TXT Report (aligned to report meta v1.1)** + +- Normalized metadata/order as stable contract +- Explicit section metrics: `loc` for functions, `size` for blocks/segments +- Sections split into `(NEW)` and `(KNOWN)` for functions/blocks/segments +- With untrusted baseline: `(KNOWN)` sections empty, all groups in `(NEW)` + +**HTML Report (aligned to report meta v1.1)** + +- New baseline split controls: `New duplicates` / `Known duplicates` +- Consistent filtering behavior across report types +- Block explainability now core-owned (`block_group_facts`) +- Expanded `Report Provenance` section displays full meta information block + +**Cross-Format Metadata** + +- All formats (HTML/TXT/JSON) now include: + - `baseline_payload_sha256` and `baseline_payload_sha256_verified` for audit traceability + - Cache contract fields: `cache_schema_version`, `cache_status`, `cache_used` + - Baseline audit fields and trust status + +### Documentation + +- Added the contract documentation book `docs/book/`. 
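As a sketch of the exit-code priority described under “CLI & Exit Codes” above (hedged: only `ExitCode.CONTRACT_ERROR` is visible in this diff, imported from `codeclone/contracts.py`; the other member names here are assumptions):

```python
from enum import IntEnum


class ExitCode(IntEnum):
    OK = 0
    CONTRACT_ERROR = 2
    GATING_FAILURE = 3
    INTERNAL_ERROR = 5


def resolve_exit_code(contract_error: bool, gating_failure: bool) -> ExitCode:
    # Contract errors (2) take precedence over gating failures (3).
    if contract_error:
        return ExitCode.CONTRACT_ERROR
    if gating_failure:
        return ExitCode.GATING_FAILURE
    return ExitCode.OK


assert resolve_exit_code(True, True) is ExitCode.CONTRACT_ERROR
```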
+ +--- + +### Testing + +**Baseline Contract Testing** + +- Expanded matrix coverage: + - Legacy format handling + - Type/shape validation + - Compatibility mismatch scenarios + - Integrity failure cases + - Canonical hash determinism + +**Golden Snapshot Testing** + +- New detector golden snapshot fixture with canonical runtime policy +- Golden assertions run on `cp313` (consistency) +- Full invariant suite maintains matrix-wide coverage +- Golden tests use same core `python_tag` source as CLI/baseline checks (prevents cross-layer drift) + +--- + +### Roadmap Note + +Version 1.4.0 establishes a stable baseline/CI contract but revealed internal structure needs cleanup. Version 1.5 will +focus on architecture refactoring for maintainability and orchestration, with strict constraints: + +**No changes to:** + +- Detection semantics +- Fingerprint algorithms +- Baseline hash inputs +- Determinism guarantees + +The 1.4.0 contract remains stable and reliable for long-term CI integration. + ## [1.3.0] - 2026-02-08 ### Overview @@ -59,8 +213,10 @@ codeclone . --update-baseline ### Cache & Security - Cache default moved to `/.cache/codeclone/cache.json` with legacy path warning. -- Cache schema was extended to include segment data (`CACHE_VERSION=1.1`). +- Cache schema moved to compact signed payload format (`CACHE_VERSION=1.2`) with + relative file keys and fixed-array entries for faster IO and smaller files. - Cache integrity uses constant-time signature checks and deep schema validation. +- Legacy `.cache_secret` is now treated as obsolete and triggers an explicit cleanup warning. - Invalid/oversized cache is ignored deterministically and rebuilt from source. - Added security regressions for traversal safety, report escaping, baseline/cache integrity, and deterministic report ordering across formats. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 47ccfdf..54f7748 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,20 +6,20 @@ CodeClone is an **AST + CFG-based code clone detector** focused on architectural not textual similarity. Contributions are welcome — especially those that improve **signal quality**, **CFG semantics**, -and **real-world usability**. +and **real-world CI usability**. --- ## Project Philosophy -Before contributing, please understand the core principles of the project: +Core principles: - **Low noise over high recall** - **Structural and control-flow similarity**, not semantic equivalence - **Deterministic and explainable behavior** -- Optimized for **CI usage and architectural analysis** +- Optimized for **CI usage** and architectural analysis -If a change increases false positives or reduces explainability, +If a change increases false positives, reduces determinism, or weakens explainability, it is unlikely to be accepted. --- @@ -42,14 +42,16 @@ We especially welcome contributions in the following areas: Please use the appropriate **GitHub Issue Template**. -When reporting bugs related to clone detection, include: +When reporting issues related to clone detection, include: -- minimal reproducible code snippets; -- the Python version used; +- minimal reproducible code snippets (preferred over screenshots); +- the CodeClone version; +- the Python version (`python_tag`, e.g. `cp313`); - whether the issue is primarily: - - AST-related, - - CFG-related, - - reporting / UI-related. + - AST-related, + - CFG-related, + - normalization-related, + - reporting / UI-related. Screenshots alone are usually insufficient for analysis. 
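For clone-detection issues, a useful minimal snippet looks like the pair below: identical control flow with different identifiers (hedged illustration — whether this is actually reported depends on the configured thresholds; the defaults are `--min-loc 15` and `--min-stmt 6`, so real repro snippets should usually be larger):

```python
# Two functions with the same control-flow structure; only names and
# attributes differ. Normalized AST/CFG matching targets exactly this shape.

def collect_active_users(users):
    selected = []
    for user in users:
        if user.active:
            selected.append(user.id)
    return selected


def collect_open_orders(orders):
    picked = []
    for order in orders:
        if order.open:
            picked.append(order.id)
    return picked
```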
@@ -73,12 +75,13 @@ Well-argued false-positive reports are valuable and appreciated. CFG behavior in CodeClone is intentionally conservative in the 1.x series. -If proposing changes to CFG semantics, please include: +If proposing changes to CFG semantics, include: - a description of the current behavior; - the proposed new behavior; -- the expected impact on clone detection quality; -- concrete code examples. +- the expected impact on clone detection quality (noise/recall); +- concrete code examples; +- a note on determinism implications. Such changes often require design-level discussion and may be staged across versions. @@ -87,19 +90,44 @@ Such changes often require design-level discussion and may be staged across vers ## Security & Safety Expectations - Assume **untrusted input** (paths and source code). -- Add **negative tests** for any normalization or CFG change. -- Changes must preserve determinism and avoid new false positives. +- Prefer **fail-closed in gating modes** and **fail-open in normal modes** only when explicitly intended. +- Add **negative tests** for any normalization/CFG change. +- Changes must preserve determinism and avoid introducing new false positives. --- ## Baseline & CI -- Baselines are **versioned**. Regenerate with `codeclone . --update-baseline` - when detection logic or CodeClone version changes. -- Baselines in 1.3+ are tamper-evident (`generator`, `payload_sha256`). -- Baseline verification must use the same Python `major.minor` version. -- In `--fail-on-new` / `--ci`, untrusted baseline states fail fast. Outside gating - mode, baseline is ignored with warning and comparison proceeds against an empty baseline. +### Baseline contract (v1) + +- The baseline schema is versioned (`meta.schema_version`). +- Compatibility/trust gates include `schema_version`, `fingerprint_version`, `python_tag`, + and `meta.generator.name`. +- Integrity is tamper-evident via `meta.payload_sha256` over canonical payload: + `clones.functions`, `clones.blocks`, `meta.fingerprint_version`, `meta.python_tag`. + `meta.schema_version`, `meta.generator.name`, `meta.generator.version`, and `created_at` + are excluded from payload hashing. + +### When baseline regeneration is required + +- Regenerate baseline with `codeclone . --update-baseline` when + `fingerprint_version` **or** `python_tag` changes. +- Regeneration is **not** required for UI/report/CLI/cache/performance-only changes + if both `fingerprint_version` and `python_tag` are unchanged. + +### Gating behavior + +- In `--ci` (or explicit gating flags), **untrusted baseline states fail fast** as a contract error (exit 2). +- Outside gating mode, an untrusted/missing baseline is ignored with a warning and comparison proceeds + against an empty baseline. + +### Exit codes contract + +- **0** — success +- **2** — contract error (e.g., missing/untrusted baseline in gating, invalid output path/extension, incompatible + versions) +- **3** — gating failure (new clones detected, `--fail-threshold` exceeded) +- **5** — internal error (unexpected exception; please report) --- @@ -108,9 +136,7 @@ Such changes often require design-level discussion and may be staged across vers ```bash git clone https://github.com/orenlab/codeclone.git cd codeclone -python -m venv .venv -source .venv/bin/activate -pip install -e .[dev] +uv sync --all-extras --dev ``` Run tests: @@ -131,8 +157,9 @@ uv run ruff format . 
## Code Style -- Python 3.10+ +- Python **3.10–3.14** - Type annotations are required +- `Any` should be minimized; prefer precise types and small typed helpers - `mypy` must pass - `ruff check` must pass - Code must be formatted with `ruff format` @@ -145,11 +172,11 @@ uv run ruff format . CodeClone follows **semantic versioning**: - **MAJOR**: fundamental detection model changes -- **MINOR**: new detection capabilities (for example, CFG improvements) +- **MINOR**: new detection capabilities (e.g., new detectors or major CFG/normalization behavior shifts) - **PATCH**: bug fixes, performance improvements, and UI/UX polish -Baselines are versioned. Any change to detection behavior must include documentation -and tests, and may require baseline regeneration. +Any change that affects detection behavior must include documentation and tests, +and may require a `fingerprint_version` bump (and thus baseline regeneration). --- diff --git a/README.md b/README.md index 7dea74c..96b7859 100644 --- a/README.md +++ b/README.md @@ -8,104 +8,47 @@ ![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square) [![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE) -**CodeClone** is a Python code clone detector based on **normalized Python AST and Control Flow Graphs (CFG)**. -It helps teams discover architectural duplication and prevent new copy-paste from entering the codebase via CI. +**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**. +It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI. -CodeClone is designed to help teams: +--- + +## Why CodeClone + +CodeClone focuses on **architectural duplication**, not text similarity. It detects structural patterns through: -- discover **structural and control-flow duplication**, -- identify architectural hotspots, -- prevent *new* duplication via CI and pre-commit hooks. +- **Normalized AST analysis** — robust to renaming, formatting, and minor refactors +- **Control Flow Graphs** — captures execution logic, not just syntax +- **Strict, explainable matching** — clear signals, not fuzzy heuristics -Unlike token- or text-based tools, CodeClone operates on **normalized Python AST and CFG**, making it robust against -renaming, formatting, and minor refactoring. +Unlike token-based tools, CodeClone compares **structure and control flow**, making it ideal for finding: + +- Repeated service/orchestration patterns +- Duplicated guard/validation blocks +- Copy-pasted handler logic across modules +- Recurring internal segments in large functions --- -## Why CodeClone? +## Core Capabilities -Most existing tools detect *textual* duplication. -CodeClone detects **structural and block-level duplication**, which usually signals missing abstractions or -architectural drift. +**Three Detection Levels:** -Typical use cases: +1. **Function clones (CFG fingerprint)** + Strong structural signal for cross-layer duplication -- duplicated service or orchestration logic across layers (API ↔ application), -- repeated validation or guard blocks, -- copy-pasted request / handler flows, -- duplicated control-flow logic in routers, handlers, or services. +2. **Block clones (statement windows)** + Detects repeated local logic patterns ---- +3. 
**Segment clones (report-only)** + Internal function repetition for explainability; not used for baseline gating + +**CI-Ready Features:** -## Features - -### Function-level clone detection (Type-2, CFG-based) - -- Detects functions and methods with identical **control-flow structure**. -- Based on **Control Flow Graph (CFG)** fingerprinting. -- Robust to: - - variable renaming, - - constant changes, - - attribute renaming, - - formatting differences, - - docstrings and type annotations. -- Ideal for spotting architectural duplication across layers. - -### Block-level clone detection (Type-3-lite) - -- Detects repeated **statement blocks** inside larger functions. -- Uses sliding windows over CFG-normalized statement sequences. -- Targets: - - validation blocks, - - guard clauses, - - repeated orchestration logic. -- Carefully filtered to reduce noise: - - no overlapping windows, - - no clones inside the same function, - - no `__init__` noise, - - size and statement-count thresholds. - -### Segment-level internal clone detection - -- Detects repeated **segment windows** inside the same function. -- Uses a two-step deterministic match (candidate signature → strict hash). -- Included in reports for explainability, **not** in baseline/CI failure logic. - -### Control-Flow Awareness (CFG v1) - -- Each function is converted into a **Control Flow Graph**. -- CFG nodes contain normalized AST statements. -- CFG edges represent structural control flow: - - `if` / `else` - - `for` / `async for` / `while` - - `try` / `except` / `finally` - - `with` / `async with` - - `match` / `case` (Python 3.10+) -- Current CFG semantics (v1): - - `and` / `or` are modeled as short-circuit micro-CFG branches, - - `try/except` links only from statements that may raise, - - `break` / `continue` are modeled as terminating loop transitions with explicit targets, - - `for/while ... else` semantics are preserved structurally, - - `match case` and `except` handler order is preserved structurally, - - after-blocks are explicit and always present, - - focus is on **structural similarity**, not precise runtime semantics. - -This design keeps clone detection **stable, deterministic, and low-noise**. - -### Low-noise by design - -- AST + CFG normalization instead of token matching. -- Conservative defaults tuned for real-world Python projects. -- Explicit thresholds for size and statement count. -- No probabilistic scoring or heuristic similarity thresholds. -- Safe commutative normalization and local logical equivalences only. -- Focus on *architectural duplication*, not micro-similarities. - -### CI-friendly baseline mode - -- Establish a baseline of existing clones. -- Fail CI **only when new clones are introduced**. -- Safe for legacy codebases and incremental refactoring. +- Deterministic output with stable ordering +- Reproducible artifacts for audit trails +- Baseline-driven gating to prevent new duplication +- Fast incremental analysis with intelligent caching --- @@ -115,160 +58,219 @@ This design keeps clone detection **stable, deterministic, and low-noise**. pip install codeclone ``` -Python 3.10+ is required. +**Requirements:** Python 3.10+ + +--- ## Quick Start -Run on a project: +### Basic Analysis ```bash +# Analyze current directory codeclone . -``` -This will: - -- scan Python files, -- build CFGs for functions, -- detect function-level and block-level clones, -- print a summary to stdout. +# Check version +codeclone --version +``` -Generate reports: +### Generate Reports ```bash codeclone . 
\ + --html .cache/codeclone/report.html \ --json .cache/codeclone/report.json \ --text .cache/codeclone/report.txt ``` -Generate an HTML report: +### CI Integration ```bash -codeclone . --html .cache/codeclone/report.html -``` - -Check version: +# 1. Generate baseline once (commit to repo) +codeclone . --update-baseline -```bash -codeclone --version +# 2. Add to CI pipeline +codeclone . --ci ``` ---- - -## Reports and Metadata - -All report formats include provenance metadata for auditability: - -`codeclone_version`, `python_version`, `baseline_path`, `baseline_version`, -`baseline_schema_version`, `baseline_python_version`, `baseline_loaded`, -`baseline_status` (and cache metadata when available). - -baseline_status values: - -- `ok` -- `missing` -- `legacy` -- `invalid` -- `mismatch_version` -- `mismatch_schema` -- `mismatch_python` -- `generator_mismatch` -- `integrity_missing` -- `integrity_failed` -- `too_large` +The `--ci` preset is equivalent to `--fail-on-new --no-color --quiet`. --- -## Baseline Workflow (Recommended) +## Baseline Workflow -1. Create a baseline +Baselines capture the **current state of duplication** in your codebase. Once committed, they serve as the reference +point for CI checks. -Run once on your current codebase: +**Key points (contract-level):** -```bash -codeclone . --update-baseline -``` +- Baseline file is versioned (`codeclone.baseline.json`) and used to classify clones as **NEW** vs **KNOWN**. +- Compatibility is gated by `schema_version`, `fingerprint_version`, and `python_tag`. +- Baseline trust is gated by `meta.generator.name` (`codeclone`) and integrity (`payload_sha256`). +- In CI preset (`--ci`), an untrusted baseline is a contract error (exit `2`). -Commit the generated baseline file to the repository. +Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md) -Baselines are versioned. If CodeClone is upgraded, regenerate the baseline to keep -CI deterministic and explainable. - -Baseline format in 1.3+ is tamper-evident (generator, payload_sha256) and validated -before baseline comparison. +--- -2. Trusted vs untrusted baseline behavior +## Exit Codes -Baseline states considered untrusted: +CodeClone uses a deterministic exit code contract: -- `invalid` -- `too_large` -- `generator_mismatch` -- `integrity_missing` -- `integrity_failed` +| Code | Meaning | +|------|-----------------------------------------------------------------------------| +| `0` | Success — run completed without gating failures | +| `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating | +| `3` | Gating failure — new clones detected or threshold exceeded | +| `5` | Internal error — unexpected exception | -Behavior: +**Priority:** Contract errors (`2`) override gating failures (`3`) when both occur. -- in normal mode, untrusted baseline is ignored with a warning (comparison falls back to empty baseline); -- in `--fail-on-new` / `--ci`, untrusted baseline fails fast (exit code 2). +Full contract details: [`docs/book/03-contracts-exit-codes.md`](docs/book/03-contracts-exit-codes.md) -3. Use in CI +**Debug Support:** ```bash -codeclone . --ci -``` - -or: +# Show detailed error information +codeclone . --debug -```bash -codeclone . --ci --html .cache/codeclone/report.html +# Or via environment variable +CODECLONE_DEBUG=1 codeclone . ``` -`--ci` is equivalent to `--fail-on-new --no-color --quiet`. 
- -Behavior: - -- existing clones are allowed, -- the build fails if new clones appear, -- refactoring that removes duplication is always allowed. - -`--fail-on-new` / `--ci` exits with a non-zero code when new clones are detected. - --- -### Cache - -By default, CodeClone stores the cache per project at: - -```bash -/.cache/codeclone/cache.json +## Reports + +### Supported Formats + +- **HTML** (`--html`) — Interactive web report with filtering +- **JSON** (`--json`) — Machine-readable structured data +- **Text** (`--text`) — Plain text summary + +### Report Schema (JSON v1.1) + +The JSON report uses a compact deterministic layout: + +- Top-level: `meta`, `files`, `groups`, `groups_split`, `group_item_layout` +- Optional top-level: `facts` +- `groups_split` provides explicit **NEW / KNOWN** separation per section +- `meta.groups_counts` provides deterministic per-section aggregates +- `meta` follows a shared canonical contract across HTML/JSON/TXT + +Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md) + +**Minimal shape (v1.1):** + +```json +{ + "meta": { + "report_schema_version": "1.1", + "codeclone_version": "1.4.0", + "python_version": "3.13", + "python_tag": "cp313", + "baseline_path": "/path/to/codeclone.baseline.json", + "baseline_fingerprint_version": "1", + "baseline_schema_version": "1.0", + "baseline_python_tag": "cp313", + "baseline_generator_name": "codeclone", + "baseline_generator_version": "1.4.0", + "baseline_payload_sha256": "", + "baseline_payload_sha256_verified": true, + "baseline_loaded": true, + "baseline_status": "ok", + "cache_path": "/path/to/.cache/codeclone/cache.json", + "cache_used": true, + "cache_status": "ok", + "cache_schema_version": "1.2", + "files_skipped_source_io": 0, + "groups_counts": { + "functions": { + "total": 0, + "new": 0, + "known": 0 + }, + "blocks": { + "total": 0, + "new": 0, + "known": 0 + }, + "segments": { + "total": 0, + "new": 0, + "known": 0 + } + } + }, + "files": [], + "groups": { + "functions": {}, + "blocks": {}, + "segments": {} + }, + "groups_split": { + "functions": { + "new": [], + "known": [] + }, + "blocks": { + "new": [], + "known": [] + }, + "segments": { + "new": [], + "known": [] + } + }, + "group_item_layout": { + "functions": [ + "file_i", + "qualname", + "start", + "end", + "loc", + "stmt_count", + "fingerprint", + "loc_bucket" + ], + "blocks": [ + "file_i", + "qualname", + "start", + "end", + "size" + ], + "segments": [ + "file_i", + "qualname", + "start", + "end", + "size", + "segment_hash", + "segment_sig" + ] + }, + "facts": { + "blocks": {} + } +} ``` -You can override this path with `--cache-path` (`--cache-dir` is a legacy alias). - -If you used an older version of CodeClone, delete the legacy cache file at -`~/.cache/codeclone/cache.json` and add `.cache/` to `.gitignore`. - -Cache integrity checks are strict: signature mismatch or oversized cache files are ignored -with an explicit warning, then rebuilt from source. - -Cache entries are validated against expected structure/types; invalid entries are ignored -deterministically. - --- -## Python Version Consistency for Baseline Checks +## Cache -Due to inherent differences in Python’s AST between interpreter versions, baseline -generation and verification must be performed using the same Python version. +Cache is an optimization layer only and is never a source of truth. -This ensures deterministic and reproducible clone detection results. 
+- Default path: `<project>/.cache/codeclone/cache.json`
+- Schema version: **v1.2**
+- Invalid or oversized cache is ignored with warning and rebuilt (fail-open)
-CI checks therefore pin baseline verification to a single Python version, while the
+Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md)
-test matrix continues to validate compatibility across Python 3.10–3.14.
 
 ---
 
-## Using with pre-commit
+## Pre-commit Integration
 
 ```yaml
 repos:
@@ -289,73 +291,57 @@ repos:
 
 ### CodeClone Is
 
-- an architectural analysis tool,
-- a duplication radar,
-- a CI guard against copy-paste,
-- a control-flow-aware clone detector.
+- A structural clone detector for Python
+- A CI guard against new duplication
+- A deterministic analysis tool with auditable outputs
 
 ### CodeClone Is Not
 
-- a linter,
-- a formatter,
-- a semantic equivalence prover,
-- a runtime analyzer.
-
-## How It Works (High Level)
-
-1. Parse Python source into AST.
-2. Normalize AST (names, constants, attributes, annotations).
-3. Build a Control Flow Graph (CFG) per function.
-4. Compute stable CFG fingerprints.
-5. Extract segment windows for internal clone discovery.
-6. Detect function-level, block-level, and segment-level clones.
-7. Apply conservative filters to suppress noise.
-
-See the architectural overview:
-
-- [docs/architecture.md](docs/architecture.md)
+- A linter or code formatter
+- A semantic equivalence prover
+- A runtime execution analyzer
 
 ---
 
-## Control Flow Graph (CFG)
+## How It Works
 
-Starting from version 1.1.0, CodeClone uses a Control Flow Graph (CFG)
-to improve structural clone detection robustness.
+**High-level Pipeline:**
 
-The CFG is a structural abstraction, not a runtime execution model.
+1. **Parse** — Python source → AST
+2. **Normalize** — AST → canonical structure
+3. **CFG Construction** — per-function control flow graph
+4. **Fingerprinting** — stable hash computation
+5. **Grouping** — function/block/segment clone groups
+6. **Determinism** — stable ordering for reproducibility
+7. **Baseline Comparison** — new vs known clones (when requested)
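As a hedged illustration of the first pipeline steps (standard-library `ast` only — this is not CodeClone's normalization or fingerprint, just the kind of deterministic structural signal the pipeline is built on):

```python
import ast


def stmt_type_sequence(source: str, func_name: str) -> tuple[str, ...]:
    # Parse to an AST, then reduce one function to its ordered
    # statement-type sequence; identical logic with renamed variables
    # yields the same sequence.
    tree = ast.parse(source)
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef) and node.name == func_name:
            return tuple(
                type(stmt).__name__
                for stmt in ast.walk(node)
                if isinstance(stmt, ast.stmt) and stmt is not node
            )
    raise LookupError(func_name)


src = "def f(x):\n    if x:\n        return 1\n    return 0\n"
print(stmt_type_sequence(src, "f"))  # ('If', 'Return', 'Return')
```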
-See full design and semantics:
+Learn more:
 
-- [docs/cfg.md](docs/cfg.md)
+- Architecture: [`docs/architecture.md`](docs/architecture.md)
+- CFG semantics: [`docs/cfg.md`](docs/cfg.md)
 
 ---
 
-## CLI Options
-
-| Option | Description | Default |
-|-------------------------------|----------------------------------------------------------------------|--------------------------------------|
-| `root` | Project root directory to scan | `.` |
-| `--version` | Print CodeClone version and exit | - |
-| `--min-loc` | Minimum function LOC to analyze | `15` |
-| `--min-stmt` | Minimum AST statements to analyze | `6` |
-| `--processes` | Number of worker processes | `4` |
-| `--cache-path FILE` | Cache file path | `<project>/.cache/codeclone/cache.json` |
-| `--cache-dir FILE` | Legacy alias for `--cache-path` | - |
-| `--max-cache-size-mb MB` | Max cache size before ignore + warning | `50` |
-| `--baseline FILE` | Baseline file path | `codeclone.baseline.json` |
-| `--max-baseline-size-mb MB` | Max baseline size; untrusted baseline fails in CI, ignored otherwise | `5` |
-| `--update-baseline` | Regenerate baseline from current results | `False` |
-| `--fail-on-new` | Fail if new function/block clone groups appear vs baseline | `False` |
-| `--fail-threshold MAX_CLONES` | Fail if total clone groups (`function + block`) exceed threshold | `-1` (disabled) |
-| `--ci` | CI preset: `--fail-on-new --no-color --quiet` | `False` |
-| `--html FILE` | Write HTML report (`.html`) | - |
-| `--json FILE` | Write JSON report (`.json`) | - |
-| `--text FILE` | Write text report (`.txt`) | - |
-| `--no-progress` | Disable progress bar output | `False` |
-| `--no-color` | Disable ANSI colors | `False` |
-| `--quiet` | Minimize output (warnings/errors still shown) | `False` |
-| `--verbose` | Show hash details for new clone groups in fail output | `False` |
-
-## License
-
-MIT License
+## Documentation Map
+
+Use this map to pick the right level of detail:
+
+- **Contract book (canonical contracts/specs):** [`docs/book/`](docs/book/)
+  - Start here: [`docs/book/00-intro.md`](docs/book/00-intro.md)
+  - Exit codes and precedence: [`docs/book/03-contracts-exit-codes.md`](docs/book/03-contracts-exit-codes.md)
+  - Baseline contract (schema/trust/integrity): [`docs/book/06-baseline.md`](docs/book/06-baseline.md)
+  - Cache contract (schema/integrity/fail-open): [`docs/book/07-cache.md`](docs/book/07-cache.md)
+  - Report contract (schema v1.1 + NEW/KNOWN split): [`docs/book/08-report.md`](docs/book/08-report.md)
+  - CLI behavior: [`docs/book/09-cli.md`](docs/book/09-cli.md)
+  - HTML rendering: [`docs/book/10-html-render.md`](docs/book/10-html-render.md)
+  - Determinism policy: [`docs/book/12-determinism.md`](docs/book/12-determinism.md)
+  - Compatibility/versioning rules: [`docs/book/14-compatibility-and-versioning.md`](docs/book/14-compatibility-and-versioning.md)
+- **Deep dives:**
+  - Architecture narrative: [`docs/architecture.md`](docs/architecture.md)
+  - CFG semantics: [`docs/cfg.md`](docs/cfg.md)
+
+## Links
+
+- **Issues:** <https://github.com/orenlab/codeclone/issues>
+- **PyPI:** <https://pypi.org/project/codeclone/>
diff --git a/SECURITY.md b/SECURITY.md
index 080e1ef..de26567 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -3,13 +3,14 @@
 ## Supported Versions
 
 CodeClone is a static analysis tool and does not execute analyzed code at runtime.
-Nevertheless, security and robustness are treated as first‑class concerns.
+Nevertheless, security and robustness are treated as first-class concerns.
The following versions currently receive security updates: | Version | Supported | |---------|-----------| -| 1.3.x | Yes | +| 1.4.x | Yes | +| 1.3.x | No | | 1.2.x | No | | 1.1.x | No | | 1.0.x | No | @@ -38,15 +39,22 @@ Additional safeguards: - HTML report content is escaped in both text and attribute contexts to prevent script injection. - Reports are static and do not execute analyzed code. +- Report explainability fields are generated in Python core; UI is rendering-only and does not infer semantics. - Scanner traversal is root-confined and prevents symlink-based path escape. - Baseline files are schema/type validated with size limits and tamper-evident integrity fields - (`generator`, `payload_sha256` for v1.3+). + (`meta.generator` as trust gate, `meta.payload_sha256` as integrity hash in baseline v1). - Baseline integrity is tamper-evident (audit signal), not tamper-proof cryptographic signing. An actor who can rewrite baseline content and recompute `payload_sha256` can still alter it. -- In `--fail-on-new` / `--ci`, untrusted baseline states fail fast; otherwise baseline is ignored +- Baseline hash covers canonical payload only (`clones.functions`, `clones.blocks`, + `meta.fingerprint_version`, `meta.python_tag`). +- Baseline hash excludes non-semantic metadata (`created_at`, `meta.generator.version`). +- `meta.schema_version` and `meta.generator.name` are validated as compatibility/trust gates and are + intentionally excluded from `payload_sha256`. +- In `--ci` (or explicit `--fail-on-new`), untrusted baseline states fail fast; otherwise baseline is ignored with explicit warning and comparison proceeds against an empty baseline. -- Cache files are HMAC-signed (constant-time comparison), size-limited, and ignored on mismatch. -- Cache secrets are stored next to the cache (`.cache_secret`) and must not be committed. +- Cache files are integrity-signed with canonical payload hashing (constant-time comparison), + size-limited, and ignored on mismatch. +- Legacy cache secret files (`.cache/codeclone/.cache_secret`) are obsolete and should be removed. 
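A minimal sketch of the cache-integrity pattern described above (illustrative only; the real signing scheme, key handling, and schema validation live in the cache module):

```python
import hashlib
import hmac
import json


def cache_signature(payload: dict[str, object]) -> str:
    # Deterministic signature over a canonical JSON encoding of the payload.
    encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()


def cache_is_trusted(payload: dict[str, object], stored_signature: str) -> bool:
    # Constant-time comparison; on any mismatch the cache is ignored and rebuilt.
    return hmac.compare_digest(cache_signature(payload), stored_signature)
```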
--- diff --git a/codeclone.baseline.json b/codeclone.baseline.json index 7dafea0..a50c904 100644 --- a/codeclone.baseline.json +++ b/codeclone.baseline.json @@ -1,10 +1,27 @@ { - "functions": [], - "blocks": [], - "python_version": "3.13", - "baseline_version": "1.3.0", - "schema_version": 1, - "generator": "codeclone", - "payload_sha256": "92e80b05c857b796bb452de9e62985a1568874da468bc671998133975c94397a", - "created_at": "2026-02-08T09:54:31+00:00" -} \ No newline at end of file + "meta": { + "generator": { + "name": "codeclone", + "version": "1.4.0" + }, + "schema_version": "1.0", + "fingerprint_version": "1", + "python_tag": "cp313", + "created_at": "2026-02-12T15:31:42Z", + "payload_sha256": "691c6cedd10e2a51d6038780f3ae9dffe763356dd2aba742b3980f131b79f217" + }, + "clones": { + "functions": [ + "efc8465229b381a3a50502d59d9539c0be3efe86|20-49" + ], + "blocks": [ + "3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5", + "3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|cb4fcbc1b2a65ec1346898fc0d660335e25d7cbc", + "8579659a9e8c9755a6d2f0b1d82dda8866fd243b|1912d2ee3c541cbf9e51f485348586afe1a00755|ee69aff0b7ea38927e5082ceef14115c805f6734|ee69aff0b7ea38927e5082ceef14115c805f6734", + "b4b5893be87edf98955f047cbf25ca755dc753b4|8579659a9e8c9755a6d2f0b1d82dda8866fd243b|1912d2ee3c541cbf9e51f485348586afe1a00755|ee69aff0b7ea38927e5082ceef14115c805f6734", + "b6ee70d0bd6ff4b593f127a137aed9ab41179145|cacc33d58f323481f65fed57873d1c840531859e|d60c0005a4c850c140378d1c82b81dde93a7ccab|d60c0005a4c850c140378d1c82b81dde93a7ccab", + "cacc33d58f323481f65fed57873d1c840531859e|d60c0005a4c850c140378d1c82b81dde93a7ccab|d60c0005a4c850c140378d1c82b81dde93a7ccab|b4b5893be87edf98955f047cbf25ca755dc753b4", + "ee69aff0b7ea38927e5082ceef14115c805f6734|fcd36b4275c94f1955fb55e1c1ca3c04c7c0bb26|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5" + ] + } +} diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py index 49c0ad5..15cbdc5 100644 --- a/codeclone/_cli_args.py +++ b/codeclone/_cli_args.py @@ -9,12 +9,25 @@ from __future__ import annotations import argparse -from typing import cast +import sys +from typing import NoReturn, cast from . 
import ui_messages as ui +from .contracts import ExitCode, cli_help_epilog -class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter): +class _ArgumentParser(argparse.ArgumentParser): + def error(self, message: str) -> NoReturn: + self.print_usage(sys.stderr) + self.exit( + int(ExitCode.CONTRACT_ERROR), + f"CONTRACT ERROR: {message}\n", + ) + + +class _HelpFormatter( + argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter +): def _get_help_string(self, action: argparse.Action) -> str: if action.dest == "cache_path": return action.help or "" @@ -22,10 +35,11 @@ def _get_help_string(self, action: argparse.Action) -> str: def build_parser(version: str) -> argparse.ArgumentParser: - ap = argparse.ArgumentParser( + ap = _ArgumentParser( prog="codeclone", description="AST and CFG-based code clone detector for Python.", formatter_class=_HelpFormatter, + epilog=cli_help_epilog(), ) ap.add_argument( "--version", @@ -158,4 +172,9 @@ def build_parser(version: str) -> argparse.ArgumentParser: action="store_true", help=ui.HELP_VERBOSE, ) + out_group.add_argument( + "--debug", + action="store_true", + help=ui.HELP_DEBUG, + ) return ap diff --git a/codeclone/_cli_meta.py b/codeclone/_cli_meta.py index fe6a04e..11fcca7 100644 --- a/codeclone/_cli_meta.py +++ b/codeclone/_cli_meta.py @@ -10,15 +10,49 @@ import sys from pathlib import Path -from typing import Any +from typing import TypedDict -from .baseline import Baseline +from .baseline import Baseline, current_python_tag +from .contracts import REPORT_SCHEMA_VERSION def _current_python_version() -> str: return f"{sys.version_info.major}.{sys.version_info.minor}" +class ReportMeta(TypedDict): + """ + Canonical report metadata contract shared by HTML, JSON, and TXT reports. + + Key semantics: + - python_version: runtime major.minor string for human readability (e.g. "3.13") + - python_tag: runtime compatibility tag used by baseline/cache contracts + (e.g. 
"cp313") + - baseline_*: values loaded from baseline metadata for audit/provenance + - cache_*: cache status/provenance for run transparency + """ + + report_schema_version: str + codeclone_version: str + python_version: str + python_tag: str + baseline_path: str + baseline_fingerprint_version: str | None + baseline_schema_version: str | None + baseline_python_tag: str | None + baseline_generator_name: str | None + baseline_generator_version: str | None + baseline_payload_sha256: str | None + baseline_payload_sha256_verified: bool + baseline_loaded: bool + baseline_status: str + cache_path: str + cache_used: bool + cache_status: str + cache_schema_version: str | None + files_skipped_source_io: int + + def _build_report_meta( *, codeclone_version: str, @@ -28,16 +62,32 @@ def _build_report_meta( baseline_status: str, cache_path: Path, cache_used: bool, -) -> dict[str, Any]: + cache_status: str, + cache_schema_version: str | None, + files_skipped_source_io: int, +) -> ReportMeta: return { + "report_schema_version": REPORT_SCHEMA_VERSION, "codeclone_version": codeclone_version, "python_version": _current_python_version(), + "python_tag": current_python_tag(), "baseline_path": str(baseline_path), - "baseline_version": baseline.baseline_version, + "baseline_fingerprint_version": baseline.fingerprint_version, "baseline_schema_version": baseline.schema_version, - "baseline_python_version": baseline.python_version, + "baseline_python_tag": baseline.python_tag, + "baseline_generator_name": baseline.generator, + "baseline_generator_version": baseline.generator_version, + "baseline_payload_sha256": baseline.payload_sha256, + "baseline_payload_sha256_verified": ( + baseline_loaded + and baseline_status == "ok" + and isinstance(baseline.payload_sha256, str) + ), "baseline_loaded": baseline_loaded, "baseline_status": baseline_status, "cache_path": str(cache_path), "cache_used": cache_used, + "cache_status": cache_status, + "cache_schema_version": cache_schema_version, + "files_skipped_source_io": files_skipped_source_io, } diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py index 4dcd72f..3f76906 100644 --- a/codeclone/_cli_paths.py +++ b/codeclone/_cli_paths.py @@ -14,6 +14,9 @@ from rich.console import Console +from .contracts import ExitCode +from .ui_messages import fmt_contract_error + def expand_path(p: str) -> Path: return Path(p).expanduser().resolve() @@ -26,11 +29,20 @@ def _validate_output_path( label: str, console: Console, invalid_message: Callable[..., str], + invalid_path_message: Callable[..., str], ) -> Path: out = Path(path).expanduser() if out.suffix.lower() != expected_suffix: console.print( - invalid_message(label=label, path=out, expected_suffix=expected_suffix) + fmt_contract_error( + invalid_message(label=label, path=out, expected_suffix=expected_suffix) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + try: + return out.resolve() + except OSError as e: + console.print( + fmt_contract_error(invalid_path_message(label=label, path=out, error=e)) ) - sys.exit(2) - return out.resolve() + sys.exit(ExitCode.CONTRACT_ERROR) diff --git a/codeclone/_cli_summary.py b/codeclone/_cli_summary.py index 40df388..c320f39 100644 --- a/codeclone/_cli_summary.py +++ b/codeclone/_cli_summary.py @@ -51,7 +51,11 @@ def _build_summary_rows( def _build_summary_table(rows: list[tuple[str, int]]) -> Table: - summary_table = Table(title=ui.SUMMARY_TITLE, show_header=True) + summary_table = Table( + title=ui.SUMMARY_TITLE, + show_header=True, + width=ui.CLI_LAYOUT_WIDTH, + ) 
 summary_table.add_column("Metric")
 summary_table.add_column("Value", justify="right")
 for label, value in rows:
diff --git a/codeclone/_html_escape.py b/codeclone/_html_escape.py
index 16f7ae4..025ec44 100644
--- a/codeclone/_html_escape.py
+++ b/codeclone/_html_escape.py
@@ -9,24 +9,23 @@
 from __future__ import annotations
 
 import html
-from typing import Any
 
 
-def _escape_html(v: Any) -> str:
+def _escape_html(v: object) -> str:
     text = html.escape("" if v is None else str(v), quote=True)
     text = text.replace("`", "&#96;")
     text = text.replace("\u2028", "&#x2028;").replace("\u2029", "&#x2029;")
     return text
 
 
-def _escape_attr(v: Any) -> str:
+def _escape_attr(v: object) -> str:
     text = html.escape("" if v is None else str(v), quote=True)
     text = text.replace("`", "&#96;")
     text = text.replace("\u2028", "&#x2028;").replace("\u2029", "&#x2029;")
     return text
 
 
-def _meta_display(v: Any) -> str:
+def _meta_display(v: object) -> str:
     if isinstance(v, bool):
         return "true" if v else "false"
     if v is None:
diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py
index 915cb1d..a21467f 100644
--- a/codeclone/_html_snippets.py
+++ b/codeclone/_html_snippets.py
@@ -14,12 +14,12 @@
 from collections.abc import Iterable
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, NamedTuple, cast
+from typing import NamedTuple, cast
 
 from .errors import FileProcessingError
 
 
-def pairwise(iterable: Iterable[Any]) -> Iterable[tuple[Any, Any]]:
+def pairwise(iterable: Iterable[object]) -> Iterable[tuple[object, object]]:
     a, b = itertools.tee(iterable)
     next(b, None)
     return zip(a, b, strict=False)
diff --git a/codeclone/_report_blocks.py b/codeclone/_report_blocks.py
new file mode 100644
index 0000000..a6369d6
--- /dev/null
+++ b/codeclone/_report_blocks.py
@@ -0,0 +1,94 @@
+"""
+CodeClone — AST and CFG-based code clone detector for Python
+focused on architectural duplication.
+
+Copyright (c) 2026 Den Rozhnovskiy
+Licensed under the MIT License.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ._report_types import GroupItem, GroupMap
+
+
+# Any: values come from report item dictionaries populated from JSON-like data.
+def _coerce_positive_int(value: Any) -> int | None:
+    try:
+        integer = int(value)
+    except (TypeError, ValueError):
+        return None
+    return integer if integer > 0 else None
+
+
+def _block_item_sort_key(item: GroupItem) -> tuple[str, str, int, int]:
+    start_line = _coerce_positive_int(item.get("start_line")) or 0
+    end_line = _coerce_positive_int(item.get("end_line")) or 0
+    return (
+        str(item.get("filepath", "")),
+        str(item.get("qualname", "")),
+        start_line,
+        end_line,
+    )
+
+
+def _merge_block_items(items: list[GroupItem]) -> list[GroupItem]:
+    """
+    Merge overlapping/adjacent block windows into maximal ranges per function.
+    """
+    if not items:
+        return []
+
+    sorted_items = sorted(items, key=_block_item_sort_key)
+    merged: list[GroupItem] = []
+    current: GroupItem | None = None
+
+    for item in sorted_items:
+        start_line = _coerce_positive_int(item.get("start_line"))
+        end_line = _coerce_positive_int(item.get("end_line"))
+        if start_line is None or end_line is None or end_line < start_line:
+            continue
+
+        if current is None:
+            current = dict(item)
+            current["start_line"] = start_line
+            current["end_line"] = end_line
+            current["size"] = max(1, end_line - start_line + 1)
+            continue
+
+        same_owner = str(current.get("filepath", "")) == str(
+            item.get("filepath", "")
+        ) and str(current.get("qualname", "")) == str(item.get("qualname", ""))
+        if same_owner and start_line <= int(current["end_line"]) + 1:
+            current["end_line"] = max(int(current["end_line"]), end_line)
+            current["size"] = max(
+                1, int(current["end_line"]) - int(current["start_line"]) + 1
+            )
+            continue
+
+        merged.append(current)
+        current = dict(item)
+        current["start_line"] = start_line
+        current["end_line"] = end_line
+        current["size"] = max(1, end_line - start_line + 1)
+
+    if current is not None:
+        merged.append(current)
+
+    return merged
+
+
+def prepare_block_report_groups(block_groups: GroupMap) -> GroupMap:
+    """
+    Convert sliding block windows into maximal merged regions for reporting.
+    Block hash keys remain unchanged. 
+ """ + prepared: GroupMap = {} + for key, items in block_groups.items(): + merged = _merge_block_items(items) + if merged: + prepared[key] = merged + else: + prepared[key] = sorted(items, key=_block_item_sort_key) + return prepared diff --git a/codeclone/_report_explain.py b/codeclone/_report_explain.py new file mode 100644 index 0000000..ad26cc0 --- /dev/null +++ b/codeclone/_report_explain.py @@ -0,0 +1,251 @@ +""" +CodeClone — AST and CFG-based code clone detector for Python +focused on architectural duplication. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. +""" + +from __future__ import annotations + +import ast +from pathlib import Path + +from ._report_explain_contract import ( + BLOCK_HINT_ASSERT_ONLY, + BLOCK_HINT_ASSERT_ONLY_LABEL, + BLOCK_HINT_ASSERT_ONLY_NOTE, + BLOCK_HINT_CONFIDENCE_DETERMINISTIC, + BLOCK_PATTERN_REPEATED_STMT_HASH, + resolve_group_compare_note, + resolve_group_display_name, +) +from ._report_types import GroupItem, GroupMap + + +def _signature_parts(group_key: str) -> list[str]: + return [part for part in group_key.split("|") if part] + + +def _parsed_file_tree( + filepath: str, *, ast_cache: dict[str, ast.AST | None] +) -> ast.AST | None: + if filepath in ast_cache: + return ast_cache[filepath] + + try: + source = Path(filepath).read_text("utf-8") + tree = ast.parse(source, filename=filepath) + except (OSError, SyntaxError): + tree = None + ast_cache[filepath] = tree + return tree + + +def _is_assert_like_stmt(stmt: ast.stmt) -> bool: + if isinstance(stmt, ast.Assert): + return True + if isinstance(stmt, ast.Expr): + value = stmt.value + if isinstance(value, ast.Constant) and isinstance(value.value, str): + return True + if isinstance(value, ast.Call): + func = value.func + if isinstance(func, ast.Name): + return func.id.lower().startswith("assert") + if isinstance(func, ast.Attribute): + return func.attr.lower().startswith("assert") + return False + + +def _assert_range_stats( + *, + filepath: str, + start_line: int, + end_line: int, + ast_cache: dict[str, ast.AST | None], + range_cache: dict[tuple[str, int, int], tuple[int, int, int]], +) -> tuple[int, int, int]: + cache_key = (filepath, start_line, end_line) + if cache_key in range_cache: + return range_cache[cache_key] + + tree = _parsed_file_tree(filepath, ast_cache=ast_cache) + if tree is None: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 + + stmts = [ + node + for node in ast.walk(tree) + if isinstance(node, ast.stmt) + and int(getattr(node, "lineno", 0)) >= start_line + and int(getattr(node, "end_lineno", 0)) <= end_line + ] + if not stmts: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 + + ordered_stmts = sorted( + stmts, + key=lambda stmt: ( + int(getattr(stmt, "lineno", 0)), + int(getattr(stmt, "end_lineno", 0)), + int(getattr(stmt, "col_offset", 0)), + int(getattr(stmt, "end_col_offset", 0)), + type(stmt).__name__, + ), + ) + + total = len(ordered_stmts) + assert_like = 0 + max_consecutive = 0 + current_consecutive = 0 + for stmt in ordered_stmts: + if _is_assert_like_stmt(stmt): + assert_like += 1 + current_consecutive += 1 + if current_consecutive > max_consecutive: + max_consecutive = current_consecutive + else: + current_consecutive = 0 + + stats = (total, assert_like, max_consecutive) + range_cache[cache_key] = stats + return stats + + +def _is_assert_only_range( + *, + filepath: str, + start_line: int, + end_line: int, + ast_cache: dict[str, ast.AST | None], + range_cache: dict[tuple[str, int, int], tuple[int, int, int]], +) -> bool: + total, 
assert_like, _ = _assert_range_stats( + filepath=filepath, + start_line=start_line, + end_line=end_line, + ast_cache=ast_cache, + range_cache=range_cache, + ) + return total > 0 and total == assert_like + + +def _base_block_facts(group_key: str) -> dict[str, str]: + signature_parts = _signature_parts(group_key) + window_size = max(1, len(signature_parts)) + repeated_signature = len(signature_parts) > 1 and all( + part == signature_parts[0] for part in signature_parts + ) + facts: dict[str, str] = { + "match_rule": "normalized_sliding_window", + "block_size": str(window_size), + "signature_kind": "stmt_hash_sequence", + "merged_regions": "true", + } + if repeated_signature: + facts["pattern"] = BLOCK_PATTERN_REPEATED_STMT_HASH + facts["pattern_label"] = BLOCK_PATTERN_REPEATED_STMT_HASH + facts["pattern_display"] = f"{signature_parts[0][:12]} x{window_size}" + return facts + + +def _enrich_with_assert_facts( + *, + facts: dict[str, str], + items: list[GroupItem], + ast_cache: dict[str, ast.AST | None], + range_cache: dict[tuple[str, int, int], tuple[int, int, int]], +) -> None: + assert_only = True + total_statements = 0 + assert_statements = 0 + max_consecutive_asserts = 0 + + if not items: + assert_only = False + + for item in items: + filepath = str(item.get("filepath", "")) + start_line = int(item.get("start_line", 0)) + end_line = int(item.get("end_line", 0)) + + range_total = 0 + range_assert = 0 + range_max_consecutive = 0 + if filepath and start_line > 0 and end_line > 0: + range_total, range_assert, range_max_consecutive = _assert_range_stats( + filepath=filepath, + start_line=start_line, + end_line=end_line, + ast_cache=ast_cache, + range_cache=range_cache, + ) + total_statements += range_total + assert_statements += range_assert + max_consecutive_asserts = max( + max_consecutive_asserts, range_max_consecutive + ) + + if ( + not filepath + or start_line <= 0 + or end_line <= 0 + or not _is_assert_only_range( + filepath=filepath, + start_line=start_line, + end_line=end_line, + ast_cache=ast_cache, + range_cache=range_cache, + ) + ): + assert_only = False + + if total_statements > 0: + ratio = round((assert_statements / total_statements) * 100) + facts["assert_ratio"] = f"{ratio}%" + facts["consecutive_asserts"] = str(max_consecutive_asserts) + + if assert_only: + facts["hint"] = BLOCK_HINT_ASSERT_ONLY + facts["hint_label"] = BLOCK_HINT_ASSERT_ONLY_LABEL + facts["hint_confidence"] = BLOCK_HINT_CONFIDENCE_DETERMINISTIC + facts["hint_note"] = BLOCK_HINT_ASSERT_ONLY_NOTE + + +def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]: + """ + Build deterministic explainability facts for block clone groups. + + This is the source of truth for report-level block explanations. + Renderers (HTML/TXT/JSON) should only display these facts. 
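+
+    Shape sketch of one returned entry (the group key and values here are
+    illustrative assumptions; every fact value is serialized as a string):
+
+        "stmt-hash-1|stmt-hash-2|stmt-hash-3": {
+            "match_rule": "normalized_sliding_window",
+            "signature_kind": "stmt_hash_sequence",
+            "block_size": "3",
+            "merged_regions": "true",
+            "group_arity": "2",
+            "instance_peer_count": "1",
+        }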
+ """ + ast_cache: dict[str, ast.AST | None] = {} + range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {} + facts_by_group: dict[str, dict[str, str]] = {} + + for group_key, items in block_groups.items(): + facts = _base_block_facts(group_key) + _enrich_with_assert_facts( + facts=facts, + items=items, + ast_cache=ast_cache, + range_cache=range_cache, + ) + group_arity = len(items) + peer_count = max(0, group_arity - 1) + facts["group_arity"] = str(group_arity) + facts["instance_peer_count"] = str(peer_count) + compare_note = resolve_group_compare_note( + group_arity=group_arity, peer_count=peer_count + ) + if compare_note is not None: + facts["group_compare_note"] = compare_note + group_display_name = resolve_group_display_name(hint_id=facts.get("hint")) + if group_display_name is not None: + facts["group_display_name"] = group_display_name + facts_by_group[group_key] = facts + + return facts_by_group diff --git a/codeclone/_report_explain_contract.py b/codeclone/_report_explain_contract.py new file mode 100644 index 0000000..543ad02 --- /dev/null +++ b/codeclone/_report_explain_contract.py @@ -0,0 +1,48 @@ +""" +CodeClone — AST and CFG-based code clone detector for Python +focused on architectural duplication. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. +""" + +from __future__ import annotations + +from typing import Final + +from .ui_messages import ( + REPORT_BLOCK_GROUP_DISPLAY_NAME_ASSERT_PATTERN, + fmt_report_block_group_compare_note_n_way, +) + +BLOCK_PATTERN_REPEATED_STMT_HASH: Final = "repeated_stmt_hash" + +BLOCK_HINT_ASSERT_ONLY: Final = "assert_only" +BLOCK_HINT_ASSERT_ONLY_LABEL: Final = "Assert-only block" +BLOCK_HINT_CONFIDENCE_DETERMINISTIC: Final = "deterministic" +BLOCK_HINT_ASSERT_ONLY_NOTE: Final = ( + "This block clone consists entirely of assert-only statements. " + "This often occurs in test suites." 
+) + + +def format_n_way_group_compare_note(*, peer_count: int) -> str: + return fmt_report_block_group_compare_note_n_way(peer_count=peer_count) + + +def resolve_group_compare_note(*, group_arity: int, peer_count: int) -> str | None: + if group_arity > 2: + return format_n_way_group_compare_note(peer_count=peer_count) + return None + + +def resolve_group_display_name(*, hint_id: str | None) -> str | None: + if hint_id == BLOCK_HINT_ASSERT_ONLY: + return REPORT_BLOCK_GROUP_DISPLAY_NAME_ASSERT_PATTERN + return None + + +def format_group_instance_compare_meta( + *, instance_index: int, group_arity: int, peer_count: int +) -> str: + return f"instance {instance_index}/{group_arity} • matches {peer_count} peers" diff --git a/codeclone/_report_serialize.py b/codeclone/_report_serialize.py index 54dcef5..9c1a576 100644 --- a/codeclone/_report_serialize.py +++ b/codeclone/_report_serialize.py @@ -9,23 +9,129 @@ from __future__ import annotations import json -from collections.abc import Mapping -from typing import Any +from collections.abc import Collection, Mapping from ._report_types import GroupItem, GroupMap +from .contracts import REPORT_SCHEMA_VERSION + +FunctionRecord = tuple[int, str, int, int, int, int, str, str] +BlockRecord = tuple[int, str, int, int, int] +SegmentRecord = tuple[int, str, int, int, int, str, str] +SplitLists = dict[str, list[str]] +GroupsSplit = dict[str, SplitLists] + +GROUP_ITEM_LAYOUT: dict[str, list[str]] = { + "functions": [ + "file_i", + "qualname", + "start", + "end", + "loc", + "stmt_count", + "fingerprint", + "loc_bucket", + ], + "blocks": ["file_i", "qualname", "start", "end", "size"], + "segments": [ + "file_i", + "qualname", + "start", + "end", + "size", + "segment_hash", + "segment_sig", + ], +} + + +def _item_sort_key(item: GroupItem) -> tuple[str, int, int, str]: + return ( + str(item.get("filepath", "")), + int(item.get("start_line", 0)), + int(item.get("end_line", 0)), + str(item.get("qualname", "")), + ) + + +def _collect_files( + *, + func_groups: GroupMap, + block_groups: GroupMap, + segment_groups: GroupMap, +) -> list[str]: + files: set[str] = set() + for groups in (func_groups, block_groups, segment_groups): + for items in groups.values(): + for item in items: + files.add(str(item.get("filepath", ""))) + return sorted(files) + + +def _encode_function_item(item: GroupItem, file_id: int) -> FunctionRecord: + return ( + file_id, + str(item.get("qualname", "")), + int(item.get("start_line", 0)), + int(item.get("end_line", 0)), + int(item.get("loc", 0)), + int(item.get("stmt_count", 0)), + str(item.get("fingerprint", "")), + str(item.get("loc_bucket", "")), + ) + + +def _encode_block_item(item: GroupItem, file_id: int) -> BlockRecord: + return ( + file_id, + str(item.get("qualname", "")), + int(item.get("start_line", 0)), + int(item.get("end_line", 0)), + int(item.get("size", 0)), + ) + + +def _encode_segment_item(item: GroupItem, file_id: int) -> SegmentRecord: + return ( + file_id, + str(item.get("qualname", "")), + int(item.get("start_line", 0)), + int(item.get("end_line", 0)), + int(item.get("size", 0)), + str(item.get("segment_hash", "")), + str(item.get("segment_sig", "")), + ) + + +def _function_record_sort_key(record: FunctionRecord) -> tuple[int, str, int, int]: + return record[0], record[1], record[2], record[3] + + +def _block_record_sort_key(record: BlockRecord) -> tuple[int, str, int, int]: + return record[0], record[1], record[2], record[3] + + +def _segment_record_sort_key(record: SegmentRecord) -> tuple[int, str, int, int]: + return 
record[0], record[1], record[2], record[3] + + +def _resolve_metric_value(item: GroupItem, metric_name: str) -> int: + raw_value = item.get(metric_name) + if raw_value is None: + fallback_metric = "size" if metric_name == "loc" else "loc" + raw_value = item.get(fallback_metric, 0) + return int(raw_value) + + +def _baseline_is_trusted(meta: Mapping[str, object]) -> bool: + return ( + meta.get("baseline_loaded") is True + and str(meta.get("baseline_status", "")).strip().lower() == "ok" + ) def to_json(groups: GroupMap) -> str: def _sorted_items(items: list[GroupItem]) -> list[GroupItem]: - return sorted( - items, - key=lambda item: ( - str(item.get("filepath", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), - str(item.get("qualname", "")), - ), - ) + return sorted(items, key=_item_sort_key) return json.dumps( { @@ -47,46 +153,129 @@ def to_json_report( func_groups: GroupMap, block_groups: GroupMap, segment_groups: GroupMap, - meta: Mapping[str, Any] | None = None, + meta: Mapping[str, object] | None = None, + block_facts: Mapping[str, Mapping[str, str]] | None = None, + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + new_segment_group_keys: Collection[str] | None = None, ) -> str: - def _sorted_items(items: list[GroupItem]) -> list[GroupItem]: - return sorted( - items, - key=lambda item: ( - str(item.get("filepath", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), - str(item.get("qualname", "")), - ), + """ + Serialize report JSON schema v1.1. + + NEW/KNOWN split contract: + - if baseline is not trusted, all groups are NEW and KNOWN is empty + - if baseline is trusted, callers must pass `new_*_group_keys` computed by + the core baseline diff pipeline; keys absent from `new_*` are treated as KNOWN + """ + meta_payload = dict(meta or {}) + meta_payload["report_schema_version"] = REPORT_SCHEMA_VERSION + + files = _collect_files( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + ) + file_ids = {filepath: idx for idx, filepath in enumerate(files)} + + function_groups: dict[str, list[FunctionRecord]] = {} + for group_key in sorted(func_groups): + function_records = [ + _encode_function_item(item, file_ids[str(item.get("filepath", ""))]) + for item in func_groups[group_key] + ] + function_groups[group_key] = sorted( + function_records, key=_function_record_sort_key + ) + + block_groups_out: dict[str, list[BlockRecord]] = {} + for group_key in sorted(block_groups): + block_records = [ + _encode_block_item(item, file_ids[str(item.get("filepath", ""))]) + for item in block_groups[group_key] + ] + block_groups_out[group_key] = sorted(block_records, key=_block_record_sort_key) + + segment_groups_out: dict[str, list[SegmentRecord]] = {} + for group_key in sorted(segment_groups): + segment_records = [ + _encode_segment_item(item, file_ids[str(item.get("filepath", ""))]) + for item in segment_groups[group_key] + ] + segment_groups_out[group_key] = sorted( + segment_records, key=_segment_record_sort_key ) - def _sorted_group_map(groups: GroupMap) -> GroupMap: - return { - k: _sorted_items(v) - for k, v in sorted(groups.items(), key=lambda kv: (-len(kv[1]), kv[0])) + baseline_trusted = _baseline_is_trusted(meta_payload) + + def _split_for( + *, + keys: Collection[str], + new_keys: Collection[str] | None, + ) -> SplitLists: + sorted_keys = sorted(keys) + if not baseline_trusted: + return {"new": sorted_keys, "known": []} + if new_keys is None: + return 
{"new": sorted_keys, "known": []} + new_key_set = set(new_keys) + new_list = [group_key for group_key in sorted_keys if group_key in new_key_set] + known_list = [ + group_key for group_key in sorted_keys if group_key not in new_key_set + ] + return {"new": new_list, "known": known_list} + + groups_split: GroupsSplit = { + "functions": _split_for( + keys=function_groups.keys(), + new_keys=new_function_group_keys, + ), + "blocks": _split_for( + keys=block_groups_out.keys(), + new_keys=new_block_group_keys, + ), + "segments": _split_for( + keys=segment_groups_out.keys(), + new_keys=new_segment_group_keys, + ), + } + meta_payload["groups_counts"] = { + section_name: { + "total": len(section_split["new"]) + len(section_split["known"]), + "new": len(section_split["new"]), + "known": len(section_split["known"]), } + for section_name, section_split in groups_split.items() + } - meta_payload = dict(meta or {}) - func_sorted = _sorted_group_map(func_groups) - block_sorted = _sorted_group_map(block_groups) - segment_sorted = _sorted_group_map(segment_groups) - return json.dumps( - { - "meta": meta_payload, - "function_clones": func_sorted, - "block_clones": block_sorted, - "segment_clones": segment_sorted, - # Backward-compatible keys. - "functions": func_sorted, - "blocks": block_sorted, - "segments": segment_sorted, + payload: dict[str, object] = { + "meta": meta_payload, + "files": files, + "groups": { + "functions": function_groups, + "blocks": block_groups_out, + "segments": segment_groups_out, }, + "groups_split": groups_split, + "group_item_layout": GROUP_ITEM_LAYOUT, + } + + if block_facts: + sorted_block_facts: dict[str, dict[str, str]] = {} + for group_key in sorted(block_facts): + sorted_block_facts[group_key] = { + fact_key: str(block_facts[group_key][fact_key]) + for fact_key in sorted(block_facts[group_key]) + } + payload["facts"] = {"blocks": sorted_block_facts} + + return json.dumps( + payload, ensure_ascii=False, indent=2, ) -def to_text(groups: GroupMap) -> str: +def to_text(groups: GroupMap, *, metric_name: str = "loc") -> str: lines: list[str] = [] for i, (_, v) in enumerate( sorted(groups.items(), key=lambda kv: (-len(kv[1]), kv[0])) @@ -105,56 +294,125 @@ def to_text(groups: GroupMap) -> str: [ f"- {item['qualname']} " f"{item['filepath']}:{item['start_line']}-{item['end_line']} " - f"loc={item.get('loc', item.get('size'))}" + f"{metric_name}={_resolve_metric_value(item, metric_name)}" for item in items ] ) return "\n".join(lines).strip() + "\n" -def _format_meta_text_value(value: Any) -> str: +def _format_meta_text_value(value: object) -> str: if isinstance(value, bool): return "true" if value else "false" if value is None: - return "n/a" + return "(none)" text = str(value).strip() - return text if text else "n/a" + return text if text else "(none)" def to_text_report( *, - meta: Mapping[str, Any], + meta: Mapping[str, object], func_groups: GroupMap, block_groups: GroupMap, segment_groups: GroupMap, + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + new_segment_group_keys: Collection[str] | None = None, ) -> str: + """ + Serialize deterministic TXT report. + + NEW/KNOWN split follows the same contract as JSON v1.1. 
+ """ + + baseline_trusted = _baseline_is_trusted(meta) + + def _split_for( + *, + groups: GroupMap, + new_keys: Collection[str] | None, + ) -> SplitLists: + sorted_keys = sorted(groups.keys()) + if not baseline_trusted: + return {"new": sorted_keys, "known": []} + if new_keys is None: + return {"new": sorted_keys, "known": []} + new_key_set = set(new_keys) + new_list = [group_key for group_key in sorted_keys if group_key in new_key_set] + known_list = [ + group_key for group_key in sorted_keys if group_key not in new_key_set + ] + return {"new": new_list, "known": known_list} + + groups_split: GroupsSplit = { + "functions": _split_for(groups=func_groups, new_keys=new_function_group_keys), + "blocks": _split_for(groups=block_groups, new_keys=new_block_group_keys), + "segments": _split_for(groups=segment_groups, new_keys=new_segment_group_keys), + } + lines = [ "REPORT METADATA", + "Report schema version: " + f"{_format_meta_text_value(meta.get('report_schema_version'))}", f"CodeClone version: {_format_meta_text_value(meta.get('codeclone_version'))}", f"Python version: {_format_meta_text_value(meta.get('python_version'))}", + f"Python tag: {_format_meta_text_value(meta.get('python_tag'))}", f"Baseline path: {_format_meta_text_value(meta.get('baseline_path'))}", - f"Baseline version: {_format_meta_text_value(meta.get('baseline_version'))}", + "Baseline fingerprint version: " + f"{_format_meta_text_value(meta.get('baseline_fingerprint_version'))}", "Baseline schema version: " f"{_format_meta_text_value(meta.get('baseline_schema_version'))}", - "Baseline Python version: " - f"{_format_meta_text_value(meta.get('baseline_python_version'))}", + "Baseline Python tag: " + f"{_format_meta_text_value(meta.get('baseline_python_tag'))}", + "Baseline generator name: " + f"{_format_meta_text_value(meta.get('baseline_generator_name'))}", + "Baseline generator version: " + f"{_format_meta_text_value(meta.get('baseline_generator_version'))}", + "Baseline payload sha256: " + f"{_format_meta_text_value(meta.get('baseline_payload_sha256'))}", + "Baseline payload verified: " + f"{_format_meta_text_value(meta.get('baseline_payload_sha256_verified'))}", f"Baseline loaded: {_format_meta_text_value(meta.get('baseline_loaded'))}", f"Baseline status: {_format_meta_text_value(meta.get('baseline_status'))}", + f"Cache path: {_format_meta_text_value(meta.get('cache_path'))}", + "Cache schema version: " + f"{_format_meta_text_value(meta.get('cache_schema_version'))}", + f"Cache status: {_format_meta_text_value(meta.get('cache_status'))}", + f"Cache used: {_format_meta_text_value(meta.get('cache_used'))}", + "Source IO skipped: " + f"{_format_meta_text_value(meta.get('files_skipped_source_io'))}", ] - if "cache_path" in meta: - lines.append(f"Cache path: {_format_meta_text_value(meta.get('cache_path'))}") - if "cache_used" in meta: - lines.append(f"Cache used: {_format_meta_text_value(meta.get('cache_used'))}") - - sections = [ - ("FUNCTION CLONES", func_groups), - ("BLOCK CLONES", block_groups), - ("SEGMENT CLONES", segment_groups), - ] - for title, groups in sections: + + if not baseline_trusted: + lines.append("Note: baseline is untrusted; all groups are treated as NEW.") + + sections = ( + ("FUNCTION CLONES", "functions", func_groups, "loc"), + ("BLOCK CLONES", "blocks", block_groups, "size"), + ("SEGMENT CLONES", "segments", segment_groups, "size"), + ) + for title, section_key, groups, metric_name in sections: + split = groups_split[section_key] + new_groups: GroupMap = { + group_key: groups[group_key] + for 
group_key in split["new"] + if group_key in groups + } + known_groups: GroupMap = { + group_key: groups[group_key] + for group_key in split["known"] + if group_key in groups + } + + lines.append("") + lines.append(f"{title} (NEW) (groups={len(split['new'])})") + new_block = to_text(new_groups, metric_name=metric_name).rstrip() + lines.append(new_block if new_block else "(none)") + lines.append("") - lines.append(title) - block = to_text(groups).rstrip() - lines.append(block if block else "(none)") + lines.append(f"{title} (KNOWN) (groups={len(split['known'])})") + known_block = to_text(known_groups, metric_name=metric_name).rstrip() + lines.append(known_block if known_block else "(none)") return "\n".join(lines).rstrip() + "\n" diff --git a/codeclone/_report_types.py b/codeclone/_report_types.py index 6fbe632..79a732f 100644 --- a/codeclone/_report_types.py +++ b/codeclone/_report_types.py @@ -10,5 +10,9 @@ from typing import Any +# Any: report items aggregate heterogeneous JSON-like payloads from multiple +# pipelines (function/block/segment) and are narrowed at access sites. GroupItem = dict[str, Any] + + GroupMap = dict[str, list[GroupItem]] diff --git a/codeclone/baseline.py b/codeclone/baseline.py index 4e0894f..b63f88c 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline.py @@ -11,29 +11,101 @@ import hashlib import hmac import json +import os +import re +import sys from collections.abc import Mapping from datetime import datetime, timezone +from enum import Enum from pathlib import Path -from typing import Any +from typing import Any, Final from . import __version__ +from .contracts import ( + BASELINE_FINGERPRINT_VERSION, + BASELINE_SCHEMA_VERSION, +) from .errors import BaselineValidationError -BASELINE_SCHEMA_VERSION = 1 -MAX_BASELINE_SIZE_BYTES = 5 * 1024 * 1024 +# Any: baseline JSON parsing/serialization boundary. Values are validated +# and narrowed before entering compatibility/integrity checks. 
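+
+# Illustrative clone-id shapes accepted by the validators below
+# (hex runs are placeholders, not real digests):
+#   function id: "<40-hex>|<start>-<end>"  or  "<40-hex>|<start>+"
+#   block id:    "<40-hex>|<40-hex>|<40-hex>|<40-hex>"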
+ BASELINE_GENERATOR = "codeclone" +BASELINE_SCHEMA_MAJOR = 1 +BASELINE_SCHEMA_MAX_MINOR = 0 +MAX_BASELINE_SIZE_BYTES = 5 * 1024 * 1024 + + +class BaselineStatus(str, Enum): + OK = "ok" + MISSING = "missing" + TOO_LARGE = "too_large" + INVALID_JSON = "invalid_json" + INVALID_TYPE = "invalid_type" + MISSING_FIELDS = "missing_fields" + MISMATCH_SCHEMA_VERSION = "mismatch_schema_version" + MISMATCH_FINGERPRINT_VERSION = "mismatch_fingerprint_version" + MISMATCH_PYTHON_VERSION = "mismatch_python_version" + GENERATOR_MISMATCH = "generator_mismatch" + INTEGRITY_MISSING = "integrity_missing" + INTEGRITY_FAILED = "integrity_failed" + + +BASELINE_UNTRUSTED_STATUSES: Final[frozenset[BaselineStatus]] = frozenset( + { + BaselineStatus.MISSING, + BaselineStatus.TOO_LARGE, + BaselineStatus.INVALID_JSON, + BaselineStatus.INVALID_TYPE, + BaselineStatus.MISSING_FIELDS, + BaselineStatus.MISMATCH_SCHEMA_VERSION, + BaselineStatus.MISMATCH_FINGERPRINT_VERSION, + BaselineStatus.MISMATCH_PYTHON_VERSION, + BaselineStatus.GENERATOR_MISMATCH, + BaselineStatus.INTEGRITY_MISSING, + BaselineStatus.INTEGRITY_FAILED, + } +) + + +def coerce_baseline_status( + raw_status: str | BaselineStatus | None, +) -> BaselineStatus: + if isinstance(raw_status, BaselineStatus): + return raw_status + if isinstance(raw_status, str): + try: + return BaselineStatus(raw_status) + except ValueError: + return BaselineStatus.INVALID_TYPE + return BaselineStatus.INVALID_TYPE + + +_TOP_LEVEL_KEYS = {"meta", "clones"} +_META_REQUIRED_KEYS = { + "generator", + "schema_version", + "fingerprint_version", + "python_tag", + "created_at", + "payload_sha256", +} +_CLONES_REQUIRED_KEYS = {"functions", "blocks"} +_FUNCTION_ID_RE = re.compile(r"^[0-9a-f]{40}\|(?:\d+-\d+|\d+\+)$") +_BLOCK_ID_RE = re.compile(r"^[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}\|[0-9a-f]{40}$") class Baseline: __slots__ = ( - "baseline_version", "blocks", "created_at", + "fingerprint_version", "functions", "generator", + "generator_version", "path", "payload_sha256", - "python_version", + "python_tag", "schema_version", ) @@ -41,104 +113,207 @@ def __init__(self, path: str | Path): self.path = Path(path) self.functions: set[str] = set() self.blocks: set[str] = set() - self.python_version: str | None = None - self.baseline_version: str | None = None - self.schema_version: int | None = None self.generator: str | None = None - self.payload_sha256: str | None = None + self.schema_version: str | None = None + self.fingerprint_version: str | None = None + self.python_tag: str | None = None self.created_at: str | None = None + self.payload_sha256: str | None = None + self.generator_version: str | None = None def load(self, *, max_size_bytes: int | None = None) -> None: - if not self.path.exists(): - return - size_limit = ( - MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes - ) - try: - size = self.path.stat().st_size + exists = self.path.exists() except OSError as e: raise BaselineValidationError( - f"Cannot stat baseline file at {self.path}: {e}" + f"Cannot stat baseline file at {self.path}: {e}", + status=BaselineStatus.INVALID_TYPE, ) from e + if not exists: + return + + size_limit = ( + MAX_BASELINE_SIZE_BYTES if max_size_bytes is None else max_size_bytes + ) + size = _safe_stat_size(self.path) if size > size_limit: raise BaselineValidationError( "Baseline file is too large " - f"({size} bytes, max {size_limit} bytes) at {self.path}", - status="too_large", + f"({size} bytes, max {size_limit} bytes) at {self.path}. 
" + "Increase --max-baseline-size-mb or regenerate baseline.", + status=BaselineStatus.TOO_LARGE, ) - try: - data = json.loads(self.path.read_text("utf-8")) - except json.JSONDecodeError as e: + payload = _load_json_object(self.path) + if _is_legacy_baseline_payload(payload): raise BaselineValidationError( - f"Corrupted baseline file at {self.path}: {e}" - ) from e + "Baseline format is legacy (<=1.3.x) and must be regenerated. " + "Please run --update-baseline.", + status=BaselineStatus.MISSING_FIELDS, + ) + + _validate_top_level_structure(payload, path=self.path) - if not isinstance(data, dict): + meta_obj = payload.get("meta") + clones_obj = payload.get("clones") + if not isinstance(meta_obj, dict): + raise BaselineValidationError( + f"Invalid baseline schema at {self.path}: 'meta' must be object", + status=BaselineStatus.INVALID_TYPE, + ) + if not isinstance(clones_obj, dict): raise BaselineValidationError( - f"Baseline payload must be an object at {self.path}" + f"Invalid baseline schema at {self.path}: 'clones' must be object", + status=BaselineStatus.INVALID_TYPE, ) - functions = _require_str_list(data, "functions", path=self.path) - blocks = _require_str_list(data, "blocks", path=self.path) - python_version = _optional_str(data, "python_version", path=self.path) - baseline_version = _optional_str(data, "baseline_version", path=self.path) - schema_version = _optional_int(data, "schema_version", path=self.path) - generator = _optional_str_loose(data, "generator") - payload_sha256 = _optional_str_loose(data, "payload_sha256") - created_at = _optional_str(data, "created_at", path=self.path) + _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path) + _validate_required_keys(clones_obj, _CLONES_REQUIRED_KEYS, path=self.path) + _validate_exact_clone_keys(clones_obj, path=self.path) + + generator, generator_version = _parse_generator_meta(meta_obj, path=self.path) + schema_version = _require_semver_str(meta_obj, "schema_version", path=self.path) + fingerprint_version = _require_str( + meta_obj, "fingerprint_version", path=self.path + ) + python_tag = _require_python_tag(meta_obj, "python_tag", path=self.path) + created_at = _require_utc_iso8601_z(meta_obj, "created_at", path=self.path) + payload_sha256 = _require_str(meta_obj, "payload_sha256", path=self.path) + + function_ids = _require_sorted_unique_ids( + clones_obj, + "functions", + pattern=_FUNCTION_ID_RE, + path=self.path, + ) + block_ids = _require_sorted_unique_ids( + clones_obj, + "blocks", + pattern=_BLOCK_ID_RE, + path=self.path, + ) - self.functions = set(functions) - self.blocks = set(blocks) - self.python_version = python_version - self.baseline_version = baseline_version - self.schema_version = schema_version self.generator = generator - self.payload_sha256 = payload_sha256 + self.schema_version = schema_version + self.fingerprint_version = fingerprint_version + self.python_tag = python_tag self.created_at = created_at + self.payload_sha256 = payload_sha256 + self.generator_version = generator_version + self.functions = set(function_ids) + self.blocks = set(block_ids) def save(self) -> None: self.path.parent.mkdir(parents=True, exist_ok=True) - now_utc = datetime.now(timezone.utc).replace(microsecond=0).isoformat() - self.path.write_text( - json.dumps( - _baseline_payload( - self.functions, - self.blocks, - self.python_version, - self.baseline_version, - self.schema_version, - self.generator, - now_utc, - ), - indent=2, - ensure_ascii=False, - ), - "utf-8", - ) - - def is_legacy_format(self) -> bool: - return 
self.baseline_version is None or self.schema_version is None
+        payload = _baseline_payload(
+            functions=self.functions,
+            blocks=self.blocks,
+            generator=self.generator,
+            schema_version=self.schema_version,
+            fingerprint_version=self.fingerprint_version,
+            python_tag=self.python_tag,
+            generator_version=self.generator_version,
+            created_at=self.created_at,
+        )
+        _atomic_write_json(self.path, payload)

-    def verify_integrity(self) -> None:
-        if self.is_legacy_format():
-            return
+    def verify_compatibility(self, *, current_python_tag: str) -> None:
         if self.generator != BASELINE_GENERATOR:
             raise BaselineValidationError(
                 "Baseline generator mismatch: expected 'codeclone'.",
-                status="generator_mismatch",
+                status=BaselineStatus.GENERATOR_MISMATCH,
+            )
+        if self.schema_version is None:
+            raise BaselineValidationError(
+                "Baseline schema version is missing.",
+                status=BaselineStatus.MISSING_FIELDS,
+            )
+        if self.fingerprint_version is None:
+            raise BaselineValidationError(
+                "Baseline fingerprint version is missing.",
+                status=BaselineStatus.MISSING_FIELDS,
+            )
+        if self.python_tag is None:
+            raise BaselineValidationError(
+                "Baseline python_tag is missing.",
+                status=BaselineStatus.MISSING_FIELDS,
+            )
+
+        schema_major, schema_minor, _ = _parse_semver(
+            self.schema_version, key="schema_version", path=self.path
+        )
+        if schema_major != BASELINE_SCHEMA_MAJOR:
+            raise BaselineValidationError(
+                "Baseline schema version mismatch: "
+                f"baseline={self.schema_version}, "
+                f"supported_major={BASELINE_SCHEMA_MAJOR}.",
+                status=BaselineStatus.MISMATCH_SCHEMA_VERSION,
+            )
+        if schema_minor > BASELINE_SCHEMA_MAX_MINOR:
+            raise BaselineValidationError(
+                "Baseline schema version is newer than supported: "
+                f"baseline={self.schema_version}, "
+                f"max=1.{BASELINE_SCHEMA_MAX_MINOR}.",
+                status=BaselineStatus.MISMATCH_SCHEMA_VERSION,
+            )
+        if self.fingerprint_version != BASELINE_FINGERPRINT_VERSION:
+            raise BaselineValidationError(
+                "Baseline fingerprint version mismatch: "
+                f"baseline={self.fingerprint_version}, "
+                f"expected={BASELINE_FINGERPRINT_VERSION}.",
+                status=BaselineStatus.MISMATCH_FINGERPRINT_VERSION,
+            )
+        if self.python_tag != current_python_tag:
+            raise BaselineValidationError(
+                "Baseline python tag mismatch: "
+                f"baseline={self.python_tag}, current={current_python_tag}.",
+                status=BaselineStatus.MISMATCH_PYTHON_VERSION,
             )
+        self.verify_integrity()
+
+    def verify_integrity(self) -> None:
         if not isinstance(self.payload_sha256, str):
             raise BaselineValidationError(
                 "Baseline integrity payload hash is missing.",
-                status="integrity_missing",
+                status=BaselineStatus.INTEGRITY_MISSING,
+            )
+        if len(self.payload_sha256) != 64:
+            raise BaselineValidationError(
+                "Baseline integrity payload hash is malformed.",
+                status=BaselineStatus.INTEGRITY_MISSING,
+            )
+        try:
+            int(self.payload_sha256, 16)
+        except ValueError as e:
+            raise BaselineValidationError(
+                "Baseline integrity payload hash is malformed.",
+                status=BaselineStatus.INTEGRITY_MISSING,
+            ) from e
+        if self.schema_version is None:
+            raise BaselineValidationError(
+                "Baseline schema version is missing for integrity validation.",
+                status=BaselineStatus.MISSING_FIELDS,
             )
-        expected = _compute_payload_sha256(self.functions, self.blocks)
+        if self.fingerprint_version is None:
+            raise BaselineValidationError(
+                "Baseline fingerprint version is missing for integrity validation.",
+                status=BaselineStatus.MISSING_FIELDS,
+            )
+        if self.python_tag is None:
+            raise BaselineValidationError(
+                "Baseline python_tag is missing for integrity validation.",
status=BaselineStatus.MISSING_FIELDS, + ) + expected = _compute_payload_sha256( + functions=self.functions, + blocks=self.blocks, + fingerprint_version=self.fingerprint_version, + python_tag=self.python_tag, + ) if not hmac.compare_digest(self.payload_sha256, expected): raise BaselineValidationError( "Baseline integrity check failed: payload_sha256 mismatch.", - status="integrity_failed", + status=BaselineStatus.INTEGRITY_FAILED, ) @staticmethod @@ -146,18 +321,22 @@ def from_groups( func_groups: Mapping[str, object], block_groups: Mapping[str, object], path: str | Path = "", - python_version: str | None = None, - baseline_version: str | None = None, - schema_version: int | None = None, + schema_version: str | None = None, + fingerprint_version: str | None = None, + python_tag: str | None = None, + generator_version: str | None = None, ) -> Baseline: - bl = Baseline(path) - bl.functions = set(func_groups.keys()) - bl.blocks = set(block_groups.keys()) - bl.python_version = python_version - bl.baseline_version = baseline_version - bl.schema_version = schema_version - bl.generator = BASELINE_GENERATOR - return bl + baseline = Baseline(path) + baseline.functions = set(func_groups.keys()) + baseline.blocks = set(block_groups.keys()) + baseline.generator = BASELINE_GENERATOR + baseline.schema_version = schema_version or BASELINE_SCHEMA_VERSION + baseline.fingerprint_version = ( + fingerprint_version or BASELINE_FINGERPRINT_VERSION + ) + baseline.python_tag = python_tag or current_python_tag() + baseline.generator_version = generator_version or __version__ + return baseline def diff( self, func_groups: Mapping[str, object], block_groups: Mapping[str, object] @@ -167,39 +346,194 @@ def diff( return new_funcs, new_blocks +def _atomic_write_json(path: Path, payload: dict[str, Any]) -> None: + tmp_path = path.with_name(f"{path.name}.tmp") + data = json.dumps(payload, indent=2, ensure_ascii=False) + "\n" + with tmp_path.open("wb") as tmp_file: + tmp_file.write(data.encode("utf-8")) + tmp_file.flush() + os.fsync(tmp_file.fileno()) + os.replace(tmp_path, path) + + +def _safe_stat_size(path: Path) -> int: + try: + return path.stat().st_size + except OSError as e: + raise BaselineValidationError( + f"Cannot stat baseline file at {path}: {e}", + status=BaselineStatus.INVALID_TYPE, + ) from e + + +def _load_json_object(path: Path) -> dict[str, Any]: + try: + raw = path.read_text("utf-8") + except OSError as e: + raise BaselineValidationError( + f"Cannot read baseline file at {path}: {e}", + status=BaselineStatus.INVALID_JSON, + ) from e + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise BaselineValidationError( + f"Corrupted baseline file at {path}: {e}", + status=BaselineStatus.INVALID_JSON, + ) from e + if not isinstance(data, dict): + raise BaselineValidationError( + f"Baseline payload must be an object at {path}", + status=BaselineStatus.INVALID_TYPE, + ) + return data + + +def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: + keys = set(payload.keys()) + missing = _TOP_LEVEL_KEYS - keys + extra = keys - _TOP_LEVEL_KEYS + if missing: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: missing top-level keys: " + f"{', '.join(sorted(missing))}", + status=BaselineStatus.MISSING_FIELDS, + ) + if extra: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: unexpected top-level keys: " + f"{', '.join(sorted(extra))}", + status=BaselineStatus.INVALID_TYPE, + ) + + +def _validate_required_keys( + obj: dict[str, Any], 
required: set[str], *, path: Path +) -> None: + missing = required - set(obj.keys()) + if missing: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: missing required fields: " + f"{', '.join(sorted(missing))}", + status=BaselineStatus.MISSING_FIELDS, + ) + + +def _validate_exact_clone_keys(clones: dict[str, Any], *, path: Path) -> None: + keys = set(clones.keys()) + extra = keys - _CLONES_REQUIRED_KEYS + if extra: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: unexpected clone keys: " + f"{', '.join(sorted(extra))}", + status=BaselineStatus.INVALID_TYPE, + ) + + +def _is_legacy_baseline_payload(payload: dict[str, Any]) -> bool: + return "functions" in payload and "blocks" in payload + + +def _parse_generator_meta( + meta_obj: dict[str, Any], *, path: Path +) -> tuple[str, str | None]: + raw_generator = meta_obj.get("generator") + + if isinstance(raw_generator, str): + generator_version = _optional_str(meta_obj, "generator_version", path=path) + if generator_version is None: + # Legacy alias for baselines produced before generator_version rename. + generator_version = _optional_str(meta_obj, "codeclone_version", path=path) + return raw_generator, generator_version + + if isinstance(raw_generator, dict): + allowed_keys = {"name", "version"} + extra = set(raw_generator.keys()) - allowed_keys + if extra: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: unexpected generator keys: " + f"{', '.join(sorted(extra))}", + status=BaselineStatus.INVALID_TYPE, + ) + generator_name = _require_str(raw_generator, "name", path=path) + generator_version = _optional_str(raw_generator, "version", path=path) + + if generator_version is None: + generator_version = _optional_str(meta_obj, "generator_version", path=path) + if generator_version is None: + generator_version = _optional_str( + meta_obj, "codeclone_version", path=path + ) + + return generator_name, generator_version + + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'generator' must be string or object", + status=BaselineStatus.INVALID_TYPE, + ) + + def _baseline_payload( + *, functions: set[str], blocks: set[str], - python_version: str | None, - baseline_version: str | None, - schema_version: int | None, generator: str | None, + schema_version: str | None, + fingerprint_version: str | None, + python_tag: str | None, + generator_version: str | None, created_at: str | None, ) -> dict[str, Any]: - payload: dict[str, Any] = _canonical_payload(functions, blocks) - if python_version: - payload["python_version"] = python_version - payload["baseline_version"] = baseline_version or __version__ - payload["schema_version"] = ( - schema_version if schema_version is not None else BASELINE_SCHEMA_VERSION + resolved_generator = generator or BASELINE_GENERATOR + resolved_schema = schema_version or BASELINE_SCHEMA_VERSION + resolved_fingerprint = fingerprint_version or BASELINE_FINGERPRINT_VERSION + resolved_python_tag = python_tag or current_python_tag() + resolved_generator_version = generator_version or __version__ + resolved_created_at = created_at or _utc_now_z() + + sorted_functions = sorted(functions) + sorted_blocks = sorted(blocks) + payload_sha256 = _compute_payload_sha256( + functions=set(sorted_functions), + blocks=set(sorted_blocks), + fingerprint_version=resolved_fingerprint, + python_tag=resolved_python_tag, ) - payload["generator"] = generator or BASELINE_GENERATOR - payload["payload_sha256"] = _compute_payload_sha256(functions, blocks) - if created_at: - 
payload["created_at"] = created_at - return payload - -def _canonical_payload(functions: set[str], blocks: set[str]) -> dict[str, list[str]]: return { - "functions": sorted(functions), - "blocks": sorted(blocks), + "meta": { + "generator": { + "name": resolved_generator, + "version": resolved_generator_version, + }, + "schema_version": resolved_schema, + "fingerprint_version": resolved_fingerprint, + "python_tag": resolved_python_tag, + "created_at": resolved_created_at, + "payload_sha256": payload_sha256, + }, + "clones": { + "functions": sorted_functions, + "blocks": sorted_blocks, + }, } -def _compute_payload_sha256(functions: set[str], blocks: set[str]) -> str: +def _compute_payload_sha256( + *, + functions: set[str], + blocks: set[str], + fingerprint_version: str, + python_tag: str, +) -> str: + canonical = { + "blocks": sorted(blocks), + "fingerprint_version": fingerprint_version, + "functions": sorted(functions), + "python_tag": python_tag, + } serialized = json.dumps( - _canonical_payload(functions, blocks), + canonical, sort_keys=True, separators=(",", ":"), ensure_ascii=False, @@ -207,39 +541,108 @@ def _compute_payload_sha256(functions: set[str], blocks: set[str]) -> str: return hashlib.sha256(serialized.encode("utf-8")).hexdigest() -def _require_str_list(data: dict[str, Any], key: str, *, path: Path) -> list[str]: - value = data.get(key) - if not isinstance(value, list) or not all(isinstance(v, str) for v in value): +def current_python_tag() -> str: + """Return the interpreter compatibility tag as an immutable string.""" + impl = sys.implementation.name + major, minor = sys.version_info[:2] + prefix = "cp" if impl == "cpython" else impl[:2] + return f"{prefix}{major}{minor}" + + +def _utc_now_z() -> str: + return ( + datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ") + ) + + +def _require_str(obj: dict[str, Any], key: str, *, path: Path) -> str: + value = obj.get(key) + if not isinstance(value, str): raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be list[str]" + f"Invalid baseline schema at {path}: '{key}' must be string", + status=BaselineStatus.INVALID_TYPE, ) return value -def _optional_str(data: dict[str, Any], key: str, *, path: Path) -> str | None: - value = data.get(key) +def _optional_str(obj: dict[str, Any], key: str, *, path: Path) -> str | None: + value = obj.get(key) if value is None: return None if not isinstance(value, str): raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be string" + f"Invalid baseline schema at {path}: '{key}' must be string", + status=BaselineStatus.INVALID_TYPE, ) return value -def _optional_int(data: dict[str, Any], key: str, *, path: Path) -> int | None: - value = data.get(key) - if value is None: - return None - if not isinstance(value, int): +def _require_semver_str(obj: dict[str, Any], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + _parse_semver(value, key=key, path=path) + return value + + +def _parse_semver(value: str, *, key: str, path: Path) -> tuple[int, int, int]: + parts = value.split(".") + if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be semver string", + status=BaselineStatus.INVALID_TYPE, + ) + if len(parts) == 2: + major, minor = int(parts[0]), int(parts[1]) + patch = 0 + else: + major, minor, patch = int(parts[0]), int(parts[1]), int(parts[2]) + return major, minor, patch + + +def 
_require_python_tag(obj: dict[str, Any], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + if not re.fullmatch(r"[a-z]{2}\d{2,3}", value): raise BaselineValidationError( - f"Invalid baseline schema at {path}: '{key}' must be integer" + f"Invalid baseline schema at {path}: '{key}' must look like 'cp313'", + status=BaselineStatus.INVALID_TYPE, ) return value -def _optional_str_loose(data: dict[str, Any], key: str) -> str | None: - value = data.get(key) - if isinstance(value, str): - return value - return None +def _require_utc_iso8601_z(obj: dict[str, Any], key: str, *, path: Path) -> str: + value = _require_str(obj, key, path=path) + try: + datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ") + except ValueError as e: + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be UTC ISO-8601 with Z", + status=BaselineStatus.INVALID_TYPE, + ) from e + return value + + +def _require_sorted_unique_ids( + obj: dict[str, Any], key: str, *, pattern: re.Pattern[str], path: Path +) -> list[str]: + value = obj.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be list[str]", + status=BaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in value): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be list[str]", + status=BaselineStatus.INVALID_TYPE, + ) + values = list(value) + if values != sorted(values) or len(values) != len(set(values)): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' must be sorted and unique", + status=BaselineStatus.INVALID_TYPE, + ) + if not all(pattern.fullmatch(item) for item in values): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: '{key}' has invalid id format", + status=BaselineStatus.INVALID_TYPE, + ) + return values diff --git a/codeclone/cache.py b/codeclone/cache.py index 566e82e..3753148 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -12,20 +12,34 @@ import hmac import json import os -import secrets -from collections.abc import Mapping -from dataclasses import asdict +from collections.abc import Mapping, Sequence +from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, TypedDict, cast +from typing import TYPE_CHECKING, TypedDict if TYPE_CHECKING: from .blocks import BlockUnit, SegmentUnit from .extractor import Unit +from .baseline import current_python_tag +from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION from .errors import CacheError -OS_NAME = os.name MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 +LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" + + +class CacheStatus(str, Enum): + OK = "ok" + MISSING = "missing" + TOO_LARGE = "too_large" + UNREADABLE = "unreadable" + INVALID_JSON = "invalid_json" + INVALID_TYPE = "invalid_type" + VERSION_MISMATCH = "version_mismatch" + PYTHON_TAG_MISMATCH = "python_tag_mismatch" + FINGERPRINT_MISMATCH = "mismatch_fingerprint_version" + INTEGRITY_FAILED = "integrity_failed" class FileStat(TypedDict): @@ -72,115 +86,331 @@ class CacheEntry(TypedDict): class CacheData(TypedDict): version: str + python_tag: str + fingerprint_version: str files: dict[str, CacheEntry] class Cache: - __slots__ = ("data", "load_warning", "max_size_bytes", "path", "secret") - CACHE_VERSION = "1.1" + __slots__ = ( + "cache_schema_version", + "data", + "fingerprint_version", + "legacy_secret_warning", + "load_status", + "load_warning", + "max_size_bytes", + "path", 
+ "root", + ) + + _CACHE_VERSION = CACHE_VERSION - def __init__(self, path: str | Path, *, max_size_bytes: int | None = None): + def __init__( + self, + path: str | Path, + *, + root: str | Path | None = None, + max_size_bytes: int | None = None, + ): self.path = Path(path) - self.data: CacheData = {"version": self.CACHE_VERSION, "files": {}} - self.secret = self._load_secret() - self.load_warning: str | None = None + self.root = _resolve_root(root) + self.fingerprint_version = BASELINE_FINGERPRINT_VERSION + self.data: CacheData = _empty_cache_data( + version=self._CACHE_VERSION, + python_tag=current_python_tag(), + fingerprint_version=self.fingerprint_version, + ) + self.legacy_secret_warning = self._detect_legacy_secret_warning() + self.cache_schema_version: str | None = None + self.load_status = CacheStatus.MISSING + self.load_warning: str | None = self.legacy_secret_warning self.max_size_bytes = ( MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes ) - def _load_secret(self) -> bytes: - """Load or create cache signing secret.""" - # Store secret in the same directory as the cache file, named .cache_secret - # If cache is at ~/.cache/codeclone/cache.json, secret is - # ~/.cache/codeclone/.cache_secret - secret_path = self.path.parent / ".cache_secret" - if secret_path.exists(): - return secret_path.read_bytes() - else: - secret = secrets.token_bytes(32) - try: - self.path.parent.mkdir(parents=True, exist_ok=True) - secret_path.write_bytes(secret) - # Set restrictive permissions on secret file (Unix only) - if OS_NAME == "posix": - secret_path.chmod(0o600) - except OSError: - pass - return secret - - def _sign_data(self, data: Mapping[str, Any]) -> str: - """Create HMAC signature of cache data.""" - # Sort keys for deterministic JSON serialization - data_str = json.dumps(data, sort_keys=True) - return hmac.new(self.secret, data_str.encode(), hashlib.sha256).hexdigest() + def _detect_legacy_secret_warning(self) -> str | None: + secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME + try: + if secret_path.exists(): + return ( + f"Legacy cache secret file detected at {secret_path}; " + "delete this obsolete file." 
+                )
+        except OSError as e:
+            return f"Legacy cache secret check failed: {e}"
+        return None
+
+    def _set_load_warning(self, message: str | None) -> None:
+        if message is None:
+            self.load_warning = self.legacy_secret_warning
+            return
+        if self.legacy_secret_warning:
+            self.load_warning = f"{message}\n{self.legacy_secret_warning}"
+            return
+        self.load_warning = message
+
+    def _ignore_cache(
+        self,
+        message: str,
+        *,
+        status: CacheStatus,
+        schema_version: str | None = None,
+    ) -> None:
+        self._set_load_warning(message)
+        self.load_status = status
+        self.cache_schema_version = schema_version
+        self.data = _empty_cache_data(
+            version=self._CACHE_VERSION,
+            python_tag=current_python_tag(),
+            fingerprint_version=self.fingerprint_version,
+        )
+
+    def _sign_data(self, data: Mapping[str, object]) -> str:
+        """Create a deterministic SHA-256 integrity checksum over the
+        canonical payload (a content hash, not a keyed signature)."""
+        canonical = _canonical_json(data)
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()

     def load(self) -> None:
-        if not self.path.exists():
+        try:
+            exists = self.path.exists()
+        except OSError as e:
+            self._ignore_cache(
+                f"Cache unreadable; ignoring cache: {e}",
+                status=CacheStatus.UNREADABLE,
+            )
+            return
+
+        if not exists:
+            self._set_load_warning(None)
+            self.load_status = CacheStatus.MISSING
+            self.cache_schema_version = None
             return

         try:
             size = self.path.stat().st_size
             if size > self.max_size_bytes:
-                self.load_warning = (
+                self._ignore_cache(
                     "Cache file too large "
-                    f"({size} bytes, max {self.max_size_bytes}); ignoring cache."
+                    f"({size} bytes, max {self.max_size_bytes}); ignoring cache.",
+                    status=CacheStatus.TOO_LARGE,
                 )
-                self.data = {"version": self.CACHE_VERSION, "files": {}}
                 return

-            raw = json.loads(self.path.read_text("utf-8"))
-            stored_sig = raw.get("_signature")
+            raw_obj: object = json.loads(self.path.read_text("utf-8"))
+            parsed = self._parse_cache_document(raw_obj)
+            if parsed is None:
+                return
+            self.data = parsed
+            self.load_status = CacheStatus.OK
+            self._set_load_warning(None)

-            # Extract data without signature for verification
-            data = {k: v for k, v in raw.items() if k != "_signature"}
+        except OSError as e:
+            self._ignore_cache(
+                f"Cache unreadable; ignoring cache: {e}",
+                status=CacheStatus.UNREADABLE,
+            )
+        except json.JSONDecodeError:
+            self._ignore_cache(
+                "Cache corrupted; ignoring cache.",
+                status=CacheStatus.INVALID_JSON,
+            )

-            # Verify signature
-            expected_sig = self._sign_data(data)
-            if not (
-                isinstance(stored_sig, str)
-                and hmac.compare_digest(stored_sig, expected_sig)
-            ):
-                self.load_warning = "Cache signature mismatch; ignoring cache."
-                self.data = {"version": self.CACHE_VERSION, "files": {}}
-                return
+    def _parse_cache_document(self, raw_obj: object) -> CacheData | None:
+        raw = _as_str_dict(raw_obj)
+        if raw is None:
+            self._ignore_cache(
+                "Cache format invalid; ignoring cache.",
+                status=CacheStatus.INVALID_TYPE,
+            )
+            return None

-            if data.get("version") != self.CACHE_VERSION:
-                self.load_warning = (
-                    "Cache version mismatch "
-                    f"(found {data.get('version')}); ignoring cache."
-                )
-                self.data = {"version": self.CACHE_VERSION, "files": {}}
-                return
+        # Legacy cache format: top-level {version, files, _signature}.
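+        # A legacy document looked roughly like (illustrative, abridged):
+        #   {"version": "1.1", "files": {...}, "_signature": "<hmac hex>"}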
+ legacy_version = _as_str(raw.get("version")) + if legacy_version is not None: + self._ignore_cache( + f"Cache version mismatch (found {legacy_version}); ignoring cache.", + status=CacheStatus.VERSION_MISMATCH, + schema_version=legacy_version, + ) + return None - # Basic structure check - if not isinstance(data.get("files"), dict): - self.load_warning = "Cache format invalid; ignoring cache." - self.data = {"version": self.CACHE_VERSION, "files": {}} - return + version = _as_str(raw.get("v")) + if version is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + ) + return None + + if version != self._CACHE_VERSION: + self._ignore_cache( + f"Cache version mismatch (found {version}); ignoring cache.", + status=CacheStatus.VERSION_MISMATCH, + schema_version=version, + ) + return None + + sig = _as_str(raw.get("sig")) + payload_obj = raw.get("payload") + payload = _as_str_dict(payload_obj) + if sig is None or payload is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=version, + ) + return None + + expected_sig = self._sign_data(payload) + if not hmac.compare_digest(sig, expected_sig): + self._ignore_cache( + "Cache signature mismatch; ignoring cache.", + status=CacheStatus.INTEGRITY_FAILED, + schema_version=version, + ) + return None - self.data = cast(CacheData, cast(object, data)) - self.load_warning = None + runtime_tag = current_python_tag() + py_tag = _as_str(payload.get("py")) + if py_tag is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=version, + ) + return None + + if py_tag != runtime_tag: + self._ignore_cache( + "Cache python tag mismatch " + f"(found {py_tag}, expected {runtime_tag}); ignoring cache.", + status=CacheStatus.PYTHON_TAG_MISMATCH, + schema_version=version, + ) + return None - except (json.JSONDecodeError, ValueError): - self.load_warning = "Cache corrupted; ignoring cache." 
- self.data = {"version": self.CACHE_VERSION, "files": {}} + fp_version = _as_str(payload.get("fp")) + if fp_version is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=version, + ) + return None + + if fp_version != self.fingerprint_version: + self._ignore_cache( + "Cache fingerprint version mismatch " + f"(found {fp_version}, expected {self.fingerprint_version}); " + "ignoring cache.", + status=CacheStatus.FINGERPRINT_MISMATCH, + schema_version=version, + ) + return None + + files_obj = payload.get("files") + files_dict = _as_str_dict(files_obj) + if files_dict is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=version, + ) + return None + + parsed_files: dict[str, CacheEntry] = {} + for wire_path, file_entry_obj in files_dict.items(): + runtime_path = self._runtime_filepath_from_wire(wire_path) + parsed_entry = _decode_wire_file_entry(file_entry_obj, runtime_path) + if parsed_entry is None: + self._ignore_cache( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=version, + ) + return None + parsed_files[runtime_path] = parsed_entry + + self.cache_schema_version = version + return { + "version": self._CACHE_VERSION, + "python_tag": runtime_tag, + "fingerprint_version": self.fingerprint_version, + "files": parsed_files, + } def save(self) -> None: try: self.path.parent.mkdir(parents=True, exist_ok=True) + wire_files: dict[str, object] = {} + for runtime_path in sorted( + self.data["files"], key=self._wire_filepath_from_runtime + ): + entry = self.get_file_entry(runtime_path) + if entry is None: + continue + wire_path = self._wire_filepath_from_runtime(runtime_path) + wire_files[wire_path] = _encode_wire_file_entry(entry) + + payload: dict[str, object] = { + "py": current_python_tag(), + "fp": self.fingerprint_version, + "files": wire_files, + } + signed_doc = { + "v": self._CACHE_VERSION, + "payload": payload, + "sig": self._sign_data(payload), + } + + tmp_path = self.path.with_name(f"{self.path.name}.tmp") + tmp_path.write_text(_canonical_json(signed_doc), "utf-8") + os.replace(tmp_path, self.path) + + self.data["version"] = self._CACHE_VERSION + self.data["python_tag"] = current_python_tag() + self.data["fingerprint_version"] = self.fingerprint_version - # Add signature - data_with_sig = {**self.data, "_signature": self._sign_data(self.data)} - - self.path.write_text( - json.dumps(data_with_sig, ensure_ascii=False, indent=2), - "utf-8", - ) except OSError as e: raise CacheError(f"Failed to save cache: {e}") from e + def _wire_filepath_from_runtime(self, runtime_filepath: str) -> str: + runtime_path = Path(runtime_filepath) + if self.root is None: + return runtime_path.as_posix() + + try: + relative = runtime_path.relative_to(self.root) + return relative.as_posix() + except ValueError: + pass + + try: + relative = runtime_path.resolve().relative_to(self.root.resolve()) + return relative.as_posix() + except OSError: + return runtime_path.as_posix() + except ValueError: + return runtime_path.as_posix() + + def _runtime_filepath_from_wire(self, wire_filepath: str) -> str: + wire_path = Path(wire_filepath) + if self.root is None or wire_path.is_absolute(): + return str(wire_path) + + combined = self.root / wire_path + try: + return str(combined.resolve(strict=False)) + except OSError: + return str(combined) + def get_file_entry(self, filepath: str) -> CacheEntry | None: entry = 
self.data["files"].get(filepath) + if entry is None: + wire_key = self._wire_filepath_from_runtime(filepath) + runtime_key = self._runtime_filepath_from_wire(wire_key) + entry = self.data["files"].get(runtime_key) if entry is None: return None @@ -214,13 +444,54 @@ def put_file_entry( blocks: list[BlockUnit], segments: list[SegmentUnit], ) -> None: - self.data["files"][filepath] = { + runtime_path = self._runtime_filepath_from_wire( + self._wire_filepath_from_runtime(filepath) + ) + + unit_rows: list[UnitDict] = [ + { + "qualname": unit.qualname, + "filepath": runtime_path, + "start_line": unit.start_line, + "end_line": unit.end_line, + "loc": unit.loc, + "stmt_count": unit.stmt_count, + "fingerprint": unit.fingerprint, + "loc_bucket": unit.loc_bucket, + } + for unit in units + ] + + block_rows: list[BlockDict] = [ + { + "block_hash": block.block_hash, + "filepath": runtime_path, + "qualname": block.qualname, + "start_line": block.start_line, + "end_line": block.end_line, + "size": block.size, + } + for block in blocks + ] + + segment_rows: list[SegmentDict] = [ + { + "segment_hash": segment.segment_hash, + "segment_sig": segment.segment_sig, + "filepath": runtime_path, + "qualname": segment.qualname, + "start_line": segment.start_line, + "end_line": segment.end_line, + "size": segment.size, + } + for segment in segments + ] + + self.data["files"][runtime_path] = { "stat": stat_sig, - "units": cast(list[UnitDict], cast(object, [asdict(u) for u in units])), - "blocks": cast(list[BlockDict], cast(object, [asdict(b) for b in blocks])), - "segments": cast( - list[SegmentDict], cast(object, [asdict(s) for s in segments]) - ), + "units": unit_rows, + "blocks": block_rows, + "segments": segment_rows, } @@ -232,6 +503,286 @@ def file_stat_signature(path: str) -> FileStat: } +def _empty_cache_data( + *, + version: str, + python_tag: str, + fingerprint_version: str, +) -> CacheData: + return { + "version": version, + "python_tag": python_tag, + "fingerprint_version": fingerprint_version, + "files": {}, + } + + +def _canonical_json(data: object) -> str: + return json.dumps(data, sort_keys=True, separators=(",", ":"), ensure_ascii=False) + + +def _as_str(value: object) -> str | None: + return value if isinstance(value, str) else None + + +def _as_int(value: object) -> int | None: + return value if isinstance(value, int) else None + + +def _as_list(value: object) -> list[object] | None: + return value if isinstance(value, list) else None + + +def _as_str_dict(value: object) -> dict[str, object] | None: + if not isinstance(value, dict): + return None + for key in value: + if not isinstance(key, str): + return None + return value + + +def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: + obj = _as_str_dict(value) + if obj is None: + return None + + stat_obj = obj.get("st") + stat_list = _as_list(stat_obj) + if stat_list is None or len(stat_list) != 2: + return None + mtime_ns = _as_int(stat_list[0]) + size = _as_int(stat_list[1]) + if mtime_ns is None or size is None: + return None + + units: list[UnitDict] = [] + blocks: list[BlockDict] = [] + segments: list[SegmentDict] = [] + + units_obj = obj.get("u") + if units_obj is not None: + units_list = _as_list(units_obj) + if units_list is None: + return None + for unit_obj in units_list: + decoded_unit = _decode_wire_unit(unit_obj, filepath) + if decoded_unit is None: + return None + units.append(decoded_unit) + + blocks_obj = obj.get("b") + if blocks_obj is not None: + blocks_list = _as_list(blocks_obj) + if blocks_list is None: 
+ return None + for block_obj in blocks_list: + decoded_block = _decode_wire_block(block_obj, filepath) + if decoded_block is None: + return None + blocks.append(decoded_block) + + segments_obj = obj.get("s") + if segments_obj is not None: + segments_list = _as_list(segments_obj) + if segments_list is None: + return None + for segment_obj in segments_list: + decoded_segment = _decode_wire_segment(segment_obj, filepath) + if decoded_segment is None: + return None + segments.append(decoded_segment) + + return { + "stat": {"mtime_ns": mtime_ns, "size": size}, + "units": units, + "blocks": blocks, + "segments": segments, + } + + +def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: + row = _as_list(value) + if row is None or len(row) != 7: + return None + + qualname = _as_str(row[0]) + start_line = _as_int(row[1]) + end_line = _as_int(row[2]) + loc = _as_int(row[3]) + stmt_count = _as_int(row[4]) + fingerprint = _as_str(row[5]) + loc_bucket = _as_str(row[6]) + + if ( + qualname is None + or start_line is None + or end_line is None + or loc is None + or stmt_count is None + or fingerprint is None + or loc_bucket is None + ): + return None + + return { + "qualname": qualname, + "filepath": filepath, + "start_line": start_line, + "end_line": end_line, + "loc": loc, + "stmt_count": stmt_count, + "fingerprint": fingerprint, + "loc_bucket": loc_bucket, + } + + +def _decode_wire_block(value: object, filepath: str) -> BlockDict | None: + row = _as_list(value) + if row is None or len(row) != 5: + return None + + qualname = _as_str(row[0]) + start_line = _as_int(row[1]) + end_line = _as_int(row[2]) + size = _as_int(row[3]) + block_hash = _as_str(row[4]) + + if ( + qualname is None + or start_line is None + or end_line is None + or size is None + or block_hash is None + ): + return None + + return { + "block_hash": block_hash, + "filepath": filepath, + "qualname": qualname, + "start_line": start_line, + "end_line": end_line, + "size": size, + } + + +def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None: + row = _as_list(value) + if row is None or len(row) != 6: + return None + + qualname = _as_str(row[0]) + start_line = _as_int(row[1]) + end_line = _as_int(row[2]) + size = _as_int(row[3]) + segment_hash = _as_str(row[4]) + segment_sig = _as_str(row[5]) + + if ( + qualname is None + or start_line is None + or end_line is None + or size is None + or segment_hash is None + or segment_sig is None + ): + return None + + return { + "segment_hash": segment_hash, + "segment_sig": segment_sig, + "filepath": filepath, + "qualname": qualname, + "start_line": start_line, + "end_line": end_line, + "size": size, + } + + +def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: + wire: dict[str, object] = { + "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]], + } + + units = sorted( + entry["units"], + key=lambda unit: ( + unit["qualname"], + unit["start_line"], + unit["end_line"], + unit["fingerprint"], + ), + ) + if units: + wire["u"] = [ + [ + unit["qualname"], + unit["start_line"], + unit["end_line"], + unit["loc"], + unit["stmt_count"], + unit["fingerprint"], + unit["loc_bucket"], + ] + for unit in units + ] + + blocks = sorted( + entry["blocks"], + key=lambda block: ( + block["qualname"], + block["start_line"], + block["end_line"], + block["block_hash"], + ), + ) + if blocks: + wire["b"] = [ + [ + block["qualname"], + block["start_line"], + block["end_line"], + block["size"], + block["block_hash"], + ] + for block in blocks + ] + + segments = sorted( + 
entry["segments"], + key=lambda segment: ( + segment["qualname"], + segment["start_line"], + segment["end_line"], + segment["segment_hash"], + ), + ) + if segments: + wire["s"] = [ + [ + segment["qualname"], + segment["start_line"], + segment["end_line"], + segment["size"], + segment["segment_hash"], + segment["segment_sig"], + ] + for segment in segments + ] + + return wire + + +def _resolve_root(root: str | Path | None) -> Path | None: + if root is None: + return None + try: + return Path(root).resolve(strict=False) + except OSError: + return None + + def _is_file_stat_dict(value: object) -> bool: if not isinstance(value, dict): return False @@ -275,10 +826,10 @@ def _is_segment_list(value: object) -> bool: def _has_typed_fields( - value: dict[str, object], + value: Mapping[str, object], *, - string_keys: tuple[str, ...], - int_keys: tuple[str, ...], + string_keys: Sequence[str], + int_keys: Sequence[str], ) -> bool: return all(isinstance(value.get(key), str) for key in string_keys) and all( isinstance(value.get(key), int) for key in int_keys diff --git a/codeclone/cli.py b/codeclone/cli.py index 0ef5832..667197c 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -2,10 +2,11 @@ import os import sys -from concurrent.futures import ProcessPoolExecutor, as_completed +from collections.abc import Mapping, Sequence +from concurrent.futures import Future, ProcessPoolExecutor, as_completed from dataclasses import asdict, dataclass from pathlib import Path -from typing import Any, cast +from typing import TYPE_CHECKING, cast from rich.console import Console from rich.panel import Panel @@ -21,30 +22,44 @@ from . import __version__ from . import ui_messages as ui from ._cli_args import build_parser -from ._cli_meta import _build_report_meta as _build_report_meta_impl -from ._cli_meta import _current_python_version as _current_python_version_impl -from ._cli_paths import _validate_output_path as _validate_output_path_impl -from ._cli_paths import expand_path as _expand_path_impl -from ._cli_summary import _build_summary_rows as _build_summary_rows_impl -from ._cli_summary import _build_summary_table as _build_summary_table_impl -from ._cli_summary import _print_summary as _print_summary_impl -from ._cli_summary import _summary_value_style as _summary_value_style_impl -from .baseline import BASELINE_SCHEMA_VERSION, Baseline -from .cache import Cache, CacheEntry, FileStat, file_stat_signature +from ._cli_meta import _build_report_meta +from ._cli_paths import _validate_output_path +from ._cli_summary import _print_summary +from ._report_types import GroupItem +from .baseline import ( + BASELINE_UNTRUSTED_STATUSES, + Baseline, + BaselineStatus, + coerce_baseline_status, + current_python_tag, +) +from .cache import Cache, CacheEntry, CacheStatus, FileStat, file_stat_signature +from .contracts import ( + BASELINE_FINGERPRINT_VERSION, + BASELINE_SCHEMA_VERSION, + ISSUES_URL, + ExitCode, +) from .errors import BaselineValidationError, CacheError from .extractor import extract_units_from_source from .html_report import build_html_report from .normalize import NormalizationConfig from .report import ( + build_block_group_facts, build_block_groups, build_groups, build_segment_groups, + prepare_block_report_groups, prepare_segment_report_groups, to_json_report, to_text_report, ) from .scanner import iter_py_files, module_name_from_path +if TYPE_CHECKING: + from .blocks import BlockUnit, SegmentUnit + from .extractor import Unit + # Custom theme for Rich custom_theme = Theme( { @@ -68,26 +83,6 @@ def 
_make_console(*, no_color: bool) -> Console: MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB BATCH_SIZE = 100 -_VALID_BASELINE_STATUSES = { - "ok", - "missing", - "legacy", - "invalid", - "mismatch_version", - "mismatch_schema", - "mismatch_python", - "generator_mismatch", - "integrity_missing", - "integrity_failed", - "too_large", -} -_UNTRUSTED_BASELINE_STATUSES = { - "invalid", - "too_large", - "generator_mismatch", - "integrity_missing", - "integrity_failed", -} @dataclass(slots=True) @@ -97,14 +92,11 @@ class ProcessingResult: filepath: str success: bool error: str | None = None - units: list[Any] | None = None - blocks: list[Any] | None = None - segments: list[Any] | None = None + units: list[Unit] | None = None + blocks: list[BlockUnit] | None = None + segments: list[SegmentUnit] | None = None stat: FileStat | None = None - - -def expand_path(p: str) -> Path: - return _expand_path_impl(p) + error_kind: str | None = None def process_file( @@ -138,17 +130,31 @@ def process_file( filepath=filepath, success=False, error=f"File too large: {st_size} bytes (max {MAX_FILE_SIZE})", + error_kind="file_too_large", ) except OSError as e: return ProcessingResult( - filepath=filepath, success=False, error=f"Cannot stat file: {e}" + filepath=filepath, + success=False, + error=f"Cannot stat file: {e}", + error_kind="stat_error", ) try: source = Path(filepath).read_text("utf-8") except UnicodeDecodeError as e: return ProcessingResult( - filepath=filepath, success=False, error=f"Encoding error: {e}" + filepath=filepath, + success=False, + error=f"Encoding error: {e}", + error_kind="source_read_error", + ) + except OSError as e: + return ProcessingResult( + filepath=filepath, + success=False, + error=f"Cannot read file: {e}", + error_kind="source_read_error", ) stat = file_stat_signature(filepath) @@ -177,115 +183,35 @@ def process_file( filepath=filepath, success=False, error=f"Unexpected error: {type(e).__name__}: {e}", + error_kind="unexpected_error", ) def print_banner() -> None: console.print( - Panel.fit( + Panel( ui.banner_title(__version__), border_style="blue", padding=(0, 2), + width=ui.CLI_LAYOUT_WIDTH, + expand=False, ) ) -def _validate_output_path(path: str, *, expected_suffix: str, label: str) -> Path: - return _validate_output_path_impl( - path, - expected_suffix=expected_suffix, - label=label, - console=console, - invalid_message=ui.fmt_invalid_output_extension, - ) - - -def _current_python_version() -> str: - return _current_python_version_impl() - - -def _build_report_meta( +def _is_debug_enabled( *, - baseline_path: Path, - baseline: Baseline, - baseline_loaded: bool, - baseline_status: str, - cache_path: Path, - cache_used: bool, -) -> dict[str, Any]: - return _build_report_meta_impl( - codeclone_version=__version__, - baseline_path=baseline_path, - baseline=baseline, - baseline_loaded=baseline_loaded, - baseline_status=baseline_status, - cache_path=cache_path, - cache_used=cache_used, - ) - - -def _summary_value_style(*, label: str, value: int) -> str: - return _summary_value_style_impl(label=label, value=value) + argv: Sequence[str] | None = None, + environ: Mapping[str, str] | None = None, +) -> bool: + args = list(sys.argv[1:] if argv is None else argv) + debug_from_flag = any(arg == "--debug" for arg in args) + env = os.environ if environ is None else environ + debug_from_env = env.get("CODECLONE_DEBUG") == "1" + return debug_from_flag or debug_from_env -def _build_summary_rows( - *, - files_found: int, - files_analyzed: int, - cache_hits: int, - files_skipped: int, - 
func_clones_count: int, - block_clones_count: int, - segment_clones_count: int, - suppressed_segment_groups: int, - new_clones_count: int, -) -> list[tuple[str, int]]: - return _build_summary_rows_impl( - files_found=files_found, - files_analyzed=files_analyzed, - cache_hits=cache_hits, - files_skipped=files_skipped, - func_clones_count=func_clones_count, - block_clones_count=block_clones_count, - segment_clones_count=segment_clones_count, - suppressed_segment_groups=suppressed_segment_groups, - new_clones_count=new_clones_count, - ) - - -def _build_summary_table(rows: list[tuple[str, int]]) -> Any: - return _build_summary_table_impl(rows) - - -def _print_summary( - *, - quiet: bool, - files_found: int, - files_analyzed: int, - cache_hits: int, - files_skipped: int, - func_clones_count: int, - block_clones_count: int, - segment_clones_count: int, - suppressed_segment_groups: int, - new_clones_count: int, -) -> None: - _print_summary_impl( - console=console, - quiet=quiet, - files_found=files_found, - files_analyzed=files_analyzed, - cache_hits=cache_hits, - files_skipped=files_skipped, - func_clones_count=func_clones_count, - block_clones_count=block_clones_count, - segment_clones_count=segment_clones_count, - suppressed_segment_groups=suppressed_segment_groups, - new_clones_count=new_clones_count, - ) - - -def main() -> None: +def _main_impl() -> None: ap = build_parser(__version__) cache_path_from_args = any( @@ -307,8 +233,10 @@ def main() -> None: console = _make_console(no_color=args.no_color) if args.max_baseline_size_mb < 0 or args.max_cache_size_mb < 0: - console.print("[error]Size limits must be non-negative integers (MB).[/error]") - sys.exit(1) + console.print( + ui.fmt_contract_error("Size limits must be non-negative integers (MB).") + ) + sys.exit(ExitCode.CONTRACT_ERROR) if not args.quiet: print_banner() @@ -316,11 +244,13 @@ def main() -> None: try: root_path = Path(args.root).resolve() if not root_path.exists(): - console.print(ui.ERR_ROOT_NOT_FOUND.format(path=root_path)) - sys.exit(1) - except Exception as e: - console.print(ui.ERR_INVALID_ROOT_PATH.format(error=e)) - sys.exit(1) + console.print( + ui.fmt_contract_error(ui.ERR_ROOT_NOT_FOUND.format(path=root_path)) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + except OSError as e: + console.print(ui.fmt_contract_error(ui.ERR_INVALID_ROOT_PATH.format(error=e))) + sys.exit(ExitCode.CONTRACT_ERROR) if not args.quiet: console.print(ui.fmt_scanning_root(root_path)) @@ -330,15 +260,30 @@ def main() -> None: text_out_path: Path | None = None if args.html_out: html_out_path = _validate_output_path( - args.html_out, expected_suffix=".html", label="HTML" + args.html_out, + expected_suffix=".html", + label="HTML", + console=console, + invalid_message=ui.fmt_invalid_output_extension, + invalid_path_message=ui.fmt_invalid_output_path, ) if args.json_out: json_out_path = _validate_output_path( - args.json_out, expected_suffix=".json", label="JSON" + args.json_out, + expected_suffix=".json", + label="JSON", + console=console, + invalid_message=ui.fmt_invalid_output_extension, + invalid_path_message=ui.fmt_invalid_output_path, ) if args.text_out: text_out_path = _validate_output_path( - args.text_out, expected_suffix=".txt", label="text" + args.text_out, + expected_suffix=".txt", + label="text", + console=console, + invalid_message=ui.fmt_invalid_output_extension, + invalid_path_message=ui.fmt_invalid_output_path, ) # Initialize Cache @@ -358,14 +303,18 @@ def main() -> None: legacy_path=legacy_resolved, new_path=cache_path ) ) - cache = 
Cache(cache_path, max_size_bytes=args.max_cache_size_mb * 1024 * 1024) + cache = Cache( + cache_path, + root=root_path, + max_size_bytes=args.max_cache_size_mb * 1024 * 1024, + ) cache.load() if cache.load_warning: console.print(f"[warning]{cache.load_warning}[/warning]") - all_units: list[dict[str, Any]] = [] - all_blocks: list[dict[str, Any]] = [] - all_segments: list[dict[str, Any]] = [] + all_units: list[GroupItem] = [] + all_blocks: list[GroupItem] = [] + all_segments: list[GroupItem] = [] files_found = 0 files_analyzed = 0 cache_hits = 0 @@ -395,7 +344,9 @@ def _safe_process_file(fp: str) -> ProcessingResult | None: console.print(ui.fmt_worker_failed(e)) return None - def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | None]: + def _safe_future_result( + future: Future[ProcessingResult], + ) -> tuple[ProcessingResult | None, str | None]: try: return future.result(), None except Exception as e: @@ -415,19 +366,19 @@ def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | Non cache_hits += 1 all_units.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("units", [])), ) ) all_blocks.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("blocks", [])), ) ) all_segments.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("segments", [])), ) ) @@ -446,30 +397,31 @@ def _safe_future_result(future: Any) -> tuple[ProcessingResult | None, str | Non cache_hits += 1 all_units.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("units", [])), ) ) all_blocks.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("blocks", [])), ) ) all_segments.extend( cast( - list[dict[str, Any]], + list[GroupItem], cast(object, cached.get("segments", [])), ) ) else: files_to_process.append(fp) - except Exception as e: - console.print(ui.ERR_SCAN_FAILED.format(error=e)) - sys.exit(1) + except OSError as e: + console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e))) + sys.exit(ExitCode.CONTRACT_ERROR) total_files = len(files_to_process) failed_files = [] + source_read_failures: list[str] = [] # Processing phase if total_files > 0: @@ -493,7 +445,10 @@ def handle_result(result: ProcessingResult) -> None: all_segments.extend([asdict(s) for s in result.segments]) else: files_skipped += 1 - failed_files.append(f"{result.filepath}: {result.error}") + failure = f"{result.filepath}: {result.error}" + failed_files.append(failure) + if result.error_kind == "source_read_error": + source_read_failures.append(failure) def process_sequential(with_progress: bool) -> None: nonlocal files_skipped @@ -623,6 +578,11 @@ def process_sequential(with_progress: bool) -> None: if len(failed_files) > 10: console.print(f" ... 
and {len(failed_files) - 10} more") + gating_mode = args.fail_on_new or args.fail_threshold >= 0 + source_read_contract_failure = ( + bool(source_read_failures) and gating_mode and not args.update_baseline + ) + # Analysis phase suppressed_segment_groups = 0 if args.quiet: @@ -650,132 +610,136 @@ def process_sequential(with_progress: bool) -> None: console.print(ui.fmt_cache_save_failed(e)) # Reporting + block_groups_report = prepare_block_report_groups(block_groups) + block_group_facts = build_block_group_facts(block_groups_report) func_clones_count = len(func_groups) block_clones_count = len(block_groups) segment_clones_count = len(segment_groups) # Baseline Logic - baseline_path = Path(args.baseline).expanduser().resolve() + baseline_arg_path = Path(args.baseline).expanduser() + try: + baseline_path = baseline_arg_path.resolve() + baseline_exists = baseline_path.exists() + except OSError as e: + console.print( + ui.fmt_contract_error( + ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=e) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) # If user didn't specify path, the default is ./codeclone.baseline.json. baseline = Baseline(baseline_path) - baseline_exists = baseline_path.exists() baseline_loaded = False - baseline_status = "missing" - baseline_failure_code: int | None = None + baseline_status = BaselineStatus.MISSING + baseline_failure_code: ExitCode | None = None baseline_trusted_for_diff = False if baseline_exists: try: baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) except BaselineValidationError as e: - baseline_status = ( - e.status if e.status in _VALID_BASELINE_STATUSES else "invalid" - ) + baseline_status = coerce_baseline_status(e.status) if not args.update_baseline: console.print(ui.fmt_invalid_baseline(e)) if args.fail_on_new: - baseline_failure_code = 2 + baseline_failure_code = ExitCode.CONTRACT_ERROR else: console.print(ui.WARN_BASELINE_IGNORED) else: - baseline_loaded = True - baseline_status = "ok" - baseline_trusted_for_diff = True if not args.update_baseline: - if baseline.is_legacy_format(): - baseline_status = "legacy" - console.print(ui.fmt_baseline_version_missing(__version__)) - baseline_failure_code = 2 - baseline_trusted_for_diff = False + try: + baseline.verify_compatibility( + current_python_tag=current_python_tag() + ) + except BaselineValidationError as e: + baseline_status = coerce_baseline_status(e.status) + console.print(ui.fmt_invalid_baseline(e)) + if args.fail_on_new: + baseline_failure_code = ExitCode.CONTRACT_ERROR + else: + console.print(ui.WARN_BASELINE_IGNORED) else: - if baseline.baseline_version != __version__: - assert baseline.baseline_version is not None - baseline_status = "mismatch_version" - console.print( - ui.fmt_baseline_version_mismatch( - baseline_version=baseline.baseline_version, - current_version=__version__, - ) - ) - baseline_failure_code = 2 - baseline_trusted_for_diff = False - if baseline.schema_version != BASELINE_SCHEMA_VERSION: - assert baseline.schema_version is not None - if baseline_status == "ok": - baseline_status = "mismatch_schema" - console.print( - ui.fmt_baseline_schema_mismatch( - baseline_schema=baseline.schema_version, - current_schema=BASELINE_SCHEMA_VERSION, - ) - ) - baseline_failure_code = 2 - baseline_trusted_for_diff = False - if baseline.python_version: - current_version = _current_python_version() - if baseline.python_version != current_version: - if baseline_status == "ok": - baseline_status = "mismatch_python" - console.print( - ui.fmt_baseline_python_mismatch( - 
baseline_python=baseline.python_version, - current_python=current_version, - ) - ) - if args.fail_on_new: - console.print(ui.ERR_BASELINE_SAME_PYTHON_REQUIRED) - baseline_failure_code = 2 - baseline_trusted_for_diff = False - if baseline_status == "ok": - try: - baseline.verify_integrity() - except BaselineValidationError as e: - status = ( - e.status - if e.status in _VALID_BASELINE_STATUSES - else "invalid" - ) - baseline_status = status - console.print(ui.fmt_invalid_baseline(e)) - baseline_trusted_for_diff = False - if args.fail_on_new: - baseline_failure_code = 2 - else: - console.print(ui.WARN_BASELINE_IGNORED) - if baseline_status in _UNTRUSTED_BASELINE_STATUSES: - baseline_loaded = False - baseline_trusted_for_diff = False + baseline_loaded = True + baseline_status = BaselineStatus.OK + baseline_trusted_for_diff = True else: if not args.update_baseline: console.print(ui.fmt_path(ui.WARN_BASELINE_MISSING, baseline_path)) + if baseline_status in BASELINE_UNTRUSTED_STATUSES: + baseline_loaded = False + baseline_trusted_for_diff = False + if args.fail_on_new and not args.update_baseline: + baseline_failure_code = ExitCode.CONTRACT_ERROR + if args.update_baseline: new_baseline = Baseline.from_groups( func_groups, block_groups, path=baseline_path, - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", - baseline_version=__version__, + python_tag=current_python_tag(), + fingerprint_version=BASELINE_FINGERPRINT_VERSION, schema_version=BASELINE_SCHEMA_VERSION, + generator_version=__version__, ) - new_baseline.save() + try: + new_baseline.save() + except OSError as e: + console.print( + ui.fmt_contract_error( + ui.fmt_baseline_write_failed(path=baseline_path, error=e) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, baseline_path)) baseline = new_baseline baseline_loaded = True - baseline_status = "ok" + baseline_status = BaselineStatus.OK baseline_trusted_for_diff = True # When updating, we don't fail on new, we just saved the new state. # But we might still want to print the summary. 
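+    # Path.resolve() may raise OSError on exotic filesystems; fall back to
+    # the unresolved cache path so the report metadata stays populated.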
+ try: + report_cache_path = cache_path.resolve() + except OSError: + report_cache_path = cache_path + + raw_cache_status = getattr(cache, "load_status", None) + if isinstance(raw_cache_status, CacheStatus): + cache_status = raw_cache_status + elif isinstance(raw_cache_status, str): + try: + cache_status = CacheStatus(raw_cache_status) + except ValueError: + cache_status = ( + CacheStatus.OK + if cache.load_warning is None + else CacheStatus.INVALID_TYPE + ) + else: + cache_status = ( + CacheStatus.OK if cache.load_warning is None else CacheStatus.INVALID_TYPE + ) + + raw_cache_schema_version = getattr(cache, "cache_schema_version", None) + cache_schema_version = ( + raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None + ) + report_meta = _build_report_meta( + codeclone_version=__version__, baseline_path=baseline_path, baseline=baseline, baseline_loaded=baseline_loaded, - baseline_status=baseline_status, - cache_path=cache_path.resolve(), - cache_used=cache.load_warning is None, + baseline_status=baseline_status.value, + cache_path=report_cache_path, + cache_used=cache_status == CacheStatus.OK, + cache_status=cache_status.value, + cache_schema_version=cache_schema_version, + files_skipped_source_io=len(source_read_failures), ) # Diff @@ -786,6 +750,7 @@ def process_sequential(with_progress: bool) -> None: new_clones_count = len(new_func) + len(new_block) _print_summary( + console=console, quiet=args.quiet, files_found=files_found, files_analyzed=files_analyzed, @@ -811,48 +776,88 @@ def _print_output_notice(message: str) -> None: output_notice_printed = True console.print(message) + def _write_report_output(*, out: Path, content: str, label: str) -> None: + try: + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(content, "utf-8") + except OSError as e: + console.print( + ui.fmt_contract_error( + ui.fmt_report_write_failed(label=label, path=out, error=e) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + if html_out_path: out = html_out_path - out.parent.mkdir(parents=True, exist_ok=True) - out.write_text( - build_html_report( + _write_report_output( + out=out, + content=build_html_report( func_groups=func_groups, - block_groups=block_groups, + block_groups=block_groups_report, segment_groups=segment_groups, + block_group_facts=block_group_facts, + new_function_group_keys=new_func, + new_block_group_keys=new_block, report_meta=report_meta, title="CodeClone Report", context_lines=3, max_snippet_lines=220, ), - "utf-8", + label="HTML", ) html_report_path = str(out) _print_output_notice(ui.fmt_path(ui.INFO_HTML_REPORT_SAVED, out)) if json_out_path: out = json_out_path - out.parent.mkdir(parents=True, exist_ok=True) - out.write_text( - to_json_report(func_groups, block_groups, segment_groups, report_meta), - "utf-8", + _write_report_output( + out=out, + content=to_json_report( + func_groups, + block_groups_report, + segment_groups, + report_meta, + block_group_facts, + new_function_group_keys=new_func, + new_block_group_keys=new_block, + new_segment_group_keys=set(segment_groups.keys()), + ), + label="JSON", ) _print_output_notice(ui.fmt_path(ui.INFO_JSON_REPORT_SAVED, out)) if text_out_path: out = text_out_path - out.parent.mkdir(parents=True, exist_ok=True) - out.write_text( - to_text_report( + _write_report_output( + out=out, + content=to_text_report( meta=report_meta, func_groups=func_groups, - block_groups=block_groups, + block_groups=block_groups_report, segment_groups=segment_groups, + new_function_group_keys=new_func, + new_block_group_keys=new_block, 
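+                # Segment clones are report-only, so every segment group is
+                # surfaced as new rather than diffed against the baseline.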
+ new_segment_group_keys=set(segment_groups.keys()), ), - "utf-8", + label="text", ) _print_output_notice(ui.fmt_path(ui.INFO_TEXT_REPORT_SAVED, out)) + if source_read_contract_failure: + console.print( + ui.fmt_contract_error( + ui.fmt_unreadable_source_in_gating(count=len(source_read_failures)) + ) + ) + for failure in source_read_failures[:10]: + console.print(f" • {failure}") + if len(source_read_failures) > 10: + console.print(f" ... and {len(source_read_failures) - 10} more") + sys.exit(ExitCode.CONTRACT_ERROR) + if baseline_failure_code is not None: + console.print(ui.fmt_contract_error(ui.ERR_BASELINE_GATING_REQUIRES_TRUSTED)) sys.exit(baseline_failure_code) # Exit Codes @@ -861,6 +866,7 @@ def _print_output_notice(message: str) -> None: if html_report_path is None and default_report.exists(): html_report_path = str(default_report) + console.print(ui.fmt_gating_failure("New code clones detected.")) console.print(f"\n{ui.FAIL_NEW_TITLE}") console.print(f"\n{ui.FAIL_NEW_SUMMARY_TITLE}") console.print(ui.FAIL_NEW_FUNCTION.format(count=len(new_func))) @@ -880,16 +886,36 @@ def _print_output_notice(message: str) -> None: console.print(f"\n{ui.FAIL_NEW_DETAIL_BLOCK}") for h in sorted(new_block): console.print(f"- {h}") - sys.exit(3) + sys.exit(ExitCode.GATING_FAILURE) if 0 <= args.fail_threshold < (func_clones_count + block_clones_count): total = func_clones_count + block_clones_count - console.print(ui.fmt_fail_threshold(total=total, threshold=args.fail_threshold)) - sys.exit(2) + console.print( + ui.fmt_gating_failure( + ui.fmt_fail_threshold(total=total, threshold=args.fail_threshold) + ) + ) + sys.exit(ExitCode.GATING_FAILURE) if not args.update_baseline and not args.fail_on_new and new_clones_count > 0: console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL) +def main() -> None: + try: + _main_impl() + except SystemExit: + raise + except Exception as e: + console.print( + ui.fmt_internal_error( + e, + issues_url=ISSUES_URL, + debug=_is_debug_enabled(), + ) + ) + sys.exit(ExitCode.INTERNAL_ERROR) + + if __name__ == "__main__": main() diff --git a/codeclone/contracts.py b/codeclone/contracts.py new file mode 100644 index 0000000..d8b27a1 --- /dev/null +++ b/codeclone/contracts.py @@ -0,0 +1,64 @@ +""" +CodeClone — AST and CFG-based code clone detector for Python +focused on architectural duplication. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. 
+""" + +from __future__ import annotations + +from enum import IntEnum +from typing import Final + +BASELINE_SCHEMA_VERSION: Final = "1.0" +BASELINE_FINGERPRINT_VERSION: Final = "1" + +CACHE_VERSION: Final = "1.2" +REPORT_SCHEMA_VERSION: Final = "1.1" + + +class ExitCode(IntEnum): + SUCCESS = 0 + CONTRACT_ERROR = 2 + GATING_FAILURE = 3 + INTERNAL_ERROR = 5 + + +REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone" +ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues" +DOCS_URL: Final = "https://github.com/orenlab/codeclone/tree/main/docs" + +EXIT_CODE_DESCRIPTIONS: Final[tuple[tuple[ExitCode, str], ...]] = ( + (ExitCode.SUCCESS, "success"), + ( + ExitCode.CONTRACT_ERROR, + ( + "contract error (baseline missing/untrusted, invalid output " + "extensions, incompatible versions, unreadable source files in CI/gating)" + ), + ), + ( + ExitCode.GATING_FAILURE, + "gating failure (new clones detected, threshold exceeded)", + ), + ( + ExitCode.INTERNAL_ERROR, + "internal error (unexpected exception; please report)", + ), +) + + +def cli_help_epilog() -> str: + lines = ["Exit codes"] + for code, description in EXIT_CODE_DESCRIPTIONS: + lines.append(f" - {int(code)} - {description}") + lines.extend( + [ + "", + f"Repository: {REPOSITORY_URL}", + f"Issues: {ISSUES_URL}", + f"Docs: {DOCS_URL}", + ] + ) + return "\n".join(lines) diff --git a/codeclone/errors.py b/codeclone/errors.py index 11e32b8..87b8ffa 100644 --- a/codeclone/errors.py +++ b/codeclone/errors.py @@ -36,6 +36,6 @@ class BaselineValidationError(BaselineSchemaError): __slots__ = ("status",) - def __init__(self, message: str, *, status: str = "invalid") -> None: + def __init__(self, message: str, *, status: str = "invalid_type") -> None: super().__init__(message) self.status = status diff --git a/codeclone/extractor.py b/codeclone/extractor.py index d0a6236..d3b81c7 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -9,6 +9,7 @@ from __future__ import annotations import ast +import math import os import signal from collections.abc import Iterator @@ -49,6 +50,17 @@ class _ParseTimeoutError(Exception): pass +def _consumed_cpu_seconds(resource_module: object) -> float: + """Return consumed CPU seconds for the current process.""" + try: + usage = resource_module.getrusage( # type: ignore[attr-defined] + resource_module.RUSAGE_SELF # type: ignore[attr-defined] + ) + return float(usage.ru_utime) + float(usage.ru_stime) + except Exception: + return 0.0 + + @contextmanager def _parse_limits(timeout_s: int) -> Iterator[None]: if os.name != "posix" or timeout_s <= 0: @@ -70,11 +82,18 @@ def _timeout_handler(_signum: int, _frame: object) -> None: old_limits = resource.getrlimit(resource.RLIMIT_CPU) soft, hard = old_limits - hard_ceiling = timeout_s if hard == resource.RLIM_INFINITY else max(1, hard) + consumed_cpu_s = _consumed_cpu_seconds(resource) + desired_soft = max(1, timeout_s + math.ceil(consumed_cpu_s)) if soft == resource.RLIM_INFINITY: - new_soft = min(timeout_s, hard_ceiling) + candidate_soft = desired_soft + else: + # Never reduce finite soft limits and avoid immediate SIGXCPU + # when the process already consumed more CPU than timeout_s. + candidate_soft = max(soft, desired_soft) + if hard == resource.RLIM_INFINITY: + new_soft = candidate_soft else: - new_soft = min(timeout_s, soft, hard_ceiling) + new_soft = min(max(1, hard), candidate_soft) # Never lower hard limit: raising it back may be disallowed for # unprivileged processes and can lead to process termination later. 
resource.setrlimit(resource.RLIMIT_CPU, (new_soft, hard)) diff --git a/codeclone/html_report.py b/codeclone/html_report.py index c7ddf36..6540e47 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -8,7 +8,7 @@ from __future__ import annotations -from typing import Any +from collections.abc import Collection, Mapping from . import __version__ from ._html_escape import _escape_attr, _escape_html, _meta_display @@ -20,6 +20,9 @@ _try_pygments, pairwise, ) +from ._report_explain_contract import format_group_instance_compare_meta +from ._report_types import GroupItem, GroupMap +from .contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL from .templates import FONT_CSS_URL, REPORT_TEMPLATE __all__ = [ @@ -37,21 +40,51 @@ # ============================ -def _group_sort_key(items: list[dict[str, Any]]) -> tuple[int]: +def _group_sort_key(items: list[GroupItem]) -> tuple[int]: return (-len(items),) def build_html_report( *, - func_groups: dict[str, list[dict[str, Any]]], - block_groups: dict[str, list[dict[str, Any]]], - segment_groups: dict[str, list[dict[str, Any]]], - report_meta: dict[str, Any] | None = None, + func_groups: GroupMap, + block_groups: GroupMap, + segment_groups: GroupMap, + block_group_facts: dict[str, dict[str, str]], + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + report_meta: Mapping[str, object] | None = None, title: str = "CodeClone Report", context_lines: int = 3, max_snippet_lines: int = 220, ) -> str: file_cache = _FileCache() + resolved_block_group_facts = block_group_facts + + def _path_basename(value: object) -> str | None: + if not isinstance(value, str): + return None + text = value.strip() + if not text: + return None + normalized = text.replace("\\", "/").rstrip("/") + if not normalized: + return None + return normalized.rsplit("/", maxsplit=1)[-1] + + meta = dict(report_meta or {}) + baseline_loaded = bool(meta.get("baseline_loaded")) + baseline_status = str(meta.get("baseline_status", "")).strip().lower() + + if baseline_loaded and baseline_status == "ok": + baseline_split_note = ( + "Split is based on baseline: known duplicates are already " + "recorded in baseline, new duplicates are absent from baseline." + ) + else: + baseline_split_note = ( + "Baseline is not loaded or not trusted: " + "all duplicates are treated as new versus an empty baseline." 
+ ) func_sorted = sorted( func_groups.items(), key=lambda kv: (*_group_sort_key(kv[1]), kv[0]) @@ -139,59 +172,208 @@ def _svg_icon(size: int, stroke_width: str, body: str) -> str: # Section renderer # ---------------------------- + def _display_group_key( + section_id: str, group_key: str, block_meta: dict[str, str] | None = None + ) -> str: + if section_id != "blocks": + return group_key + + if block_meta and block_meta.get("pattern_display"): + return str(block_meta["pattern_display"]) + + return group_key + + def _block_group_explanation_meta( + section_id: str, group_key: str + ) -> dict[str, str]: + if section_id != "blocks": + return {} + + raw = resolved_block_group_facts.get(group_key, {}) + return {str(k): str(v) for k, v in raw.items() if v is not None} + + def _render_group_explanation(meta: Mapping[str, object]) -> str: + if not meta: + return "" + + explain_items: list[tuple[str, str]] = [] + if meta.get("match_rule"): + explain_items.append( + (f"match_rule: {meta['match_rule']}", "group-explain-item") + ) + if meta.get("block_size"): + explain_items.append( + (f"block_size: {meta['block_size']}", "group-explain-item") + ) + if meta.get("signature_kind"): + explain_items.append( + (f"signature_kind: {meta['signature_kind']}", "group-explain-item") + ) + if meta.get("merged_regions"): + explain_items.append( + (f"merged_regions: {meta['merged_regions']}", "group-explain-item") + ) + pattern_value = str(meta.get("pattern", "")).strip() + if pattern_value: + pattern_label = str(meta.get("pattern_label", pattern_value)).strip() + pattern_display = str(meta.get("pattern_display", "")).strip() + if pattern_display: + explain_items.append( + ( + f"pattern: {pattern_label} ({pattern_display})", + "group-explain-item", + ) + ) + else: + explain_items.append( + (f"pattern: {pattern_label}", "group-explain-item") + ) + + hint_id = str(meta.get("hint", "")).strip() + if hint_id: + hint_label = str(meta.get("hint_label", hint_id)).strip() + explain_items.append( + (f"hint: {hint_label}", "group-explain-item group-explain-warn") + ) + if meta.get("hint_confidence"): + explain_items.append( + ( + f"hint_confidence: {meta['hint_confidence']}", + "group-explain-item group-explain-muted", + ) + ) + if meta.get("assert_ratio"): + explain_items.append( + ( + f"assert_ratio: {meta['assert_ratio']}", + "group-explain-item group-explain-muted", + ) + ) + if meta.get("consecutive_asserts"): + explain_items.append( + ( + f"consecutive_asserts: {meta['consecutive_asserts']}", + "group-explain-item group-explain-muted", + ) + ) + hint_context_label = str(meta.get("hint_context_label", "")).strip() + if hint_context_label: + explain_items.append( + ( + hint_context_label, + "group-explain-item group-explain-muted", + ) + ) + + attrs = { + "data-match-rule": str(meta.get("match_rule", "")), + "data-block-size": str(meta.get("block_size", "")), + "data-signature-kind": str(meta.get("signature_kind", "")), + "data-merged-regions": str(meta.get("merged_regions", "")), + "data-pattern": str(meta.get("pattern", "")), + "data-pattern-label": str(meta.get("pattern_label", "")), + "data-hint": str(meta.get("hint", "")), + "data-hint-label": str(meta.get("hint_label", "")), + "data-hint-context-label": str(meta.get("hint_context_label", "")), + "data-hint-confidence": str(meta.get("hint_confidence", "")), + "data-assert-ratio": str(meta.get("assert_ratio", "")), + "data-consecutive-asserts": str(meta.get("consecutive_asserts", "")), + } + attr_html = " ".join( + f'{key}="{_escape_attr(value)}"' for key, value in 
attrs.items() if value
+        )
+        parts = [
+            f'<span class="{css_class}">{_escape_html(text)}</span>'
+            for text, css_class in explain_items
+        ]
+        note = ""
+        if isinstance(meta.get("hint_note"), str):
+            note_text = _escape_html(str(meta["hint_note"]))
+            note = f'<div class="group-explain-note">{note_text}</div>'
+        return f'<div class="group-explain" {attr_html}>{"".join(parts)}{note}</div>'
+
     def render_section(
+        section_id: str,
+        section_title: str,
-        groups: list[tuple[str, list[dict[str, Any]]]],
+        groups: list[tuple[str, list[GroupItem]]],
         pill_cls: str,
+        *,
+        novelty_by_group: Mapping[str, str] | None = None,
     ) -> str:
         if not groups:
             return ""

-        # build group DOM with data-search (for fast client-side search)
+        def _block_group_name(display_key: str, meta: dict[str, str]) -> str:
+            if meta.get("group_display_name"):
+                return str(meta["group_display_name"])
+            if len(display_key) > 56:
+                return f"{display_key[:24]}...{display_key[-16:]}"
+            return display_key
+
+        def _group_name(display_key: str, meta: dict[str, str]) -> str:
+            if section_id == "blocks":
+                return _block_group_name(display_key, meta)
+            return display_key
+
+        def _item_span_size(item: GroupItem) -> int:
+            start_line = int(item.get("start_line", 0))
+            end_line = int(item.get("end_line", 0))
+            return max(0, end_line - start_line + 1)
+
+        def _group_span_size(items: list[GroupItem]) -> int:
+            return max((_item_span_size(item) for item in items), default=0)
+
+        section_novelty = novelty_by_group or {}
+        has_novelty_filter = bool(section_novelty)
+
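+        # Build each group's DOM with data-search attributes for fast
+        # client-side search and novelty filtering.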
         out: list[str] = [
-            f'<section class="section" id="{section_id}">',
-            '<div class="section-head">',
+            f'<section class="section" id="{section_id}" data-has-novelty-filter="{1 if has_novelty_filter else 0}">',
+            '<div class="section-head">',
             f"<h2>{_escape_html(section_title)} "
-            f'<span class="pill {pill_cls}">'
+            f'<span class="pill {_escape_attr(pill_cls)}">'
             f"{len(groups)} groups</span></h2>",
+            "</div>",
             f"""
-