From c7f885e415590dc3d039c903447cf57f627285a6 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 19:13:48 +0000 Subject: [PATCH 1/2] ci(security): harden OpenSSF Scorecard posture (token-permissions, fuzzing, signed releases) Address the low-scoring OpenSSF Scorecard checks (overall 6.5 at 075f9e0): Token-Permissions (0 -> 10): - docker-beta.yml: drop top-level write scopes to `contents: read`; move packages/id-token/attestations write to the build job only. - codeql.yml: declare a top-level `permissions: contents: read`. Fuzzing (0): - Add an Atheris fuzz target for the source chunker (the most exposed parser), plus ClusterFuzzLite config (.clusterfuzzlite/) and a bounded fuzz CI workflow. Signed-Releases: - Add release.yml: on a v* tag, build sdist+wheel, keyless-sign each with Sigstore (OIDC), and publish a GitHub Release with the .sigstore bundles. SAST (8): - Add a bandit job to security.yml; annotate two confirmed false positives (parameterized SQL IN-clause; a public-host classifier) with `# nosec`. License (9 -> 10): - Rename LICENSE-2.0.txt -> LICENSE (standard name) and update references in pyproject.toml, README.md, Dockerfile. 
https://claude.ai/code/session_01VgY3wMWzuBw6QFNivhXZYy --- .clusterfuzzlite/Dockerfile | 8 ++++ .clusterfuzzlite/build.sh | 10 +++++ .github/workflows/codeql.yml | 5 +++ .github/workflows/docker-beta.yml | 11 ++++-- .github/workflows/fuzz.yml | 59 +++++++++++++++++++++++++++++ .github/workflows/release.yml | 62 +++++++++++++++++++++++++++++++ .github/workflows/security.yml | 19 ++++++++++ Dockerfile | 4 +- LICENSE-2.0.txt => LICENSE | 0 README.md | 2 +- coderag/store/sqlite_store.py | 2 +- coderag/surfaces/http_api.py | 2 +- fuzz/fuzz_chunk_file.py | 57 ++++++++++++++++++++++++++++ pyproject.toml | 2 +- 14 files changed, 234 insertions(+), 9 deletions(-) create mode 100644 .clusterfuzzlite/Dockerfile create mode 100755 .clusterfuzzlite/build.sh create mode 100644 .github/workflows/fuzz.yml create mode 100644 .github/workflows/release.yml rename LICENSE-2.0.txt => LICENSE (100%) create mode 100644 fuzz/fuzz_chunk_file.py diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile new file mode 100644 index 0000000..cdb2386 --- /dev/null +++ b/.clusterfuzzlite/Dockerfile @@ -0,0 +1,8 @@ +# Build image for CodeRAG's Atheris fuzz harnesses, driven by ClusterFuzzLite +# (and compatible with OSS-Fuzz). base-builder-python ships Atheris plus the +# `compile_python_fuzzer` helper that build.sh uses to bundle each harness. +FROM gcr.io/oss-fuzz-base/base-builder-python + +COPY . $SRC/coderag +COPY .clusterfuzzlite/build.sh $SRC/build.sh +WORKDIR $SRC/coderag diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh new file mode 100755 index 0000000..218e665 --- /dev/null +++ b/.clusterfuzzlite/build.sh @@ -0,0 +1,10 @@ +#!/bin/bash -eu +# ClusterFuzzLite / OSS-Fuzz build script. Runs inside base-builder-python, which +# provides Atheris and the `compile_python_fuzzer` helper. Installs CodeRAG so the +# harnesses can import it, then compiles every harness under fuzz/ into $OUT. 
+ +pip3 install --no-cache-dir "$SRC/coderag" + +for harness in "$SRC/coderag/fuzz/"fuzz_*.py; do + compile_python_fuzzer "$harness" +done diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index cfa229e..aa58295 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -16,6 +16,11 @@ concurrency: group: codeql-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +# Least privilege at the top level; the analyze job escalates to the single write +# scope it needs (uploading code-scanning results). +permissions: + contents: read + jobs: analyze: name: Analyze (python) diff --git a/.github/workflows/docker-beta.yml b/.github/workflows/docker-beta.yml index 73dc6c6..1c02a9d 100644 --- a/.github/workflows/docker-beta.yml +++ b/.github/workflows/docker-beta.yml @@ -10,11 +10,10 @@ on: branches: [master] workflow_dispatch: +# Least privilege at the top level; the build job below opts into exactly the +# write scopes it needs (top-level write would fail OpenSSF Token-Permissions). permissions: contents: read - packages: write # push to GHCR - id-token: write # OIDC for build provenance - attestations: write # SLSA provenance + SBOM attestations concurrency: group: docker-${{ github.workflow }}-${{ github.ref }} @@ -47,6 +46,12 @@ jobs: if: always() runs-on: ubuntu-latest timeout-minutes: 30 + # Write scopes are scoped to this job only; the prepare job needs none. 
+ permissions: + contents: read # checkout + packages: write # push images to GHCR + id-token: write # OIDC for build provenance / keyless signing + attestations: write # SLSA provenance + SBOM attestations strategy: fail-fast: false matrix: diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml new file mode 100644 index 0000000..398eafd --- /dev/null +++ b/.github/workflows/fuzz.yml @@ -0,0 +1,59 @@ +name: Fuzz + +# Continuous fuzzing of the source chunker — the most exposed parser in CodeRAG +# (it ingests arbitrary file bytes from any watched repo and must never crash). +# The harness in fuzz/ is an Atheris target; here it runs for a bounded burst on +# PRs that touch the chunker, and a longer burst on a weekly schedule. The same +# harness is OSS-Fuzz / ClusterFuzzLite-compatible via .clusterfuzzlite/. +on: + pull_request: + branches: [master] + paths: + - "coderag/chunking/**" + - "coderag/_lines.py" + - "coderag/types.py" + - "fuzz/**" + - ".github/workflows/fuzz.yml" + schedule: + - cron: "15 4 * * 1" + workflow_dispatch: + +# Least privilege: the job only needs to read the repo. +permissions: + contents: read + +concurrency: + group: fuzz-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + atheris: + name: Atheris (chunker) + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + with: + # Atheris ships manylinux wheels for 3.11, so no clang build needed. + python-version: "3.11" + + - name: Set up uv (fast installs + cache) + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 + with: + enable-cache: true + + - name: Install CodeRAG + Atheris + run: uv pip install --system -e . atheris + + # Bounded run: a short, fixed-iteration burst on PRs; a longer time-boxed run + # on the weekly schedule. 
A crash or a broken invariant fails the job. + - name: Fuzz the chunker + run: | + if [ "${{ github.event_name }}" = "schedule" ]; then + python fuzz/fuzz_chunk_file.py -max_total_time=600 + else + python fuzz/fuzz_chunk_file.py -atheris_runs=50000 + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..483c24c --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,62 @@ +name: Release + +# When a v* tag is pushed, build the sdist + wheel, sign each artifact with +# Sigstore (keyless — using the workflow's OIDC identity, no long-lived key), and +# publish a GitHub Release with the artifacts and their .sigstore bundles attached. +# Signed release artifacts are what OpenSSF Scorecard's Signed-Releases check +# verifies; the bundles let anyone run `sigstore verify` to confirm provenance. +on: + push: + tags: ["v*"] + workflow_dispatch: + +# Least privilege at the top level; the job escalates to exactly what it needs. +permissions: + contents: read + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +jobs: + release: + name: Build, sign & publish + runs-on: ubuntu-latest + timeout-minutes: 15 + permissions: + contents: write # create the GitHub Release and upload assets + id-token: write # OIDC identity for keyless Sigstore signing + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + with: + persist-credentials: false + + - name: Set up uv + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 + with: + python-version: "3.12" + + - name: Build sdist + wheel + run: uv build + + - name: Validate package metadata (twine check) + run: uvx twine check dist/* + + # Keyless Sigstore signing: one .sigstore bundle per artifact, authenticated + # by the workflow's OIDC identity (no secret key to manage or leak). 
+ - name: Sign artifacts with Sigstore + run: | + uv pip install --system sigstore + for artifact in dist/*.tar.gz dist/*.whl; do + python -m sigstore sign --bundle "${artifact}.sigstore" "${artifact}" + done + + - name: Publish GitHub Release with signed artifacts + env: + GH_TOKEN: ${{ github.token }} + run: | + gh release create "${GITHUB_REF_NAME}" \ + dist/*.tar.gz dist/*.whl dist/*.sigstore \ + --title "${GITHUB_REF_NAME}" \ + --generate-notes \ + --verify-tag diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index c29ead6..71bac9b 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -5,6 +5,7 @@ name: Security # - pip-audit: scans resolved Python dependencies for known CVEs (PyPI advisories). # - gitleaks: scans the repo / PR diff for committed secrets, even without a # developer running the local pre-commit hook. +# - bandit: Python-specific static security analysis (SAST) of the package. on: push: branches: [main, master, develop] @@ -56,3 +57,21 @@ jobs: uses: gitleaks/gitleaks-action@e0c47f4f8be36e29cdc102c57e68cb5cbf0e8d1e # v3 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + bandit: + name: bandit (Python SAST) + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + + - name: Set up uv (provides uvx for bandit) + uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 + with: + python-version: "3.12" + + # Static security analysis of the package source — a Python-focused second + # opinion alongside CodeQL. Gates on MEDIUM+ severity findings; known false + # positives are annotated inline with `# nosec ` + a justification. 
+ - name: Run bandit + run: uvx bandit -r coderag -ll diff --git a/Dockerfile b/Dockerfile index f1d241f..db47694 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ LABEL org.opencontainers.image.source="https://github.com/Neverdecel/CodeRAG" \ COPY --from=ghcr.io/astral-sh/uv:0.8.17@sha256:e4644cb5bd56fdc2c5ea3ee0525d9d21eed1603bccd6a21f887a938be7e85be1 /uv /uvx /usr/local/bin/ WORKDIR /app -# pyproject reads README.md and LICENSE-2.0.txt, so both are needed to build the wheel. -COPY pyproject.toml README.md LICENSE-2.0.txt ./ +# pyproject reads README.md and LICENSE, so both are needed to build the wheel. +COPY pyproject.toml README.md LICENSE ./ COPY coderag ./coderag # Non-root runtime user; writable mount points for the index and the codebase. diff --git a/LICENSE-2.0.txt b/LICENSE similarity index 100% rename from LICENSE-2.0.txt rename to LICENSE diff --git a/README.md b/README.md index cbe3833..1db0f0c 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ contribution details. ## 📄 License -Apache License 2.0 — see [LICENSE](LICENSE-2.0.txt). +Apache License 2.0 — see [LICENSE](LICENSE). ## 🙏 Acknowledgments diff --git a/coderag/store/sqlite_store.py b/coderag/store/sqlite_store.py index 8ebafd2..5d6fe7a 100644 --- a/coderag/store/sqlite_store.py +++ b/coderag/store/sqlite_store.py @@ -242,7 +242,7 @@ def hydrate(self, chunk_ids: Sequence[int]) -> Dict[int, sqlite3.Row]: placeholders = ",".join("?" for _ in chunk_ids) with self._lock: rows = self._conn.execute( - "SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, " + "SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, " # nosec B608 — IN-list is positional "?" 
placeholders; ids bound as params " c.text, f.path AS path " "FROM chunks c JOIN files f ON f.id = c.file_id " f"WHERE c.id IN ({placeholders})", diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py index 4f121d0..d235901 100644 --- a/coderag/surfaces/http_api.py +++ b/coderag/surfaces/http_api.py @@ -128,7 +128,7 @@ def _is_public_host(host: str) -> bool: """True if ``host`` is reachable beyond loopback (so auth really matters).""" if host in ("127.0.0.1", "localhost", "::1"): return False - if host in ("0.0.0.0", "::"): + if host in ("0.0.0.0", "::"): # nosec B104 — classifies a host as public; does not bind a socket return True try: return not ipaddress.ip_address(host).is_loopback diff --git a/fuzz/fuzz_chunk_file.py b/fuzz/fuzz_chunk_file.py new file mode 100644 index 0000000..27d580c --- /dev/null +++ b/fuzz/fuzz_chunk_file.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +"""Atheris fuzz target for CodeRAG's source chunker. + +``chunk_file`` is the most exposed parser in the engine: it ingests arbitrary +file contents from any watched repository and is contractually required to +*never* raise — a parse failure must degrade to line-window chunking, not crash +indexing (see ``coderag/chunking/__init__.py``). This harness feeds random bytes +as source text across every supported language and asserts that contract plus a +few structural invariants on the chunks it returns. + +Run locally:: + + pip install atheris + python fuzz/fuzz_chunk_file.py -atheris_runs=50000 + +The same target is built for ClusterFuzzLite / OSS-Fuzz via ``.clusterfuzzlite/``. +""" + +from __future__ import annotations + +import sys + +import atheris + +with atheris.instrument_imports(): + from coderag.chunking import chunk_file, languages # noqa: E402 + from coderag.config import Config # noqa: E402 + +# Symbol-aware languages (Python via the stdlib ``ast``, plus the tree-sitter +# set) and one non-symbol language to exercise the line-window fallback path. 
+_LANGUAGES = sorted(languages.SYMBOL_LANGUAGES) + ["text"] +_CONFIG = Config() + + +def TestOneInput(data: bytes) -> None: + fdp = atheris.FuzzedDataProvider(data) + language = _LANGUAGES[fdp.ConsumeIntInRange(0, len(_LANGUAGES) - 1)] + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Contract: chunk_file never raises on arbitrary input (it falls back to + # line windows when symbol extraction fails). + chunks = chunk_file(text, language, _CONFIG) + + # Structural invariants every emitted chunk must satisfy. + assert isinstance(chunks, list) + for chunk in chunks: + assert chunk.start_line >= 1 + assert chunk.end_line >= chunk.start_line + + +def main() -> None: + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index a4e63ac..f556be8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "1.0.0" description = "Standalone, local-first semantic code-search engine for large and custom codebases." readme = "README.md" requires-python = ">=3.11" -license = { file = "LICENSE-2.0.txt" } +license = { file = "LICENSE" } authors = [{ name = "Neverdecel" }] keywords = ["code-search", "rag", "embeddings", "faiss", "semantic-search", "retrieval"] dependencies = [ From c73ba2fceba68782bec3896da0b6eae828d3933c Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 19:41:03 +0000 Subject: [PATCH 2/2] fix(chunking): bound tree-sitter parse time so adversarial input can't hang indexing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Atheris fuzz target (added in this PR) found a ~180-byte TypeScript input — mostly newlines with a few stray tokens — that drove the tree-sitter grammar's GLR error-recovery super-linear: a single parse ran for minutes and ballooned RSS past 2 GB. Indexing arbitrary repos must never let one hostile/garbled file hang or OOM the indexer. 
Fix: set a per-parse time budget (timeout_micros, 2s) on the cached tree-sitter parsers. A parse that blows the budget raises, and chunk_file falls back to line windows — its existing graceful-degradation contract. Real source parses in single-digit milliseconds, so the guard never trips on legitimate code. Regression tests in tests/test_chunking.py: assert every tree-sitter parser carries the budget, and that the exact fuzzer-found input degrades to windows (SIGALRM-bounded so a future regression fails fast instead of hanging). https://claude.ai/code/session_01VgY3wMWzuBw6QFNivhXZYy --- coderag/chunking/treesitter.py | 20 +++++++++++++- tests/test_chunking.py | 48 ++++++++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/coderag/chunking/treesitter.py b/coderag/chunking/treesitter.py index c7691f3..8de4134 100644 --- a/coderag/chunking/treesitter.py +++ b/coderag/chunking/treesitter.py @@ -9,6 +9,7 @@ from __future__ import annotations import logging +import warnings from functools import lru_cache from typing import Any, Callable, List, Set @@ -85,12 +86,29 @@ def loader() -> Any: _NAME_FIELDS = ("name", "type") +# Upper bound on a single tree-sitter parse. tree-sitter's GLR error-recovery can +# go super-linear in time and memory on adversarial input — e.g. a few stray +# tokens scattered among many newlines — which would otherwise hang indexing (and +# balloon RSS) on a hostile or garbled file. A parse that blows the budget raises, +# and chunk_file falls back to line windows (its existing graceful-degradation +# path). Real source parses in single-digit milliseconds, so this never trips on +# legitimate code; it's a robustness guard, found via fuzzing the chunker. 
+_PARSE_TIMEOUT_MICROS = 2_000_000 # 2s + @lru_cache(maxsize=16) def _parser(language: str) -> Any: import tree_sitter as ts - return ts.Parser(_LANGUAGE_LOADERS[language]()) + parser = ts.Parser(_LANGUAGE_LOADERS[language]()) + # `timeout_micros` is deprecated upstream in favour of a parse-time progress + # callback, but that callback is ignored for bytestring sources in the pinned + # tree-sitter range (<0.26), so timeout_micros stays the working guard here. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Writable at runtime even though the stubs mark it read-only (deprecated). + parser.timeout_micros = _PARSE_TIMEOUT_MICROS # type: ignore[misc] + return parser def _kind(node_type: str) -> str: diff --git a/tests/test_chunking.py b/tests/test_chunking.py index b5c254a..3f4a078 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -2,8 +2,14 @@ from __future__ import annotations -from coderag.chunking import chunk_file -from coderag.chunking.languages import detect_language +import base64 +import signal +import warnings + +import pytest + +from coderag.chunking import chunk_file, treesitter +from coderag.chunking.languages import TREE_SITTER_LANGUAGES, detect_language from coderag.config import Config CFG = Config(provider="fake", window_lines=10, window_overlap=2, max_chunk_lines=50) @@ -119,3 +125,41 @@ def test_unknown_language_uses_windows(): def test_empty_file_yields_nothing(): assert chunk_file(" \n \n", "python", CFG) == [] + + +# A ~180-byte input the chunker fuzzer (fuzz/fuzz_chunk_file.py) found: mostly +# newlines with a few stray tokens, which drove the tree-sitter TypeScript grammar's +# error-recovery super-linear (a multi-second parse that ballooned RSS past 2 GB). +# The chunker must bound the parse and degrade to line windows, never hang. 
+_FUZZED_PATHOLOGICAL_TS = base64.b64decode( + "FgoKCgoKGgoKCgoKCgoKCgoKCAoKCgoKCgoKCgoKCgoKCnwKMgoKCgoKCigKCgoKCgoKCgoKCgoKCgoK" + "CgoKCgoKCgoKCn9/CiVbCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgp9dVQKCgoKCgoKFAoKCgoKCgoKCgoK" + "CgoKCgoKCgoKChEWCgoKCgoKCgoKCgoKCiUKCmAKCgotCgoKCgoKCgoKCgoKCgoKCgoKdXV1aHV1dQ==" +).decode("utf-8") + + +def test_treesitter_parsers_carry_a_parse_time_budget(): + for language in sorted(TREE_SITTER_LANGUAGES): + parser = treesitter._parser(language) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + assert parser.timeout_micros == treesitter._PARSE_TIMEOUT_MICROS + + +@pytest.mark.skipif( + not hasattr(signal, "SIGALRM"), reason="needs SIGALRM to bound a potential hang" +) +def test_pathological_treesitter_input_degrades_to_windows(): + def _bail(signum, frame): + raise TimeoutError("chunk_file did not bound the tree-sitter parse") + + prev = signal.signal(signal.SIGALRM, _bail) + signal.alarm(8) # parse budget is ~2s; this only fires if the guard regresses + try: + chunks = chunk_file(_FUZZED_PATHOLOGICAL_TS, "typescript", CFG) + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, prev) + + assert isinstance(chunks, list) + assert all(c.kind == "window" for c in chunks)