Neverdecel · Neverdecel · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
@@ -0,0 +1,8 @@
+# Build image for CodeRAG's Atheris fuzz harnesses, driven by ClusterFuzzLite
+# (and compatible with OSS-Fuzz). base-builder-python ships Atheris plus the
+# `compile_python_fuzzer` helper that build.sh uses to bundle each harness.
+FROM gcr.io/oss-fuzz-base/base-builder-python
+
+COPY . $SRC/coderag
+COPY .clusterfuzzlite/build.sh $SRC/build.sh
+WORKDIR $SRC/coderag
diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh
@@ -0,0 +1,10 @@
+#!/bin/bash -eu
+# ClusterFuzzLite / OSS-Fuzz build script. Runs inside base-builder-python, which
+# provides Atheris and the `compile_python_fuzzer` helper. Installs CodeRAG so the
+# harnesses can import it, then compiles every harness under fuzz/ into $OUT.
+
+pip3 install --no-cache-dir "$SRC/coderag"
+
+for harness in "$SRC/coderag/fuzz/"fuzz_*.py; do
+  compile_python_fuzzer "$harness"
+done
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -16,6 +16,11 @@ concurrency:
   group: codeql-${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+# Least privilege at the top level; the analyze job escalates to the single write
+# scope it needs (uploading code-scanning results).
+permissions:
+  contents: read
+
 jobs:
   analyze:
     name: Analyze (python)

diff --git a/.github/workflows/docker-beta.yml b/.github/workflows/docker-beta.yml
@@ -10,11 +10,10 @@ on:
     branches: [master]
   workflow_dispatch:
 
+# Least privilege at the top level; the build job below opts into exactly the
+# write scopes it needs (top-level write would fail OpenSSF Token-Permissions).
 permissions:
   contents: read
-  packages: write       # push to GHCR
-  id-token: write       # OIDC for build provenance
-  attestations: write   # SLSA provenance + SBOM attestations
 
 concurrency:
   group: docker-${{ github.workflow }}-${{ github.ref }}
@@ -47,6 +46,12 @@ jobs:
     if: always()
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    # Write scopes are granted to this job only; the prepare job needs none.
+    permissions:
+      contents: read       # checkout
+      packages: write      # push images to GHCR
+      id-token: write      # OIDC for build provenance / keyless signing
+      attestations: write  # SLSA provenance + SBOM attestations
     strategy:
       fail-fast: false
       matrix:

diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
@@ -0,0 +1,59 @@
+name: Fuzz
+
+# Continuous fuzzing of the source chunker — the most exposed parser in CodeRAG
+# (it ingests arbitrary file bytes from any watched repo and must never crash).
+# The harness in fuzz/ is an Atheris target; here it runs for a bounded burst on
+# PRs that touch the chunker, and a longer burst on a weekly schedule. The same
+# harness is OSS-Fuzz / ClusterFuzzLite-compatible via .clusterfuzzlite/.
+on:
+  pull_request:
+    branches: [master]
+    paths:
+      - "coderag/chunking/**"
+      - "coderag/_lines.py"
+      - "coderag/types.py"
+      - "fuzz/**"
+      - ".github/workflows/fuzz.yml"
+  schedule:
+    - cron: "15 4 * * 1"
+  workflow_dispatch:
+
+# Least privilege: the job only needs to read the repo.
+permissions:
+  contents: read
+
+concurrency:
+  group: fuzz-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  atheris:
+    name: Atheris (chunker)
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
+        with:
+          # Atheris ships manylinux wheels for 3.11, so no clang build needed.
+          python-version: "3.11"
+
+      - name: Set up uv (fast installs + cache)
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
+        with:
+          enable-cache: true
+
+      - name: Install CodeRAG + Atheris
+        run: uv pip install --system -e . atheris
+
+      # Bounded run: a short, fixed-iteration burst on PRs (no fixed seed, so
+      # inputs vary per run); a longer time-boxed run on the weekly schedule. A crash or a broken invariant fails the job.
+      - name: Fuzz the chunker
+        run: |
+          if [ "${{ github.event_name }}" = "schedule" ]; then
+            python fuzz/fuzz_chunk_file.py -max_total_time=600
+          else
+            python fuzz/fuzz_chunk_file.py -atheris_runs=50000
+          fi
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,62 @@
+name: Release
+
+# When a v* tag is pushed, build the sdist + wheel, sign each artifact with
+# Sigstore (keyless — using the workflow's OIDC identity, no long-lived key), and
+# publish a GitHub Release with the artifacts and their .sigstore bundles attached.
+# Signed release artifacts are what OpenSSF Scorecard's Signed-Releases check
+# verifies; the bundles let anyone run `sigstore verify` to confirm provenance.
+on:
+  push:
+    tags: ["v*"]
+  workflow_dispatch:
+
+# Least privilege at the top level; the job escalates to exactly what it needs.
+permissions:
+  contents: read
+
+concurrency:
+  group: release-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  release:
+    name: Build, sign & publish
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    permissions:
+      contents: write   # create the GitHub Release and upload assets
+      id-token: write   # OIDC identity for keyless Sigstore signing
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+        with:
+          persist-credentials: false
+
+      - name: Set up uv
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
+        with:
+          python-version: "3.12"
+
+      - name: Build sdist + wheel
+        run: uv build
+
+      - name: Validate package metadata (twine check)
+        run: uvx twine check dist/*
+
+      # Keyless Sigstore signing: one .sigstore bundle per artifact, authenticated
+      # by the workflow's OIDC identity (no secret key to manage or leak).
+      - name: Sign artifacts with Sigstore
+        run: |
+          uv pip install --system sigstore
+          for artifact in dist/*.tar.gz dist/*.whl; do
+            python -m sigstore sign --bundle "${artifact}.sigstore" "${artifact}"
+          done
+
+      - name: Publish GitHub Release with signed artifacts
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          gh release create "${GITHUB_REF_NAME}" \
+            dist/*.tar.gz dist/*.whl dist/*.sigstore \
+            --title "${GITHUB_REF_NAME}" \
+            --generate-notes \
+            --verify-tag
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
@@ -5,6 +5,7 @@ name: Security
 #   - pip-audit:  scans resolved Python dependencies for known CVEs (PyPI advisories).
 #   - gitleaks:   scans the repo / PR diff for committed secrets, even without a
 #                 developer running the local pre-commit hook.
+#   - bandit:     Python-specific static security analysis (SAST) of the package.
 on:
   push:
     branches: [main, master, develop]
@@ -56,3 +57,21 @@ jobs:
         uses: gitleaks/gitleaks-action@e0c47f4f8be36e29cdc102c57e68cb5cbf0e8d1e # v3
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  bandit:
+    name: bandit (Python SAST)
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+
+      - name: Set up uv (provides uvx for bandit)
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
+        with:
+          python-version: "3.12"
+
+      # Static security analysis of the package source — a Python-focused second
+      # opinion alongside CodeQL. Gates on MEDIUM+ severity findings; known false
+      # positives are annotated inline with `# nosec <id>` + a justification.
+      - name: Run bandit
+        run: uvx bandit -r coderag -ll
diff --git a/Dockerfile b/Dockerfile
@@ -26,8 +26,8 @@ LABEL org.opencontainers.image.source="https://github.com/Neverdecel/CodeRAG" \
 COPY --from=ghcr.io/astral-sh/uv:0.8.17@sha256:e4644cb5bd56fdc2c5ea3ee0525d9d21eed1603bccd6a21f887a938be7e85be1 /uv /uvx /usr/local/bin/
 
 WORKDIR /app
-# pyproject reads README.md and LICENSE-2.0.txt, so both are needed to build the wheel.
-COPY pyproject.toml README.md LICENSE-2.0.txt ./
+# pyproject reads README.md and LICENSE, so both are needed to build the wheel.
+COPY pyproject.toml README.md LICENSE ./
 COPY coderag ./coderag
 
 # Non-root runtime user; writable mount points for the index and the codebase.

diff --git a/LICENSE-2.0.txt → LICENSE b/LICENSE-2.0.txt → LICENSE
diff --git a/README.md b/README.md
@@ -240,7 +240,7 @@ contribution details.
 
 ## 📄 License
 
-Apache License 2.0 — see [LICENSE](LICENSE-2.0.txt).
+Apache License 2.0 — see [LICENSE](LICENSE).
 
 ## 🙏 Acknowledgments
 

diff --git a/coderag/chunking/treesitter.py b/coderag/chunking/treesitter.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import logging
+import warnings
 from functools import lru_cache
 from typing import Any, Callable, List, Set
 
@@ -85,12 +86,29 @@ def loader() -> Any:
 
 _NAME_FIELDS = ("name", "type")
 
+# Upper bound on a single tree-sitter parse. tree-sitter's GLR error-recovery can
+# go super-linear in time and memory on adversarial input — e.g. a few stray
+# tokens scattered among many newlines — which would otherwise hang indexing (and
+# balloon RSS) on a hostile or garbled file. A parse that blows the budget raises,
+# and chunk_file falls back to line windows (its existing graceful-degradation
+# path). Real source parses in single-digit milliseconds, so this never trips on
+# legitimate code; it's a robustness guard, found via fuzzing the chunker.
+_PARSE_TIMEOUT_MICROS = 2_000_000  # 2s
+
 
 @lru_cache(maxsize=16)
 def _parser(language: str) -> Any:
     import tree_sitter as ts
 
-    return ts.Parser(_LANGUAGE_LOADERS[language]())
+    parser = ts.Parser(_LANGUAGE_LOADERS[language]())
+    # `timeout_micros` is deprecated upstream in favour of a parse-time progress
+    # callback, but that callback is ignored for bytestring sources in the pinned
+    # tree-sitter range (<0.26), so timeout_micros stays the working guard here.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", DeprecationWarning)
+        # Writable at runtime even though the stubs mark it read-only (deprecated).
+        parser.timeout_micros = _PARSE_TIMEOUT_MICROS  # type: ignore[misc]
+    return parser
 
 
 def _kind(node_type: str) -> str:

diff --git a/coderag/store/sqlite_store.py b/coderag/store/sqlite_store.py
@@ -242,7 +242,7 @@ def hydrate(self, chunk_ids: Sequence[int]) -> Dict[int, sqlite3.Row]:
         placeholders = ",".join("?" for _ in chunk_ids)
         with self._lock:
             rows = self._conn.execute(
-                "SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, "
+                "SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, "  # nosec B608 — IN-list is positional "?" placeholders; ids bound as params
                 "       c.text, f.path AS path "
                 "FROM chunks c JOIN files f ON f.id = c.file_id "
                 f"WHERE c.id IN ({placeholders})",

diff --git a/coderag/surfaces/http_api.py b/coderag/surfaces/http_api.py
@@ -128,7 +128,7 @@ def _is_public_host(host: str) -> bool:
     """True if ``host`` is reachable beyond loopback (so auth really matters)."""
     if host in ("127.0.0.1", "localhost", "::1"):
         return False
-    if host in ("0.0.0.0", "::"):
+    if host in ("0.0.0.0", "::"):  # nosec B104 — classifies a host as public; does not bind a socket
         return True
     try:
         return not ipaddress.ip_address(host).is_loopback

diff --git a/fuzz/fuzz_chunk_file.py b/fuzz/fuzz_chunk_file.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Atheris fuzz target for CodeRAG's source chunker.
+
+``chunk_file`` is the most exposed parser in the engine: it ingests arbitrary
+file contents from any watched repository and is contractually required to
+*never* raise — a parse failure must degrade to line-window chunking, not crash
+indexing (see ``coderag/chunking/__init__.py``). This harness feeds random bytes
+as source text across every supported language and asserts that contract plus a
+few structural invariants on the chunks it returns.
+
+Run locally::
+
+    pip install atheris
+    python fuzz/fuzz_chunk_file.py -atheris_runs=50000
+
+The same target is built for ClusterFuzzLite / OSS-Fuzz via ``.clusterfuzzlite/``.
+"""
+
+from __future__ import annotations
+
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from coderag.chunking import chunk_file, languages  # noqa: E402
+    from coderag.config import Config  # noqa: E402
+
+# Symbol-aware languages (Python via the stdlib ``ast``, plus the tree-sitter
+# set) and one non-symbol language to exercise the line-window fallback path.
+_LANGUAGES = sorted(languages.SYMBOL_LANGUAGES) + ["text"]
+_CONFIG = Config()
+
+
+def TestOneInput(data: bytes) -> None:
+    fdp = atheris.FuzzedDataProvider(data)
+    language = _LANGUAGES[fdp.ConsumeIntInRange(0, len(_LANGUAGES) - 1)]
+    text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+
+    # Contract: chunk_file never raises on arbitrary input (it falls back to
+    # line windows when symbol extraction fails).
+    chunks = chunk_file(text, language, _CONFIG)
+
+    # Structural invariants every emitted chunk must satisfy.
+    assert isinstance(chunks, list)
+    for chunk in chunks:
+        assert chunk.start_line >= 1
+        assert chunk.end_line >= chunk.start_line
+
+
+def main() -> None:
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ version = "1.0.0"
 description = "Standalone, local-first semantic code-search engine for large and custom codebases."
 readme = "README.md"
 requires-python = ">=3.11"
-license = { file = "LICENSE-2.0.txt" }
+license = { file = "LICENSE" }
 authors = [{ name = "Neverdecel" }]
 keywords = ["code-search", "rag", "embeddings", "faiss", "semantic-search", "retrieval"]
 dependencies = [