Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .clusterfuzzlite/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build image for CodeRAG's Atheris fuzz harnesses, driven by ClusterFuzzLite
# (and compatible with OSS-Fuzz). base-builder-python ships Atheris plus the
# `compile_python_fuzzer` helper that build.sh uses to bundle each harness.
#
# The `v1` tag is the stable line the ClusterFuzzLite docs recommend; an
# untagged reference floats on `latest` and can change underneath a build.
FROM gcr.io/oss-fuzz-base/base-builder-python:v1

# Copy the whole repository so build.sh can `pip install` the package and find
# the harnesses under fuzz/.
COPY . $SRC/coderag
# The fuzzing infrastructure looks for the build script at $SRC/build.sh.
COPY .clusterfuzzlite/build.sh $SRC/build.sh
WORKDIR $SRC/coderag
10 changes: 10 additions & 0 deletions .clusterfuzzlite/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash -eu
# ClusterFuzzLite / OSS-Fuzz build script. Runs inside base-builder-python, which
# provides Atheris and the `compile_python_fuzzer` helper. Installs CodeRAG so the
# harnesses can import it, then compiles every harness under fuzz/ into $OUT.

pip3 install --no-cache-dir "$SRC/coderag"

# Expand the harness glob with nullglob so "no matches" yields an empty array
# instead of the literal pattern being handed to compile_python_fuzzer.
shopt -s nullglob
harnesses=("$SRC/coderag/fuzz/"fuzz_*.py)
shopt -u nullglob

# A build that produces zero fuzzers is a misconfiguration; fail loudly with a
# message that names the directory that was searched.
if [ "${#harnesses[@]}" -eq 0 ]; then
  echo "build.sh: no fuzz harnesses (fuzz_*.py) found under $SRC/coderag/fuzz/" >&2
  exit 1
fi

for harness in "${harnesses[@]}"; do
  compile_python_fuzzer "$harness"
done
5 changes: 5 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ concurrency:
group: codeql-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

# Least privilege at the top level; the analyze job escalates to the single write
# scope it needs (uploading code-scanning results).
permissions:
contents: read

jobs:
analyze:
name: Analyze (python)
Expand Down
11 changes: 8 additions & 3 deletions .github/workflows/docker-beta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@ on:
branches: [master]
workflow_dispatch:

# Least privilege at the top level; the build job below opts into exactly the
# write scopes it needs (top-level write would fail OpenSSF Token-Permissions).
permissions:
contents: read
packages: write # push to GHCR
id-token: write # OIDC for build provenance
attestations: write # SLSA provenance + SBOM attestations

concurrency:
group: docker-${{ github.workflow }}-${{ github.ref }}
Expand Down Expand Up @@ -47,6 +46,12 @@ jobs:
if: always()
runs-on: ubuntu-latest
timeout-minutes: 30
# Write scopes are scoped to this job only; the prepare job needs none.
permissions:
contents: read # checkout
packages: write # push images to GHCR
id-token: write # OIDC for build provenance / keyless signing
attestations: write # SLSA provenance + SBOM attestations
strategy:
fail-fast: false
matrix:
Expand Down
59 changes: 59 additions & 0 deletions .github/workflows/fuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: Fuzz

# Continuous fuzzing of the source chunker — the most exposed parser in CodeRAG
# (it ingests arbitrary file bytes from any watched repo and must never crash).
# The harness in fuzz/ is an Atheris target; here it runs for a bounded burst on
# PRs that touch the chunker, and a longer burst on a weekly schedule. The same
# harness is OSS-Fuzz / ClusterFuzzLite-compatible via .clusterfuzzlite/.
on:
  pull_request:
    branches: [master]
    paths:
      - "coderag/chunking/**"
      - "coderag/_lines.py"
      - "coderag/types.py"
      - "fuzz/**"
      - ".github/workflows/fuzz.yml"
  schedule:
    # Mondays at 04:15 UTC.
    - cron: "15 4 * * 1"
  workflow_dispatch:

# Least privilege: the job only needs to read the repo.
permissions:
  contents: read

concurrency:
  group: fuzz-${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  atheris:
    name: Atheris (chunker)
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
        with:
          # The job never pushes, so do not leave the token in .git/config.
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
        with:
          # Atheris ships manylinux wheels for 3.11, so no clang build needed.
          python-version: "3.11"

      - name: Set up uv (fast installs + cache)
        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
        with:
          enable-cache: true

      - name: Install CodeRAG + Atheris
        run: uv pip install --system -e . atheris

      # Bounded run: a short, run-count-bounded burst on PRs and manual
      # dispatches; a longer time-boxed run on the weekly schedule. A crash or
      # a broken invariant fails the job.
      - name: Fuzz the chunker
        env:
          # Pass context values through the environment rather than expanding
          # `${{ }}` inside the script text.
          EVENT_NAME: ${{ github.event_name }}
        run: |
          if [ "$EVENT_NAME" = "schedule" ]; then
            python fuzz/fuzz_chunk_file.py -max_total_time=600
          else
            python fuzz/fuzz_chunk_file.py -atheris_runs=50000
          fi
62 changes: 62 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
name: Release

# When a v* tag is pushed, build the sdist + wheel, sign each artifact with
# Sigstore (keyless — using the workflow's OIDC identity, no long-lived key), and
# publish a GitHub Release with the artifacts and their .sigstore bundles attached.
# Signed release artifacts are what OpenSSF Scorecard's Signed-Releases check
# verifies; the bundles let anyone run `sigstore verify` to confirm provenance.
on:
  push:
    tags: ["v*"]
  workflow_dispatch:

# Least privilege at the top level; the job escalates to exactly what it needs.
permissions:
  contents: read

concurrency:
  group: release-${{ github.ref }}
  cancel-in-progress: false

jobs:
  release:
    name: Build, sign & publish
    # A manual dispatch can target a branch, where GITHUB_REF_NAME is not a tag
    # and `gh release create --verify-tag` would fail only after the artifacts
    # had already been built and signed. Run the job for tag refs only.
    if: github.ref_type == 'tag'
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: write # create the GitHub Release and upload assets
      id-token: write # OIDC identity for keyless Sigstore signing
    steps:
      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
        with:
          persist-credentials: false

      - name: Set up uv
        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
        with:
          python-version: "3.12"

      - name: Build sdist + wheel
        run: uv build

      - name: Validate package metadata (twine check)
        run: uvx twine check dist/*

      # Keyless Sigstore signing: one .sigstore bundle per artifact, authenticated
      # by the workflow's OIDC identity (no secret key to manage or leak).
      # The CLI runs through `uvx` (like twine above) so it lives in uv's own
      # isolated tool environment instead of being installed into whichever
      # interpreter happens to be first on PATH.
      - name: Sign artifacts with Sigstore
        run: |
          for artifact in dist/*.tar.gz dist/*.whl; do
            uvx sigstore sign --bundle "${artifact}.sigstore" "${artifact}"
          done

      - name: Publish GitHub Release with signed artifacts
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          gh release create "${GITHUB_REF_NAME}" \
            dist/*.tar.gz dist/*.whl dist/*.sigstore \
            --title "${GITHUB_REF_NAME}" \
            --generate-notes \
            --verify-tag
19 changes: 19 additions & 0 deletions .github/workflows/security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ name: Security
# - pip-audit: scans resolved Python dependencies for known CVEs (PyPI advisories).
# - gitleaks: scans the repo / PR diff for committed secrets, even without a
# developer running the local pre-commit hook.
# - bandit: Python-specific static security analysis (SAST) of the package.
on:
push:
branches: [main, master, develop]
Expand Down Expand Up @@ -56,3 +57,21 @@ jobs:
uses: gitleaks/gitleaks-action@e0c47f4f8be36e29cdc102c57e68cb5cbf0e8d1e # v3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

bandit:
  name: bandit (Python SAST)
  runs-on: ubuntu-latest
  timeout-minutes: 10
  steps:
    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
      with:
        # The scan only reads sources; do not leave the token in .git/config.
        persist-credentials: false

    - name: Set up uv (provides uvx for bandit)
      uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
      with:
        python-version: "3.12"

    # Static security analysis of the package source — a Python-focused second
    # opinion alongside CodeQL. Gates on MEDIUM+ severity findings (`-ll`); known
    # false positives are annotated inline with `# nosec <id>` + a justification.
    - name: Run bandit
      run: uvx bandit -r coderag -ll
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ LABEL org.opencontainers.image.source="https://github.com/Neverdecel/CodeRAG" \
COPY --from=ghcr.io/astral-sh/uv:0.8.17@sha256:e4644cb5bd56fdc2c5ea3ee0525d9d21eed1603bccd6a21f887a938be7e85be1 /uv /uvx /usr/local/bin/

WORKDIR /app
# pyproject reads README.md and LICENSE-2.0.txt, so both are needed to build the wheel.
COPY pyproject.toml README.md LICENSE-2.0.txt ./
# pyproject reads README.md and LICENSE, so both are needed to build the wheel.
COPY pyproject.toml README.md LICENSE ./
COPY coderag ./coderag

# Non-root runtime user; writable mount points for the index and the codebase.
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ contribution details.

## 📄 License

Apache License 2.0 — see [LICENSE](LICENSE-2.0.txt).
Apache License 2.0 — see [LICENSE](LICENSE).

## 🙏 Acknowledgments

Expand Down
20 changes: 19 additions & 1 deletion coderag/chunking/treesitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import annotations

import logging
import warnings
from functools import lru_cache
from typing import Any, Callable, List, Set

Expand Down Expand Up @@ -85,12 +86,29 @@ def loader() -> Any:

_NAME_FIELDS = ("name", "type")

# Upper bound on a single tree-sitter parse. tree-sitter's GLR error-recovery can
# go super-linear in time and memory on adversarial input — e.g. a few stray
# tokens scattered among many newlines — which would otherwise hang indexing (and
# balloon RSS) on a hostile or garbled file. A parse that blows the budget raises,
# and chunk_file falls back to line windows (its existing graceful-degradation
# path). Real source parses in single-digit milliseconds, so this never trips on
# legitimate code; it's a robustness guard, found via fuzzing the chunker.
_PARSE_TIMEOUT_MICROS = 2_000_000 # 2s


@lru_cache(maxsize=16)
def _parser(language: str) -> Any:
    """Return a tree-sitter parser for ``language`` with the parse budget applied.

    The grammar is obtained from ``_LANGUAGE_LOADERS[language]`` and the parser
    is memoised per language name by ``lru_cache``, so the timeout is configured
    once per cached parser rather than on every parse.

    Args:
        language: Key into ``_LANGUAGE_LOADERS`` selecting the grammar loader.

    Returns:
        A ``tree_sitter.Parser`` whose ``timeout_micros`` is set to
        ``_PARSE_TIMEOUT_MICROS``.
    """
    # Imported lazily so the module can be imported without tree-sitter loaded.
    import tree_sitter as ts

    parser = ts.Parser(_LANGUAGE_LOADERS[language]())
    # `timeout_micros` is deprecated upstream in favour of a parse-time progress
    # callback, but that callback is ignored for bytestring sources in the pinned
    # tree-sitter range (<0.26), so timeout_micros stays the working guard here.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        # Writable at runtime even though the stubs mark it read-only (deprecated).
        parser.timeout_micros = _PARSE_TIMEOUT_MICROS  # type: ignore[misc]
    return parser


def _kind(node_type: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion coderag/store/sqlite_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def hydrate(self, chunk_ids: Sequence[int]) -> Dict[int, sqlite3.Row]:
placeholders = ",".join("?" for _ in chunk_ids)
with self._lock:
rows = self._conn.execute(
"SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, "
"SELECT c.id, c.symbol, c.kind, c.start_line, c.end_line, c.language, " # nosec B608 — IN-list is positional "?" placeholders; ids bound as params
" c.text, f.path AS path "
"FROM chunks c JOIN files f ON f.id = c.file_id "
f"WHERE c.id IN ({placeholders})",
Expand Down
2 changes: 1 addition & 1 deletion coderag/surfaces/http_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _is_public_host(host: str) -> bool:
"""True if ``host`` is reachable beyond loopback (so auth really matters)."""
if host in ("127.0.0.1", "localhost", "::1"):
return False
if host in ("0.0.0.0", "::"):
if host in ("0.0.0.0", "::"): # nosec B104 — classifies a host as public; does not bind a socket
return True
try:
return not ipaddress.ip_address(host).is_loopback
Expand Down
57 changes: 57 additions & 0 deletions fuzz/fuzz_chunk_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""Atheris fuzz target for CodeRAG's source chunker.

``chunk_file`` is the most exposed parser in the engine: it ingests arbitrary
file contents from any watched repository and is contractually required to
*never* raise — a parse failure must degrade to line-window chunking, not crash
indexing (see ``coderag/chunking/__init__.py``). This harness feeds random bytes
as source text across every supported language and asserts that contract plus a
few structural invariants on the chunks it returns.

Run locally::

    pip install atheris
    python fuzz/fuzz_chunk_file.py -atheris_runs=50000

The same target is built for ClusterFuzzLite / OSS-Fuzz via ``.clusterfuzzlite/``.
"""

from __future__ import annotations

import sys

import atheris

with atheris.instrument_imports():
from coderag.chunking import chunk_file, languages # noqa: E402
from coderag.config import Config # noqa: E402

# Symbol-aware languages (Python via the stdlib ``ast``, plus the tree-sitter
# set) and one non-symbol language to exercise the line-window fallback path.
_LANGUAGES = sorted(languages.SYMBOL_LANGUAGES) + ["text"]
_CONFIG = Config()


def TestOneInput(data: bytes) -> None:
    """Fuzz entry point: chunk fuzzer-derived text and check the invariants.

    The leading bytes of ``data`` select one entry of ``_LANGUAGES``; whatever
    remains is decoded to surrogate-free text and handed to ``chunk_file``.
    Any exception or failed assertion is reported by Atheris as a finding.
    """
    provider = atheris.FuzzedDataProvider(data)
    # Consume the language selector first, then use the rest as source text.
    lang_index = provider.ConsumeIntInRange(0, len(_LANGUAGES) - 1)
    language = _LANGUAGES[lang_index]
    source = provider.ConsumeUnicodeNoSurrogates(provider.remaining_bytes())

    # chunk_file is expected not to raise on arbitrary input; symbol-extraction
    # failures are meant to degrade to line-window chunking instead.
    result = chunk_file(source, language, _CONFIG)

    # The result is a list of chunks, each with a 1-based, non-inverted span.
    assert isinstance(result, list)
    for piece in result:
        assert piece.start_line >= 1
        assert piece.end_line >= piece.start_line


def main() -> None:
    """Hand the command line and the target to Atheris, then start fuzzing."""
    argv = sys.argv
    atheris.Setup(argv, TestOneInput)
    atheris.Fuzz()


# Run the fuzzer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ version = "1.0.0"
description = "Standalone, local-first semantic code-search engine for large and custom codebases."
readme = "README.md"
requires-python = ">=3.11"
license = { file = "LICENSE-2.0.txt" }
license = { file = "LICENSE" }
authors = [{ name = "Neverdecel" }]
keywords = ["code-search", "rag", "embeddings", "faiss", "semantic-search", "retrieval"]
dependencies = [
Expand Down
Loading
Loading