From 9c9083b7c9af13b638dc70ddde4e9dc9266de4ee Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Tue, 2 Jun 2026 12:55:06 +0200
Subject: [PATCH 1/3] ci: add Atheris fuzz targets and ClusterFuzzLite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address the OpenSSF Scorecard Fuzzing check (0/10) and add real fuzzing of the
project's untrusted-input entry points.

- test/fuzz/: three Atheris harnesses — Pipeline.loads (serialized pipeline
  deserialization), Document.from_dict, and document_matches_filter (filter
  expressions). Each catches the exceptions that are a normal reaction to
  malformed input so only genuine crashes/hangs/unexpected errors are reported.
- .clusterfuzzlite/: Dockerfile + build.sh + project.yaml to build the harnesses
  with the OSS-Fuzz Python toolchain.
- .github/workflows/cflite_pr.yml: short, code-change-scoped ClusterFuzzLite run
  on PRs that touch fuzzed code, least-privilege token, SHA-pinned actions.
- licenserc.toml: exclude .clusterfuzzlite from the license-header check.

Scorecard detects this via both the `import atheris` harnesses and the
.clusterfuzzlite deployment. pytest does not collect fuzz_*.py.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .clusterfuzzlite/Dockerfile          | 12 +++++++
 .clusterfuzzlite/build.sh            | 13 ++++++++
 .clusterfuzzlite/project.yaml        |  5 +++
 .github/workflows/cflite_pr.yml      | 41 ++++++++++++++++++++++++
 licenserc.toml                       |  1 +
 test/fuzz/README.md                  | 42 +++++++++++++++++++++++++
 test/fuzz/fuzz_document_from_dict.py | 42 +++++++++++++++++++++++++
 test/fuzz/fuzz_filters.py            | 47 ++++++++++++++++++++++++++++
 test/fuzz/fuzz_pipeline_loads.py     | 40 +++++++++++++++++++++++
 9 files changed, 243 insertions(+)
 create mode 100644 .clusterfuzzlite/Dockerfile
 create mode 100755 .clusterfuzzlite/build.sh
 create mode 100644 .clusterfuzzlite/project.yaml
 create mode 100644 .github/workflows/cflite_pr.yml
 create mode 100644 test/fuzz/README.md
 create mode 100644 test/fuzz/fuzz_document_from_dict.py
 create mode 100644 test/fuzz/fuzz_filters.py
 create mode 100644 test/fuzz/fuzz_pipeline_loads.py

diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
new file mode 100644
index 0000000000..38623deffb
--- /dev/null
+++ b/.clusterfuzzlite/Dockerfile
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ClusterFuzzLite / OSS-Fuzz build environment for the Haystack Atheris fuzz targets.
+# The base image is intentionally left as the rolling OSS-Fuzz tag: the fuzzing
+# toolchain expects the latest base-builder, and OSS-Fuzz/ClusterFuzzLite manage it.
+FROM gcr.io/oss-fuzz-base/base-builder-python
+
+COPY . $SRC/haystack
+WORKDIR $SRC/haystack
+COPY .clusterfuzzlite/build.sh $SRC/
diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh
new file mode 100755
index 0000000000..711dcca766
--- /dev/null
+++ b/.clusterfuzzlite/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -eu
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Builds the Atheris fuzz targets for ClusterFuzzLite / OSS-Fuzz.
+# `compile_python_fuzzer` is provided by the base-builder-python image.
+
+pip3 install .
+
+for harness in "$SRC"/haystack/test/fuzz/fuzz_*.py; do
+  compile_python_fuzzer "$harness"
+done
diff --git a/.clusterfuzzlite/project.yaml b/.clusterfuzzlite/project.yaml
new file mode 100644
index 0000000000..31cda8cec3
--- /dev/null
+++ b/.clusterfuzzlite/project.yaml
@@ -0,0 +1,5 @@
+# Project configuration for ClusterFuzzLite (and a starting point for OSS-Fuzz).
+# https://google.github.io/clusterfuzzlite/
+language: python
+sanitizers:
+  - address
diff --git a/.github/workflows/cflite_pr.yml b/.github/workflows/cflite_pr.yml
new file mode 100644
index 0000000000..51493e080d
--- /dev/null
+++ b/.github/workflows/cflite_pr.yml
@@ -0,0 +1,41 @@
+name: ClusterFuzzLite PR fuzzing
+
+# Short, code-change-scoped fuzzing run on PRs that touch fuzzed code or the
+# fuzzing setup. Catches regressions and crashes introduced by a change.
+# Continuous/batch fuzzing can be added later as a separate scheduled workflow.
+on:
+  pull_request:
+    paths:
+      - "haystack/**"
+      - "test/fuzz/**"
+      - ".clusterfuzzlite/**"
+      - ".github/workflows/cflite_pr.yml"
+
+permissions:
+  contents: read
+
+jobs:
+  pr-fuzzing:
+    runs-on: ubuntu-latest
+    concurrency:
+      group: cflite-pr-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    steps:
+      - name: Build Fuzzers
+        id: build
+        uses: google/clusterfuzzlite/actions/build_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          sanitizer: address
+
+      - name: Run Fuzzers
+        id: run
+        uses: google/clusterfuzzlite/actions/run_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          fuzz-seconds: 180
+          mode: code-change
+          sanitizer: address
+          # Crashes fail the job and are uploaded as artifacts. SARIF upload is
+          # disabled to keep the token least-privilege (no security-events: write).
+          output-sarif: false
diff --git a/licenserc.toml b/licenserc.toml
index c0fcb56ea4..68b30ed1ef 100644
--- a/licenserc.toml
+++ b/licenserc.toml
@@ -1,6 +1,7 @@
 headerPath = "license-header.txt"
 
 excludes = [
+  ".clusterfuzzlite",
   "*.csv",
   "*.feature",
   "*.html",
diff --git a/test/fuzz/README.md b/test/fuzz/README.md
new file mode 100644
index 0000000000..4e48dd2834
--- /dev/null
+++ b/test/fuzz/README.md
@@ -0,0 +1,42 @@
+# Fuzz targets
+
+[Atheris](https://github.com/google/atheris) fuzz harnesses for Haystack's
+untrusted-input entry points. They are wired into CI via
+[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/)
+(see [`.clusterfuzzlite/`](../../.clusterfuzzlite) and the
+`ClusterFuzzLite PR fuzzing` workflow).
+
+| Harness | Target | Why |
+|---|---|---|
+| `fuzz_pipeline_loads.py` | `Pipeline.loads` | Deserializing a serialized pipeline (YAML) is a documented attack surface. |
+| `fuzz_document_from_dict.py` | `Document.from_dict` | Reconstructing a `Document` from an untrusted dict. |
+| `fuzz_filters.py` | `document_matches_filter` | Evaluating an untrusted filter expression. |
+
+Each harness catches the exceptions that are a *normal* reaction to malformed
+input; anything else (a crash, unbounded recursion, a hang, or an unexpected
+exception type) is reported by Atheris as a finding. The "expected" exception
+lists can be tightened over time to surface more subtle bugs.
+
+## Run locally
+
+```sh
+pip install atheris
+pip install -e .
+
+# Fuzz for a bit (Ctrl-C to stop); -atheris_runs limits the number of inputs.
+python test/fuzz/fuzz_pipeline_loads.py -atheris_runs=100000
+python test/fuzz/fuzz_document_from_dict.py -atheris_runs=100000
+python test/fuzz/fuzz_filters.py -atheris_runs=100000
+```
+
+Pass a directory argument to use/grow a seed corpus, and a crashing input file
+to reproduce a finding:
+
+```sh
+python test/fuzz/fuzz_pipeline_loads.py corpus/        # use corpus dir
+python test/fuzz/fuzz_pipeline_loads.py crash-<hash>   # reproduce a crash
+```
+
+> Note: Atheris builds a native extension and is not part of the dev
+> dependencies; install it on demand as shown above. `pytest` does not collect
+> these files (they are named `fuzz_*.py`, not `test_*.py`).
diff --git a/test/fuzz/fuzz_document_from_dict.py b/test/fuzz/fuzz_document_from_dict.py
new file mode 100644
index 0000000000..8a646f093b
--- /dev/null
+++ b/test/fuzz/fuzz_document_from_dict.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``Document.from_dict`` — deserializing untrusted document dicts."""
+
+import json
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack.dataclasses import Document
+
+# Normal reactions to malformed input; anything else is a genuine finding.
+_EXPECTED = (ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Decode fuzzer bytes into a JSON object and feed it to ``Document.from_dict``."""
+    fdp = atheris.FuzzedDataProvider(data)
+    raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+    try:
+        obj = json.loads(raw)
+    except (ValueError, RecursionError):
+        return
+    if not isinstance(obj, dict):
+        return
+    try:
+        Document.from_dict(obj)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Set up and run the Atheris fuzzing loop."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/fuzz/fuzz_filters.py b/test/fuzz/fuzz_filters.py
new file mode 100644
index 0000000000..9c8b46015d
--- /dev/null
+++ b/test/fuzz/fuzz_filters.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``document_matches_filter`` — evaluating untrusted filter expressions."""
+
+import json
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack.dataclasses import Document
+    from haystack.errors import FilterError
+    from haystack.utils.filters import document_matches_filter
+
+# A fixed document is enough; we are fuzzing the filter expression, not the document.
+_DOCUMENT = Document(content="the quick brown fox", meta={"page": 1, "name": "fuzz"})
+
+# Normal reactions to malformed filters; anything else is a genuine finding.
+_EXPECTED = (FilterError, ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Decode fuzzer bytes into a JSON filter dict and evaluate it against a document."""
+    fdp = atheris.FuzzedDataProvider(data)
+    raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+    try:
+        filters = json.loads(raw)
+    except (ValueError, RecursionError):
+        return
+    if not isinstance(filters, dict):
+        return
+    try:
+        document_matches_filter(filters, _DOCUMENT)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Set up and run the Atheris fuzzing loop."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/fuzz/fuzz_pipeline_loads.py b/test/fuzz/fuzz_pipeline_loads.py
new file mode 100644
index 0000000000..472e6f3761
--- /dev/null
+++ b/test/fuzz/fuzz_pipeline_loads.py
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``Pipeline.loads`` — deserializing untrusted serialized pipelines.
+
+Loading a serialized pipeline is an explicit attack surface (see SECURITY.md), so this
+target feeds arbitrary fuzzer bytes through the YAML unmarshaller and ``from_dict``.
+"""
+
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack import Pipeline
+    from haystack.core.errors import PipelineError
+    from haystack.errors import DeserializationError
+
+# Exceptions that are a normal reaction to malformed input. Anything else — a crash,
+# unbounded recursion, a hang, or an unexpected exception type — is a genuine finding.
+_EXPECTED = (DeserializationError, PipelineError, ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Feed one fuzzer-generated input to ``Pipeline.loads`` as a YAML document."""
+    try:
+        Pipeline.loads(data)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Set up and run the Atheris fuzzing loop."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()

From c49cecc01d88eb58848c5c741d28ce118a70d11f Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Tue, 2 Jun 2026 12:58:16 +0200
Subject: [PATCH 2/3] ci: pin ClusterFuzzLite base image by digest

Pin gcr.io/oss-fuzz-base/base-builder-python to its current digest instead of
the rolling latest tag, for supply-chain integrity. The OSS-Fuzz base-builder
is updated frequently, so the comment documents how to refresh the digest.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .clusterfuzzlite/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
index 38623deffb..fcf69e27a1 100644
--- a/.clusterfuzzlite/Dockerfile
+++ b/.clusterfuzzlite/Dockerfile
@@ -3,9 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # ClusterFuzzLite / OSS-Fuzz build environment for the Haystack Atheris fuzz targets.
-# The base image is intentionally left as the rolling OSS-Fuzz tag: the fuzzing
-# toolchain expects the latest base-builder, and OSS-Fuzz/ClusterFuzzLite manage it.
-FROM gcr.io/oss-fuzz-base/base-builder-python
+# Pinned by digest for supply-chain integrity. Bump periodically (the OSS-Fuzz
+# base-builder is updated frequently with toolchain fixes); resolve a fresh digest with:
+#   docker buildx imagetools inspect gcr.io/oss-fuzz-base/base-builder-python:latest --format '{{.Manifest.Digest}}'
+FROM gcr.io/oss-fuzz-base/base-builder-python@sha256:bdae8ffe13ebbaf3b653f0a5082d8d72e108d8cd9eed1fef1a85d8350efa3fbf
 
 COPY . $SRC/haystack
 WORKDIR $SRC/haystack

From 7bd8881f640b1c473380b321a2c63f7e8ed29c9b Mon Sep 17 00:00:00 2001
From: Julian Risch
Date: Tue, 2 Jun 2026 13:02:49 +0200
Subject: [PATCH 3/3] ci: track ClusterFuzzLite base image with Dependabot

Add a docker ecosystem entry for /.clusterfuzzlite so Dependabot keeps the
digest-pinned gcr.io/oss-fuzz-base/base-builder-python in Dockerfile up to date.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 .github/dependabot.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index d7d1bea410..2d6a7bff63 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -31,3 +31,11 @@ updates:
       interval: 'daily'
     cooldown:
       default-days: 1
+
+  # Keeps the digest-pinned OSS-Fuzz base-builder in .clusterfuzzlite/Dockerfile fresh.
+  - package-ecosystem: 'docker'
+    directory: '/.clusterfuzzlite'
+    schedule:
+      interval: 'daily'
+    cooldown:
+      default-days: 1