diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile new file mode 100644 index 0000000000..fcf69e27a1 --- /dev/null +++ b/.clusterfuzzlite/Dockerfile @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +# ClusterFuzzLite / OSS-Fuzz build environment for the Haystack Atheris fuzz targets. +# Pinned by digest for supply-chain integrity. Bump periodically (the OSS-Fuzz +# base-builder is updated frequently with toolchain fixes); resolve a fresh digest with: +# docker buildx imagetools inspect gcr.io/oss-fuzz-base/base-builder-python:latest --format '{{.Manifest.Digest}}' +FROM gcr.io/oss-fuzz-base/base-builder-python@sha256:bdae8ffe13ebbaf3b653f0a5082d8d72e108d8cd9eed1fef1a85d8350efa3fbf + +COPY . $SRC/haystack +WORKDIR $SRC/haystack +COPY .clusterfuzzlite/build.sh $SRC/ diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh new file mode 100755 index 0000000000..711dcca766 --- /dev/null +++ b/.clusterfuzzlite/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash -eu +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +# Builds the Atheris fuzz targets for ClusterFuzzLite / OSS-Fuzz. +# `compile_python_fuzzer` is provided by the base-builder-python image. + +pip3 install . + +for harness in "$SRC"/haystack/test/fuzz/fuzz_*.py; do + compile_python_fuzzer "$harness" +done diff --git a/.clusterfuzzlite/project.yaml b/.clusterfuzzlite/project.yaml new file mode 100644 index 0000000000..31cda8cec3 --- /dev/null +++ b/.clusterfuzzlite/project.yaml @@ -0,0 +1,5 @@ +# Project configuration for ClusterFuzzLite (and a starting point for OSS-Fuzz). 
+# https://google.github.io/clusterfuzzlite/ +language: python +sanitizers: + - address diff --git a/.github/dependabot.yml b/.github/dependabot.yml index d7d1bea410..2d6a7bff63 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -31,3 +31,11 @@ updates: interval: 'daily' cooldown: default-days: 1 + + # Keeps the digest-pinned OSS-Fuzz base-builder in .clusterfuzzlite/Dockerfile fresh. + - package-ecosystem: 'docker' + directory: '/.clusterfuzzlite' + schedule: + interval: 'daily' + cooldown: + default-days: 1 diff --git a/.github/workflows/cflite_pr.yml b/.github/workflows/cflite_pr.yml new file mode 100644 index 0000000000..51493e080d --- /dev/null +++ b/.github/workflows/cflite_pr.yml @@ -0,0 +1,41 @@ +name: ClusterFuzzLite PR fuzzing + +# Short, code-change-scoped fuzzing run on PRs that touch fuzzed code or the +# fuzzing setup. Catches regressions and crashes introduced by a change. +# Continuous/batch fuzzing can be added later as a separate scheduled workflow. +on: + pull_request: + paths: + - "haystack/**" + - "test/fuzz/**" + - ".clusterfuzzlite/**" + - ".github/workflows/cflite_pr.yml" + +permissions: + contents: read + +jobs: + pr-fuzzing: + runs-on: ubuntu-latest + concurrency: + group: cflite-pr-${{ github.event.pull_request.number }} + cancel-in-progress: true + steps: + - name: Build Fuzzers + id: build + uses: google/clusterfuzzlite/actions/build_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1 + with: + language: python + sanitizer: address + + - name: Run Fuzzers + id: run + uses: google/clusterfuzzlite/actions/run_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + fuzz-seconds: 180 + mode: code-change + sanitizer: address + # Crashes fail the job and are uploaded as artifacts. SARIF upload is + # disabled to keep the token least-privilege (no security-events: write). 
+ output-sarif: false diff --git a/licenserc.toml b/licenserc.toml index c0fcb56ea4..68b30ed1ef 100644 --- a/licenserc.toml +++ b/licenserc.toml @@ -1,6 +1,7 @@ headerPath = "license-header.txt" excludes = [ + ".clusterfuzzlite", "*.csv", "*.feature", "*.html", diff --git a/test/fuzz/README.md b/test/fuzz/README.md new file mode 100644 index 0000000000..4e48dd2834 --- /dev/null +++ b/test/fuzz/README.md @@ -0,0 +1,42 @@ +# Fuzz targets + +[Atheris](https://github.com/google/atheris) fuzz harnesses for Haystack's +untrusted-input entry points. They are wired into CI via +[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) +(see [`.clusterfuzzlite/`](../../.clusterfuzzlite) and the +`ClusterFuzzLite PR fuzzing` workflow). + +| Harness | Target | Why | +|---|---|---| +| `fuzz_pipeline_loads.py` | `Pipeline.loads` | Deserializing a serialized pipeline (YAML) is a documented attack surface. | +| `fuzz_document_from_dict.py` | `Document.from_dict` | Reconstructing a `Document` from an untrusted dict. | +| `fuzz_filters.py` | `document_matches_filter` | Evaluating an untrusted filter expression. | + +Each harness catches the exceptions that are a *normal* reaction to malformed +input; anything else (a crash, unbounded recursion, a hang, or an unexpected +exception type) is reported by Atheris as a finding. The "expected" exception +lists can be tightened over time to surface more subtle bugs. + +## Run locally + +```sh +pip install atheris +pip install -e . + +# Fuzz for a bit (Ctrl-C to stop); -atheris_runs limits the number of inputs. 
+python test/fuzz/fuzz_pipeline_loads.py -atheris_runs=100000 +python test/fuzz/fuzz_document_from_dict.py -atheris_runs=100000 +python test/fuzz/fuzz_filters.py -atheris_runs=100000 +``` + +Pass a directory argument to use/grow a seed corpus, and a crashing input file +to reproduce a finding: + +```sh +python test/fuzz/fuzz_pipeline_loads.py corpus/ # use corpus dir +python test/fuzz/fuzz_pipeline_loads.py crash-<hash> # reproduce a crash +``` + +> Note: Atheris builds a native extension and is not part of the dev +> dependencies; install it on demand as shown above. `pytest` does not collect +> these files (they are named `fuzz_*.py`, not `test_*.py`). diff --git a/test/fuzz/fuzz_document_from_dict.py b/test/fuzz/fuzz_document_from_dict.py new file mode 100644 index 0000000000..8a646f093b --- /dev/null +++ b/test/fuzz/fuzz_document_from_dict.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Fuzz target for ``Document.from_dict`` — deserializing untrusted document dicts.""" + +import json +import sys + +import atheris + +with atheris.instrument_imports(): + from haystack.dataclasses import Document + +# Normal reactions to malformed input; anything else is a genuine finding. 
+_EXPECTED = (ValueError, TypeError, KeyError) + + +def TestOneInput(data: bytes) -> None: + """Decode fuzzer bytes into a JSON object and feed it to ``Document.from_dict``.""" + fdp = atheris.FuzzedDataProvider(data) + raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + try: + obj = json.loads(raw) + except (ValueError, RecursionError): + return + if not isinstance(obj, dict): + return + try: + Document.from_dict(obj) + except _EXPECTED: + pass + + +def main() -> None: + """Set up and run the Atheris fuzzing loop.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/test/fuzz/fuzz_filters.py b/test/fuzz/fuzz_filters.py new file mode 100644 index 0000000000..9c8b46015d --- /dev/null +++ b/test/fuzz/fuzz_filters.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Fuzz target for ``document_matches_filter`` — evaluating untrusted filter expressions.""" + +import json +import sys + +import atheris + +with atheris.instrument_imports(): + from haystack.dataclasses import Document + from haystack.errors import FilterError + from haystack.utils.filters import document_matches_filter + +# A fixed document is enough; we are fuzzing the filter expression, not the document. +_DOCUMENT = Document(content="the quick brown fox", meta={"page": 1, "name": "fuzz"}) + +# Normal reactions to malformed filters; anything else is a genuine finding. 
+_EXPECTED = (FilterError, ValueError, TypeError, KeyError) + + +def TestOneInput(data: bytes) -> None: + """Decode fuzzer bytes into a JSON filter dict and evaluate it against a document.""" + fdp = atheris.FuzzedDataProvider(data) + raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + try: + filters = json.loads(raw) + except (ValueError, RecursionError): + return + if not isinstance(filters, dict): + return + try: + document_matches_filter(filters, _DOCUMENT) + except _EXPECTED: + pass + + +def main() -> None: + """Set up and run the Atheris fuzzing loop.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/test/fuzz/fuzz_pipeline_loads.py b/test/fuzz/fuzz_pipeline_loads.py new file mode 100644 index 0000000000..472e6f3761 --- /dev/null +++ b/test/fuzz/fuzz_pipeline_loads.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Fuzz target for ``Pipeline.loads`` — deserializing untrusted serialized pipelines. + +Loading a serialized pipeline is an explicit attack surface (see SECURITY.md), so this +target feeds arbitrary fuzzer bytes through the YAML unmarshaller and ``from_dict``. +""" + +import sys + +import atheris + +with atheris.instrument_imports(): + from haystack import Pipeline + from haystack.core.errors import PipelineError + from haystack.errors import DeserializationError + +# Exceptions that are a normal reaction to malformed input. Anything else — a crash, +# unbounded recursion, a hang, or an unexpected exception type — is a genuine finding. 
+_EXPECTED = (DeserializationError, PipelineError, ValueError, TypeError, KeyError) + + +def TestOneInput(data: bytes) -> None: + """Feed one fuzzer-generated input to ``Pipeline.loads`` as a YAML document.""" + try: + Pipeline.loads(data) + except _EXPECTED: + pass + + +def main() -> None: + """Set up and run the Atheris fuzzing loop.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main()