From 9c9083b7c9af13b638dc70ddde4e9dc9266de4ee Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Tue, 2 Jun 2026 12:55:06 +0200
Subject: [PATCH 1/3] ci: add Atheris fuzz targets and ClusterFuzzLite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address the OpenSSF Scorecard Fuzzing check (0/10) and add real fuzzing of the
project's untrusted-input entry points.

- test/fuzz/: three Atheris harnesses — Pipeline.loads (serialized pipeline
  deserialization), Document.from_dict, and document_matches_filter (filter
  expressions). Each catches the exceptions that are a normal reaction to
  malformed input so only genuine crashes/hangs/unexpected errors are reported.
- .clusterfuzzlite/: Dockerfile + build.sh + project.yaml to build the harnesses
  with the OSS-Fuzz Python toolchain.
- .github/workflows/cflite_pr.yml: short, code-change-scoped ClusterFuzzLite run
  on PRs that touch fuzzed code, least-privilege token, SHA-pinned actions.
- licenserc.toml: exclude .clusterfuzzlite from the license-header check.

Scorecard detects this via both the `import atheris` harnesses and the
.clusterfuzzlite deployment. pytest does not collect fuzz_*.py.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .clusterfuzzlite/Dockerfile          | 12 +++++++
 .clusterfuzzlite/build.sh            | 13 ++++++++
 .clusterfuzzlite/project.yaml        |  5 +++
 .github/workflows/cflite_pr.yml      | 41 ++++++++++++++++++++++++
 licenserc.toml                       |  1 +
 test/fuzz/README.md                  | 42 +++++++++++++++++++++++++
 test/fuzz/fuzz_document_from_dict.py | 42 +++++++++++++++++++++++++
 test/fuzz/fuzz_filters.py            | 47 ++++++++++++++++++++++++++++
 test/fuzz/fuzz_pipeline_loads.py     | 40 +++++++++++++++++++++++
 9 files changed, 243 insertions(+)
 create mode 100644 .clusterfuzzlite/Dockerfile
 create mode 100755 .clusterfuzzlite/build.sh
 create mode 100644 .clusterfuzzlite/project.yaml
 create mode 100644 .github/workflows/cflite_pr.yml
 create mode 100644 test/fuzz/README.md
 create mode 100644 test/fuzz/fuzz_document_from_dict.py
 create mode 100644 test/fuzz/fuzz_filters.py
 create mode 100644 test/fuzz/fuzz_pipeline_loads.py

diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
new file mode 100644
index 0000000000..38623deffb
--- /dev/null
+++ b/.clusterfuzzlite/Dockerfile
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ClusterFuzzLite / OSS-Fuzz build environment for the Haystack Atheris fuzz targets.
+# The base image is intentionally left as the rolling OSS-Fuzz tag: the fuzzing
+# toolchain expects the latest base-builder, and OSS-Fuzz/ClusterFuzzLite manage it.
+FROM gcr.io/oss-fuzz-base/base-builder-python
+
+COPY . $SRC/haystack
+WORKDIR $SRC/haystack
+COPY .clusterfuzzlite/build.sh $SRC/
diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh
new file mode 100755
index 0000000000..711dcca766
--- /dev/null
+++ b/.clusterfuzzlite/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -eu
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ClusterFuzzLite / OSS-Fuzz build script: installs the project found in the current
+# directory, then compiles every test/fuzz/fuzz_*.py Atheris harness into a fuzz target.
+# `compile_python_fuzzer` is provided by the base-builder-python image.
+pip3 install .
+
+for fuzz_target in "${SRC}/haystack/test/fuzz"/fuzz_*.py; do
+  compile_python_fuzzer "${fuzz_target}"
+done
diff --git a/.clusterfuzzlite/project.yaml b/.clusterfuzzlite/project.yaml
new file mode 100644
index 0000000000..31cda8cec3
--- /dev/null
+++ b/.clusterfuzzlite/project.yaml
@@ -0,0 +1,5 @@
+# Project configuration for ClusterFuzzLite (and a starting point for OSS-Fuzz).
+# https://google.github.io/clusterfuzzlite/
+language: python
+sanitizers:
+  - address
diff --git a/.github/workflows/cflite_pr.yml b/.github/workflows/cflite_pr.yml
new file mode 100644
index 0000000000..51493e080d
--- /dev/null
+++ b/.github/workflows/cflite_pr.yml
@@ -0,0 +1,41 @@
+name: ClusterFuzzLite PR fuzzing
+
+# Short, code-change-scoped fuzzing run on PRs that touch fuzzed code or the
+# fuzzing setup. Catches regressions and crashes introduced by a change.
+# Continuous/batch fuzzing can be added later as a separate scheduled workflow.
+on:
+  pull_request:
+    paths:
+      - "haystack/**"
+      - "test/fuzz/**"
+      - ".clusterfuzzlite/**"
+      - ".github/workflows/cflite_pr.yml"
+
+permissions:
+  contents: read
+
+jobs:
+  pr-fuzzing:
+    runs-on: ubuntu-latest
+    concurrency:
+      group: cflite-pr-${{ github.event.pull_request.number }}
+      cancel-in-progress: true
+    steps:
+      - name: Build Fuzzers
+        id: build
+        uses: google/clusterfuzzlite/actions/build_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          language: python
+          sanitizer: address
+
+      - name: Run Fuzzers
+        id: run
+        uses: google/clusterfuzzlite/actions/run_fuzzers@884713a6c30a92e5e8544c39945cd7cb630abcd1 # v1
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          fuzz-seconds: 180
+          mode: code-change
+          sanitizer: address
+          # Crashes fail the job and are uploaded as artifacts. SARIF upload is
+          # disabled to keep the token least-privilege (no security-events: write).
+          output-sarif: false
diff --git a/licenserc.toml b/licenserc.toml
index c0fcb56ea4..68b30ed1ef 100644
--- a/licenserc.toml
+++ b/licenserc.toml
@@ -1,6 +1,7 @@
 headerPath = "license-header.txt"
 
 excludes = [
+    ".clusterfuzzlite",
     "*.csv",
     "*.feature",
     "*.html",
diff --git a/test/fuzz/README.md b/test/fuzz/README.md
new file mode 100644
index 0000000000..4e48dd2834
--- /dev/null
+++ b/test/fuzz/README.md
@@ -0,0 +1,42 @@
+# Fuzz targets
+
+[Atheris](https://github.com/google/atheris) fuzz harnesses for Haystack's
+untrusted-input entry points. They are wired into CI via
+[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/)
+(see [`.clusterfuzzlite/`](../../.clusterfuzzlite) and the
+`ClusterFuzzLite PR fuzzing` workflow).
+
+| Harness | Target | Why |
+|---|---|---|
+| `fuzz_pipeline_loads.py` | `Pipeline.loads` | Deserializing a serialized pipeline (YAML) is a documented attack surface. |
+| `fuzz_document_from_dict.py` | `Document.from_dict` | Reconstructing a `Document` from an untrusted dict. |
+| `fuzz_filters.py` | `document_matches_filter` | Evaluating an untrusted filter expression. |
+
+Each harness catches the exceptions that are a *normal* reaction to malformed
+input; anything else (a crash, unbounded recursion, a hang, or an unexpected
+exception type) is reported by Atheris as a finding. The "expected" exception
+lists can be tightened over time to surface more subtle bugs.
+
+## Run locally
+
+```sh
+pip install atheris
+pip install -e .
+
+# Fuzz for a bit (Ctrl-C to stop); -atheris_runs limits the number of inputs.
+python test/fuzz/fuzz_pipeline_loads.py -atheris_runs=100000
+python test/fuzz/fuzz_document_from_dict.py -atheris_runs=100000
+python test/fuzz/fuzz_filters.py -atheris_runs=100000
+```
+
+Pass a directory to use (and grow) a seed corpus, or pass the path of a crashing
+input file to reproduce a finding:
+
+```sh
+python test/fuzz/fuzz_pipeline_loads.py corpus/            # use corpus dir
+python test/fuzz/fuzz_pipeline_loads.py crash-<hash>       # reproduce a crash
+```
+
+> Note: Atheris builds a native extension and is not part of the dev
+> dependencies; install it on demand as shown above. `pytest` does not collect
+> these files (they are named `fuzz_*.py`, not `test_*.py`).
diff --git a/test/fuzz/fuzz_document_from_dict.py b/test/fuzz/fuzz_document_from_dict.py
new file mode 100644
index 0000000000..8a646f093b
--- /dev/null
+++ b/test/fuzz/fuzz_document_from_dict.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``Document.from_dict`` — deserializing untrusted document dicts."""
+
+import json
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack.dataclasses import Document
+
+# Normal reactions to malformed input; anything else is a genuine finding.
+_EXPECTED = (ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Decode fuzzer bytes into a JSON object and feed it to ``Document.from_dict``."""
+    fdp = atheris.FuzzedDataProvider(data)
+    raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+    try:
+        obj = json.loads(raw)
+    except (ValueError, RecursionError):
+        return
+    if not isinstance(obj, dict):
+        return
+    try:
+        Document.from_dict(obj)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Register ``TestOneInput`` with Atheris, forwarding ``sys.argv``, then start fuzzing."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/fuzz/fuzz_filters.py b/test/fuzz/fuzz_filters.py
new file mode 100644
index 0000000000..9c8b46015d
--- /dev/null
+++ b/test/fuzz/fuzz_filters.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``document_matches_filter`` — evaluating untrusted filter expressions."""
+
+import json
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack.dataclasses import Document
+    from haystack.errors import FilterError
+    from haystack.utils.filters import document_matches_filter
+
+# A fixed document is enough; we are fuzzing the filter expression, not the document.
+_DOCUMENT = Document(content="the quick brown fox", meta={"page": 1, "name": "fuzz"})
+
+# Normal reactions to malformed filters; anything else is a genuine finding.
+_EXPECTED = (FilterError, ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Decode fuzzer bytes into a JSON filter dict and evaluate it against a document."""
+    fdp = atheris.FuzzedDataProvider(data)
+    raw = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+    try:
+        filters = json.loads(raw)
+    except (ValueError, RecursionError):
+        return
+    if not isinstance(filters, dict):
+        return
+    try:
+        document_matches_filter(filters, _DOCUMENT)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Register ``TestOneInput`` with Atheris, forwarding ``sys.argv``, then start fuzzing."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/fuzz/fuzz_pipeline_loads.py b/test/fuzz/fuzz_pipeline_loads.py
new file mode 100644
index 0000000000..472e6f3761
--- /dev/null
+++ b/test/fuzz/fuzz_pipeline_loads.py
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fuzz target for ``Pipeline.loads`` — deserializing untrusted serialized pipelines.
+
+Loading a serialized pipeline is an explicit attack surface (see SECURITY.md), so this
+target feeds arbitrary fuzzer bytes through the YAML unmarshaller and ``from_dict``.
+"""
+
+import sys
+
+import atheris
+
+with atheris.instrument_imports():
+    from haystack import Pipeline
+    from haystack.core.errors import PipelineError
+    from haystack.errors import DeserializationError
+
+# Exceptions that are a normal reaction to malformed input. Anything else — a crash,
+# unbounded recursion, a hang, or an unexpected exception type — is a genuine finding.
+_EXPECTED = (DeserializationError, PipelineError, ValueError, TypeError, KeyError)
+
+
+def TestOneInput(data: bytes) -> None:
+    """Pass the raw fuzzer bytes to ``Pipeline.loads``, swallowing only exceptions in ``_EXPECTED``."""
+    try:
+        Pipeline.loads(data)
+    except _EXPECTED:
+        pass
+
+
+def main() -> None:
+    """Register ``TestOneInput`` with Atheris, forwarding ``sys.argv``, then start fuzzing."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()

From c49cecc01d88eb58848c5c741d28ce118a70d11f Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Tue, 2 Jun 2026 12:58:16 +0200
Subject: [PATCH 2/3] ci: pin ClusterFuzzLite base image by digest

Pin gcr.io/oss-fuzz-base/base-builder-python to its current digest instead of
the rolling latest tag, for supply-chain integrity. The OSS-Fuzz base-builder
is updated frequently, so the comment documents how to refresh the digest.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .clusterfuzzlite/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.clusterfuzzlite/Dockerfile b/.clusterfuzzlite/Dockerfile
index 38623deffb..fcf69e27a1 100644
--- a/.clusterfuzzlite/Dockerfile
+++ b/.clusterfuzzlite/Dockerfile
@@ -3,9 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # ClusterFuzzLite / OSS-Fuzz build environment for the Haystack Atheris fuzz targets.
-# The base image is intentionally left as the rolling OSS-Fuzz tag: the fuzzing
-# toolchain expects the latest base-builder, and OSS-Fuzz/ClusterFuzzLite manage it.
-FROM gcr.io/oss-fuzz-base/base-builder-python
+# Pinned by digest for supply-chain integrity. Bump periodically (the OSS-Fuzz
+# base-builder is updated frequently with toolchain fixes); resolve a fresh digest with:
+#   docker buildx imagetools inspect gcr.io/oss-fuzz-base/base-builder-python:latest --format '{{.Manifest.Digest}}'
+FROM gcr.io/oss-fuzz-base/base-builder-python@sha256:bdae8ffe13ebbaf3b653f0a5082d8d72e108d8cd9eed1fef1a85d8350efa3fbf
 
 COPY . $SRC/haystack
 WORKDIR $SRC/haystack

From 7bd8881f640b1c473380b321a2c63f7e8ed29c9b Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Tue, 2 Jun 2026 13:02:49 +0200
Subject: [PATCH 3/3] ci: track ClusterFuzzLite base image with Dependabot

Add a docker ecosystem entry for /.clusterfuzzlite so Dependabot keeps the
digest-pinned gcr.io/oss-fuzz-base/base-builder-python in Dockerfile up to date.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/dependabot.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index d7d1bea410..2d6a7bff63 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -31,3 +31,11 @@ updates:
       interval: 'daily'
     cooldown:
       default-days: 1
+
+  # Keeps the digest-pinned OSS-Fuzz base-builder in .clusterfuzzlite/Dockerfile fresh.
+  - package-ecosystem: 'docker'
+    directory: '/.clusterfuzzlite'
+    schedule:
+      interval: 'daily'
+    cooldown:
+      default-days: 1