From e3c475698ac1c164a2d53a36689e917e9862811d Mon Sep 17 00:00:00 2001 From: Adam Korczynski Date: Wed, 18 Feb 2026 21:10:26 +0000 Subject: [PATCH 1/2] Add module fuzzers Signed-off-by: Adam Korczynski --- module-fuzzers/fuzz_crypto.cpp | 503 +++++++++++++ module-fuzzers/fuzz_dataops.cpp | 1166 +++++++++++++++++++++++++++++++ module-fuzzers/fuzz_decode.cpp | 1029 +++++++++++++++++++++++++++ module-fuzzers/fuzz_helpers.h | 139 ++++ module-fuzzers/fuzz_ioops.cpp | 1015 +++++++++++++++++++++++++++ module-fuzzers/fuzz_parsers.cpp | 744 ++++++++++++++++++++ module-fuzzers/fuzz_textops.cpp | 467 +++++++++++++ 7 files changed, 5063 insertions(+) create mode 100644 module-fuzzers/fuzz_crypto.cpp create mode 100644 module-fuzzers/fuzz_dataops.cpp create mode 100644 module-fuzzers/fuzz_decode.cpp create mode 100644 module-fuzzers/fuzz_helpers.h create mode 100644 module-fuzzers/fuzz_ioops.cpp create mode 100644 module-fuzzers/fuzz_parsers.cpp create mode 100644 module-fuzzers/fuzz_textops.cpp diff --git a/module-fuzzers/fuzz_crypto.cpp b/module-fuzzers/fuzz_crypto.cpp new file mode 100644 index 0000000..1ce4b9e --- /dev/null +++ b/module-fuzzers/fuzz_crypto.cpp @@ -0,0 +1,503 @@ +// fuzz_crypto.cpp — Fuzzer for CPython's hash and HMAC C extension modules. +// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// _md5, _sha1, _sha2 — MD5, SHA-1, SHA-224/256/384/512 +// _sha3 — SHA3-224/256/384/512, SHAKE-128/256 +// _blake2 — BLAKE2b (64-byte key/16-byte salt/person), +// BLAKE2s (32-byte key/8-byte salt/person) +// _hmac — Low-level compute_md5/sha1/sha256/sha512 +// hmac (Python module) — hmac.new(), hmac.digest(), hmac.compare_digest() +// hashlib (Python module) — hashlib.new(), hashlib.pbkdf2_hmac(), +// hashlib.file_digest() +// +// The first byte of fuzz input selects one of 13 operation types. 
Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (algorithm choice, key/salt/data sizes, action sequences). +// +// Operations fall into two categories: +// +// Chained — Create a hash/HMAC object, then loop up to 100 actions +// chosen from: .update(data), .digest(), .hexdigest(), .copy().digest(), +// and reading .name/.digest_size/.block_size attributes. Used for +// standard hashes, SHAKE (variable-length digest), BLAKE2 (keyed + +// variable digest_size), hmac.new(), and hashlib.new(). +// +// One-shot — A single function call: _hmac.compute_*(key, msg), +// hmac.digest(key, msg, algo), hmac.compare_digest(a, b), +// hashlib.file_digest(BytesIO, algo), hashlib.pbkdf2_hmac(algo, pw, salt, 1). +// +// All module functions and constructors are imported once during init and +// cached as static PyObject* pointers. PyRef (RAII) prevents reference leaks. +// PyGC_Collect() runs every 200 iterations. Max input size: 1 MB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. 
+// ---------------------------------------------------------------------------
+
+// Constructors taken directly from the C hash modules; populated by
+// init_crypto().
+static PyObject *ctor_md5, *ctor_sha1;
+static PyObject *ctor_sha224, *ctor_sha256, *ctor_sha384, *ctor_sha512;
+static PyObject *ctor_sha3_224, *ctor_sha3_256, *ctor_sha3_384, *ctor_sha3_512;
+static PyObject *ctor_shake_128, *ctor_shake_256;
+static PyObject *ctor_blake2b, *ctor_blake2s;
+
+// Constructors selectable by OP_HASH_CHAIN. The table stores the address of
+// each global so it can be declared before the globals are filled in.
+static PyObject **all_hash_ctors[] = {
+    &ctor_md5,      &ctor_sha1,     &ctor_sha224,   &ctor_sha256,
+    &ctor_sha384,   &ctor_sha512,   &ctor_sha3_224, &ctor_sha3_256,
+    &ctor_sha3_384, &ctor_sha3_512, &ctor_blake2b,  &ctor_blake2s,
+};
+static constexpr int kNumHashCtors =
+    sizeof(all_hash_ctors) / sizeof(all_hash_ctors[0]);
+
+// Constructors selectable by OP_SHAKE_CHAIN.
+static PyObject **shake_ctors[] = {&ctor_shake_128, &ctor_shake_256};
+static constexpr int kNumShakeCtors =
+    sizeof(shake_ctors) / sizeof(shake_ctors[0]);
+
+// _hmac.compute_* callables that could be resolved; the _hmac module is
+// optional, so the count may stay at zero.
+static PyObject *hmac_compute_funcs[4];
+static int num_hmac_compute_funcs = 0;
+
+static PyObject *hashlib_new, *hashlib_pbkdf2_hmac, *hashlib_file_digest;
+static PyObject *py_hmac_new, *py_hmac_digest, *py_hmac_compare_digest;
+static PyObject *bytesio_ctor;
+
+// Algorithm names offered to hmac.new() by OP_PYHMAC_CHAIN.
+static const char *kHmacAlgos[] = {
+    "md5", "sha224", "sha256", "sha384", "sha512", "sha3_256", "blake2s",
+};
+static constexpr int kNumHmacAlgos =
+    sizeof(kHmacAlgos) / sizeof(kHmacAlgos[0]);
+
+// Algorithm names offered to hashlib.pbkdf2_hmac() by OP_PBKDF2.
+static const char *kPbkdf2Algos[] = {"sha1", "sha256", "sha512"};
+static constexpr int kNumPbkdf2Algos =
+    sizeof(kPbkdf2Algos) / sizeof(kPbkdf2Algos[0]);
+
+// Algorithm names offered to hashlib.new() / hashlib.file_digest().
+static const char *kHashlibAlgos[] = {"md5", "sha256", "sha3_256", "sha512"};
+static constexpr int kNumHashlibAlgos =
+    sizeof(kHashlibAlgos) / sizeof(kHashlibAlgos[0]);
+
+static unsigned long gc_counter = 0;
+
+static int initialized = 0;
+
+// Resolve and cache every Python callable this fuzzer uses. Runs its body
+// only on the first call; later calls return immediately.
+//
+// NOTE(review): import_attr() comes from fuzz_helpers.h, which is not shown
+// here; it presumably returns a new reference to module.attr — confirm its
+// failure behaviour there.
+static void init_crypto(void) {
+    if (initialized) return;
+
+    // One row per direct constructor: destination global, module, attribute.
+    struct CtorSpec {
+        PyObject **slot;
+        const char *module;
+        const char *attr;
+    };
+    const CtorSpec specs[] = {
+        {&ctor_md5, "_md5", "md5"},
+        {&ctor_sha1, "_sha1", "sha1"},
+        {&ctor_sha224, "_sha2", "sha224"},
+        {&ctor_sha256, "_sha2", "sha256"},
+        {&ctor_sha384, "_sha2", "sha384"},
+        {&ctor_sha512, "_sha2", "sha512"},
+        {&ctor_sha3_224, "_sha3", "sha3_224"},
+        {&ctor_sha3_256, "_sha3", "sha3_256"},
+        {&ctor_sha3_384, "_sha3", "sha3_384"},
+        {&ctor_sha3_512, "_sha3", "sha3_512"},
+        {&ctor_shake_128, "_sha3", "shake_128"},
+        {&ctor_shake_256, "_sha3", "shake_256"},
+        {&ctor_blake2b, "_blake2", "blake2b"},
+        {&ctor_blake2s, "_blake2", "blake2s"},
+    };
+    for (const CtorSpec &spec : specs) {
+        *spec.slot = import_attr(spec.module, spec.attr);
+    }
+
+    // _hmac is optional: a failed import or a missing compute_* function is
+    // cleared and simply leaves fewer entries in hmac_compute_funcs.
+    PyObject *hmac_mod = PyImport_ImportModule("_hmac");
+    if (hmac_mod == NULL) {
+        PyErr_Clear();
+    } else {
+        const char *compute_names[] = {
+            "compute_md5", "compute_sha1", "compute_sha256", "compute_sha512",
+        };
+        for (const char *compute_name : compute_names) {
+            PyObject *fn = PyObject_GetAttrString(hmac_mod, compute_name);
+            if (fn == NULL) {
+                PyErr_Clear();
+                continue;
+            }
+            hmac_compute_funcs[num_hmac_compute_funcs++] = fn;
+        }
+        Py_DECREF(hmac_mod);
+    }
+
+    hashlib_new = import_attr("hashlib", "new");
+    hashlib_pbkdf2_hmac = import_attr("hashlib", "pbkdf2_hmac");
+    hashlib_file_digest = import_attr("hashlib", "file_digest");
+    py_hmac_new = import_attr("hmac", "new");
+    py_hmac_digest = import_attr("hmac", "digest");
+    py_hmac_compare_digest = import_attr("hmac", "compare_digest");
+    bytesio_ctor = import_attr("io", "BytesIO");
+
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Chained action loop — shared by OP_HASH_CHAIN, OP_BLAKE2*_KEYED,
+// OP_BLAKE2*_VARDIGEST, OP_PYHMAC_CHAIN, and OP_HASHLIB_CHAIN.
+// (OP_SHAKE_CHAIN runs its own loop because SHAKE digests take a length.)
+//
+// Takes a borrowed reference to a hash-like object and loops up to 100
+// fuzz-driven actions: .update(data), .digest(), .hexdigest(),
+// .copy().digest(), and attribute reads (.name, .digest_size, .block_size).
+// ---------------------------------------------------------------------------
+
+// h:   borrowed reference to the hash/HMAC object to drive.
+// fdp: fuzz input; one selector value is consumed per action, plus a length
+//      and payload for .update().
+//
+// The loop ends when the input is exhausted or after 100 actions. Any error
+// still pending when the loop finishes is cleared.
+// NOTE(review): CHECK() is defined in fuzz_helpers.h (not shown); it
+// presumably clears the error and returns when its argument is NULL.
+static void chain_hash_actions(PyObject *h, FuzzedDataProvider &fdp) {
+    for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
+        switch (fdp.ConsumeIntegralInRange<int>(0, 4)) {
+        case 0: { // .update(data)
+            // The bounds are an int literal and a size_t, so the template
+            // argument must be explicit: deduction from the two arguments
+            // would see conflicting types and fail to compile.
+            std::string data = fdp.ConsumeBytesAsString(
+                fdp.ConsumeIntegralInRange<size_t>(
+                    0, std::min(fdp.remaining_bytes(), (size_t)10000)));
+            PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
+            CHECK(r);
+            break;
+        }
+        case 1: { // .digest()
+            PyRef d = PyObject_CallMethod(h, "digest", NULL);
+            CHECK(d);
+            break;
+        }
+        case 2: { // .hexdigest()
+            PyRef d = PyObject_CallMethod(h, "hexdigest", NULL);
+            CHECK(d);
+            break;
+        }
+        case 3: { // .copy().digest()
+            PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
+            CHECK(h2);
+            PyRef d = PyObject_CallMethod(h2, "digest", NULL);
+            CHECK(d);
+            break;
+        }
+        case 4: { // .name, .digest_size, .block_size
+            PyRef n = PyObject_GetAttrString(h, "name");
+            CHECK(n);
+            PyRef ds = PyObject_GetAttrString(h, "digest_size");
+            CHECK(ds);
+            PyRef bs = PyObject_GetAttrString(h, "block_size");
+            CHECK(bs);
+            break;
+        }
+        }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Operations (13 ops).
+// ---------------------------------------------------------------------------
+
+// OP_HASH_CHAIN: Create a hash object from one of 12 C module constructors
+// (_md5.md5, _sha1.sha1, _sha2.sha224/256/384/512, _sha3.sha3_224/256/384/512,
+// _blake2.blake2b/s) with fuzz-chosen initial data, then run chained actions.
+static void op_hash_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
+    // Up to 10000 bytes of initial data passed straight to the constructor.
+    std::string init = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, 10000));
+    PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_SHAKE_CHAIN: Create a SHAKE-128 or SHAKE-256 XOF object, then loop
+// up to 100 actions: .update(data), .digest(variable_length), or
+// .copy().digest(variable_length). Exercises the variable-output-length
+// code paths in _sha3.
+//
+// ctor: borrowed reference to the SHAKE constructor to call.
+// fdp:  fuzz input supplying the initial data, the action selectors, the
+//       update payloads and the requested digest lengths (1..10000).
+static void op_shake_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
+    std::string init = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, 10000));
+    PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
+    CHECK(h);
+    for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
+        switch (fdp.ConsumeIntegralInRange<int>(0, 2)) {
+        case 0: { // .update(data)
+            // Explicit <size_t>: the bounds are an int literal and a size_t,
+            // which template deduction cannot reconcile on its own.
+            std::string data = fdp.ConsumeBytesAsString(
+                fdp.ConsumeIntegralInRange<size_t>(
+                    0, std::min(fdp.remaining_bytes(), (size_t)10000)));
+            PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
+            CHECK(r);
+            break;
+        }
+        case 1: { // .digest(len)
+            int len = fdp.ConsumeIntegralInRange<int>(1, 10000);
+            PyRef d = PyObject_CallMethod(h, "digest", "i", len);
+            CHECK(d);
+            break;
+        }
+        case 2: { // .copy().digest(len)
+            PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
+            CHECK(h2);
+            int len = fdp.ConsumeIntegralInRange<int>(1, 10000);
+            PyRef d = PyObject_CallMethod(h2, "digest", "i", len);
+            CHECK(d);
+            break;
+        }
+        }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_BLAKE2B_KEYED / OP_BLAKE2S_KEYED: Create a BLAKE2 object with
+// fuzz-chosen key, salt, and person parameters (up to max_key/max_salt/
+// max_person bytes respectively), then run chained hash actions.
+// BLAKE2b: key<=64, salt<=16, person<=16. BLAKE2s: key<=32, salt<=8, person<=8.
+static void op_blake2_keyed(PyObject *ctor, int max_key, int max_salt,
+                            int max_person, FuzzedDataProvider &fdp) {
+    // Consumption order (key, salt, person, data) defines the input layout.
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, max_key));
+    std::string salt = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, max_salt));
+    std::string person = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, max_person));
+    std::string data = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, 10000));
+
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef k = PyBytes_FromStringAndSize(Y(key));
+    CHECK(k);
+    PyRef s = PyBytes_FromStringAndSize(Y(salt));
+    CHECK(s);
+    PyRef p = PyBytes_FromStringAndSize(Y(person));
+    CHECK(p);
+    // PyDict_SetItemString returns -1 with an exception set on failure.
+    // Bail out so the constructor is never called with an error pending.
+    if (PyDict_SetItemString(kwargs, "key", k) < 0 ||
+        PyDict_SetItemString(kwargs, "salt", s) < 0 ||
+        PyDict_SetItemString(kwargs, "person", p) < 0) {
+        PyErr_Clear();
+        return;
+    }
+
+    PyRef d = PyBytes_FromStringAndSize(Y(data));
+    CHECK(d);
+    PyRef args = PyTuple_Pack(1, (PyObject *)d);
+    CHECK(args);
+    PyRef h = PyObject_Call(ctor, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_BLAKE2B_VARDIGEST / OP_BLAKE2S_VARDIGEST: Create a BLAKE2 object with
+// a fuzz-chosen digest_size (1 to max_ds bytes), then run chained actions.
+// Exercises the variable output length code path in _blake2.
+static void op_blake2_vardigest(PyObject *ctor, int max_ds,
+                                FuzzedDataProvider &fdp) {
+    int ds = fdp.ConsumeIntegralInRange<int>(1, max_ds);
+    std::string data = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(0, 10000));
+
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef dsobj = PyLong_FromLong(ds);
+    CHECK(dsobj);
+    if (PyDict_SetItemString(kwargs, "digest_size", dsobj) < 0) {
+        PyErr_Clear();
+        return;
+    }
+
+    PyRef d = PyBytes_FromStringAndSize(Y(data));
+    CHECK(d);
+    PyRef args = PyTuple_Pack(1, (PyObject *)d);
+    CHECK(args);
+    PyRef h = PyObject_Call(ctor, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_HMAC_COMPUTE: One-shot call to one of _hmac.compute_md5/sha1/sha256/sha512
+// with fuzz-chosen key and message. These are the low-level C implementations
+// of HMAC in the _hmac module (not the Python hmac wrapper).
+static void op_hmac_compute(PyObject *func, FuzzedDataProvider &fdp) {
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(1, 10000));
+    // ConsumeBytesAsString can return fewer bytes than requested when the
+    // input runs out, so guarantee a non-empty key.
+    if (key.empty()) key.push_back('\x00');
+    std::string msg = fdp.ConsumeRemainingBytesAsString();
+    PyRef r = PyObject_CallFunction(func, "y#y#", Y(key), Y(msg));
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PYHMAC_CHAIN: Create an HMAC object via hmac.new(key, digestmod=algo)
+// where algo is fuzz-chosen from {md5, sha224, sha256, sha384, sha512,
+// sha3_256, blake2s}, then run chained hash actions (update/digest/copy/etc).
+// Exercises the Python hmac module which delegates to C hash constructors.
+static void op_pyhmac_chain(const char *algo, FuzzedDataProvider &fdp) {
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(1, 10000));
+    if (key.empty()) key.push_back('\x00');
+
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef dm = PyUnicode_FromString(algo);
+    CHECK(dm);
+    if (PyDict_SetItemString(kwargs, "digestmod", dm) < 0) {
+        PyErr_Clear();
+        return;
+    }
+    PyRef kb = PyBytes_FromStringAndSize(Y(key));
+    CHECK(kb);
+    PyRef args = PyTuple_Pack(1, (PyObject *)kb);
+    CHECK(args);
+    PyRef h = PyObject_Call(py_hmac_new, args, kwargs);
+    CHECK(h);
+    chain_hash_actions(h, fdp);
+}
+
+// OP_HMAC_DIGEST: One-shot call to hmac.digest(key, msg, "sha256").
+// Exercises the fast single-call HMAC path without creating an HMAC object.
+static void op_hmac_digest(FuzzedDataProvider &fdp) {
+    std::string key = fdp.ConsumeBytesAsString(
+        fdp.ConsumeIntegralInRange<size_t>(1, 10000));
+    // The consumed key may be empty when the input is short; pad to 1 byte.
+    if (key.empty()) key.push_back('\x00');
+    std::string msg = fdp.ConsumeRemainingBytesAsString();
+    PyRef r = PyObject_CallFunction(py_hmac_digest, "y#y#s",
+                                    Y(key), Y(msg), "sha256");
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_HMAC_COMPARE: Compute HMAC-SHA256 of fuzz data, then call
+// hmac.compare_digest() against a zero-padded 32-byte buffer derived from
+// the same data. Exercises the constant-time comparison code path.
+static void op_hmac_compare(FuzzedDataProvider &fdp) {
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    // hmac.new() only accepts a bytes/bytearray key. The fixed key is passed
+    // with "y#" (bytes); "s" would build a str, so every call would raise
+    // TypeError and compare_digest() below would never run.
+    PyRef h = PyObject_CallFunction(py_hmac_new, "y#y#s",
+                                    "k", (Py_ssize_t)1, Y(data), "sha256");
+    CHECK(h);
+    PyRef dig = PyObject_CallMethod(h, "digest", NULL);
+    CHECK(dig);
+    // Right-hand operand: the first (up to) 32 input bytes, zero-padded to
+    // the SHA-256 digest length.
+    char padded[32] = {};
+    memcpy(padded, data.data(), data.size() < 32 ? data.size() : 32);
+    PyRef padobj = PyBytes_FromStringAndSize(padded, 32);
+    CHECK(padobj);
+    PyRef r = PyObject_CallFunction(py_hmac_compare_digest, "OO",
+                                    (PyObject *)dig, (PyObject *)padobj);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_HASHLIB_CHAIN: Create a hash object via hashlib.new(algo, data,
+// usedforsecurity=False) where algo is fuzz-chosen from {md5, sha256,
+// sha3_256, sha512}, then run chained actions. Unlike OP_HASH_CHAIN which
+// uses the C module constructors directly, this goes through hashlib's
+// dispatch logic (OpenSSL vs builtin).
+static void op_hashlib_chain(const char *algo, FuzzedDataProvider &fdp) {
+    // Initial payload: a fuzz-chosen length in [0, 10000], then that many
+    // bytes (fewer if the input runs out).
+    const int want = fdp.ConsumeIntegralInRange(0, 10000);
+    std::string payload = fdp.ConsumeBytesAsString(want);
+
+    // Positional arguments: (algo, payload).
+    PyRef algo_obj = PyUnicode_FromString(algo);
+    CHECK(algo_obj);
+    PyRef payload_obj = PyBytes_FromStringAndSize(Y(payload));
+    CHECK(payload_obj);
+    PyRef positional =
+        PyTuple_Pack(2, (PyObject *)algo_obj, (PyObject *)payload_obj);
+    CHECK(positional);
+
+    // Keyword arguments: usedforsecurity=False.
+    PyRef keywords = PyDict_New();
+    CHECK(keywords);
+    PyDict_SetItemString(keywords, "usedforsecurity", Py_False);
+
+    PyRef hasher = PyObject_Call(hashlib_new, positional, keywords);
+    CHECK(hasher);
+    chain_hash_actions(hasher, fdp);
+}
+
+// OP_HASHLIB_FILE_DIGEST: One-shot call to hashlib.file_digest(BytesIO(data),
+// algo) with fuzz-chosen algorithm, then .hexdigest(). Exercises the
+// file-based hashing path that reads from a file-like object.
+static void op_hashlib_file_digest(const char *algo, FuzzedDataProvider &fdp) {
+    // All remaining input becomes the in-memory "file" contents.
+    std::string contents = fdp.ConsumeRemainingBytesAsString();
+    PyRef stream = PyObject_CallFunction(bytesio_ctor, "y#", Y(contents));
+    CHECK(stream);
+    PyRef hasher = PyObject_CallFunction(hashlib_file_digest, "Os",
+                                         (PyObject *)stream, algo);
+    CHECK(hasher);
+    PyRef hex = PyObject_CallMethod(hasher, "hexdigest", NULL);
+    if (PyErr_Occurred()) {
+        PyErr_Clear();
+    }
+}
+
+// OP_PBKDF2: One-shot call to hashlib.pbkdf2_hmac(algo, password, salt, 1)
+// with fuzz-chosen algorithm from {sha1, sha256, sha512}. Uses 1 iteration
+// to keep execution fast while still exercising the PBKDF2 code path.
+static void op_pbkdf2(const char *algo, FuzzedDataProvider &fdp) {
+    // The salt is consumed first, the password takes whatever is left.
+    const int salt_len = fdp.ConsumeIntegralInRange(1, 10000);
+    std::string salt = fdp.ConsumeBytesAsString(salt_len);
+    if (salt.empty()) {
+        salt.push_back('\x00');
+    }
+    std::string password = fdp.ConsumeRemainingBytesAsString();
+    PyRef derived = PyObject_CallFunction(hashlib_pbkdf2_hmac, "sy#y#i",
+                                          algo, Y(password), Y(salt), 1);
+    if (PyErr_Occurred()) {
+        PyErr_Clear();
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// ---------------------------------------------------------------------------
+
+// Operation selectors. The enumerator order is the mapping from the first
+// value consumed from the fuzz input to an operation; NUM_OPS is the count.
+enum Op {
+    OP_HASH_CHAIN,
+    OP_SHAKE_CHAIN,
+    OP_BLAKE2B_KEYED,
+    OP_BLAKE2S_KEYED,
+    OP_BLAKE2B_VARDIGEST,
+    OP_BLAKE2S_VARDIGEST,
+    OP_HMAC_COMPUTE,
+    OP_PYHMAC_CHAIN,
+    OP_HMAC_DIGEST,
+    OP_HMAC_COMPARE,
+    OP_HASHLIB_CHAIN,
+    OP_HASHLIB_FILE_DIGEST,
+    OP_PBKDF2,
+    NUM_OPS
+};
+
+// Fuzzer entry point: consume one selector to choose the operation, then
+// (for most operations) one more value to choose a constructor or algorithm,
+// and hand the rest of the input to the operation. Always returns 0.
+//
+// Empty inputs and inputs larger than kMaxInputSize are ignored.
+// NOTE(review): kMaxInputSize and kGcInterval are not defined in the code
+// shown here; presumably they come from fuzz_helpers.h.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    // The interpreter must already be running before the first input.
+    assert(Py_IsInitialized());
+    init_crypto();
+    if (size < 1 || size > kMaxInputSize) return 0;
+    // Start from a clean error state.
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) {
+    case OP_HASH_CHAIN: {
+        int ci = fdp.ConsumeIntegralInRange(0, kNumHashCtors - 1);
+        op_hash_chain(*all_hash_ctors[ci], fdp);
+        break;
+    }
+    case OP_SHAKE_CHAIN: {
+        int ci = fdp.ConsumeIntegralInRange(0, kNumShakeCtors - 1);
+        op_shake_chain(*shake_ctors[ci], fdp);
+        break;
+    }
+    // The numeric arguments are the maximum key / salt / person lengths.
+    case OP_BLAKE2B_KEYED:
+        op_blake2_keyed(ctor_blake2b, 64, 16, 16, fdp);
+        break;
+    case OP_BLAKE2S_KEYED:
+        op_blake2_keyed(ctor_blake2s, 32, 8, 8, fdp);
+        break;
+    // The numeric argument is the maximum digest_size.
+    case OP_BLAKE2B_VARDIGEST:
+        op_blake2_vardigest(ctor_blake2b, 64, fdp);
+        break;
+    case OP_BLAKE2S_VARDIGEST:
+        op_blake2_vardigest(ctor_blake2s, 32, fdp);
+        break;
+    case OP_HMAC_COMPUTE:
+        // No-op when no _hmac.compute_* function was resolved at init.
+        if (num_hmac_compute_funcs > 0) {
+            int fi = fdp.ConsumeIntegralInRange(
+                0, num_hmac_compute_funcs - 1);
+            op_hmac_compute(hmac_compute_funcs[fi], fdp);
+        }
+        break;
+    case OP_PYHMAC_CHAIN: {
+        int ai = fdp.ConsumeIntegralInRange(0, kNumHmacAlgos - 1);
+        op_pyhmac_chain(kHmacAlgos[ai], fdp);
+        break;
+    }
+    case OP_HMAC_DIGEST:
+        op_hmac_digest(fdp);
+        break;
+    case OP_HMAC_COMPARE:
+        op_hmac_compare(fdp);
+        break;
+    case OP_HASHLIB_CHAIN: {
+        int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
+        op_hashlib_chain(kHashlibAlgos[ai], fdp);
+        break;
+    }
+    case OP_HASHLIB_FILE_DIGEST: {
+        int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
+        op_hashlib_file_digest(kHashlibAlgos[ai], fdp);
+        break;
+    }
+    case OP_PBKDF2: {
+        int ai = fdp.ConsumeIntegralInRange(0, kNumPbkdf2Algos - 1);
+        op_pbkdf2(kPbkdf2Algos[ai], fdp);
+        break;
+    }
+    }
+
+    // Run a full garbage collection once every kGcInterval inputs.
+    if (++gc_counter % kGcInterval == 0) PyGC_Collect();
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_dataops.cpp b/module-fuzzers/fuzz_dataops.cpp
new file mode 100644
index 0000000..10afe04
--- /dev/null
+++ b/module-fuzzers/fuzz_dataops.cpp
@@ -0,0 +1,1166 @@
+// fuzz_dataops.cpp — Fuzzer for CPython's data-structure C extension modules.
+//
+// This fuzzer exercises the following CPython C extension modules via
+// their Python API, called through the Python C API from C++:
+//
+//   array    — array(typecode) with frombytes, tobytes, tolist,
+//              reverse, byteswap, append, extend, pop, count,
+//              index, insert, remove, buffer_info, __sizeof__,
+//              __contains__, __iter__, slice ops, comparison,
+//              concatenation, repetition, fromlist
+//   _ctypes  — c_char/c_int/c_double.from_buffer_copy(),
+//              create_string_buffer, (c_char*N).from_buffer_copy,
+//              Structure.from_buffer_copy
+//   mmap     — anonymous mmap: write, find, rfind, read, readline,
+//              seek, resize, move, getitem, setitem, flush, size,
+//              tell, close, context manager
+//   _locale  — strxfrm, strcoll
+//   _dbm     — dbm.open, write, read, keys, delete, iteration
+//   _sqlite3 — connect(':memory:'), execute, executemany,
+//              executescript, complete_statement, create_function,
+//              create_aggregate, set_authorizer, create_collation,
+//              Row factory, blobopen, register_adapter
+//
+// The first byte of fuzz input selects one of 9 operation types. Each
+// operation consumes further bytes via FuzzedDataProvider to parameterize
+// the call (typecode, sub-operation, SQL, key/value splits).
+//
+// All module functions and class constructors are imported once during init
+// and cached as static PyObject* pointers. Two helper classes (Structure
+// subclass, Aggregate class) are defined via PyRun_String at init time.
+// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200
+// iterations. Max input size: 64 KB.
+
+#include "fuzz_helpers.h"
+
+// ---------------------------------------------------------------------------
+// Cached module objects, initialized once.
+// ---------------------------------------------------------------------------
+
+// array
+static PyObject *array_array;
+
+// ctypes
+static PyObject *ct_c_char, *ct_c_int, *ct_c_double;
+static PyObject *ct_create_string_buffer, *ct_sizeof;
+static PyObject *ct_Structure_cls;
+
+// mmap
+static PyObject *mmap_mmap;
+
+// locale
+static PyObject *locale_strxfrm, *locale_strcoll;
+
+// dbm
+static PyObject *dbm_open;
+
+// sqlite3
+static PyObject *sqlite3_connect, *sqlite3_complete_statement;
+static PyObject *sqlite3_register_adapter, *sqlite3_Row;
+static long sqlite3_SQLITE_OK_val;
+static PyObject *sqlite3_Aggregate_cls;
+
+static unsigned long gc_counter = 0;
+
+static int initialized = 0;
+
+// Run `source` (module-level Python code) in a fresh globals dict and return
+// a new reference to the object it binds to `name`.
+//
+// Initialization is fail-fast: any failure prints the pending Python error
+// (when there is one) and aborts the process, so this never returns NULL.
+static PyObject *define_helper_class(const char *source, const char *name) {
+    PyObject *globals = PyDict_New();
+    if (!globals ||
+        PyDict_SetItemString(globals, "__builtins__",
+                             PyEval_GetBuiltins()) < 0) {
+        if (PyErr_Occurred()) PyErr_Print();
+        abort();
+    }
+    PyObject *r = PyRun_String(source, Py_file_input, globals, globals);
+    if (!r) { PyErr_Print(); abort(); }
+    Py_DECREF(r);
+    // PyDict_GetItemString returns a borrowed reference, or NULL when the
+    // key is missing. Check before Py_INCREF, then take our own reference
+    // before the dict is released.
+    PyObject *cls = PyDict_GetItemString(globals, name);
+    if (!cls) abort();
+    Py_INCREF(cls);
+    Py_DECREF(globals);
+    return cls;
+}
+
+// Import and cache every callable and class this fuzzer uses, and define the
+// two Python helper classes. Runs its body only on the first call.
+//
+// NOTE(review): import_attr() comes from fuzz_helpers.h (not shown here);
+// it presumably returns a new reference to module.attr.
+static void init_dataops(void) {
+    if (initialized) return;
+
+    // array
+    array_array = import_attr("array", "array");
+
+    // ctypes
+    ct_c_char = import_attr("ctypes", "c_char");
+    ct_c_int = import_attr("ctypes", "c_int");
+    ct_c_double = import_attr("ctypes", "c_double");
+    ct_create_string_buffer = import_attr("ctypes", "create_string_buffer");
+    ct_sizeof = import_attr("ctypes", "sizeof");
+
+    // ctypes Structure subclass.
+    ct_Structure_cls = define_helper_class(
+        "import ctypes\n"
+        "class _S(ctypes.Structure):\n"
+        "    _fields_ = [('a', ctypes.c_int), ('b', ctypes.c_double)]\n",
+        "_S");
+
+    // mmap
+    mmap_mmap = import_attr("mmap", "mmap");
+
+    // locale
+    locale_strxfrm = import_attr("locale", "strxfrm");
+    locale_strcoll = import_attr("locale", "strcoll");
+
+    // dbm
+    dbm_open = import_attr("dbm", "open");
+
+    // sqlite3
+    sqlite3_connect = import_attr("sqlite3", "connect");
+    sqlite3_complete_statement = import_attr("sqlite3", "complete_statement");
+    sqlite3_register_adapter = import_attr("sqlite3", "register_adapter");
+    sqlite3_Row = import_attr("sqlite3", "Row");
+    {
+        // Cache the integer value of sqlite3.SQLITE_OK.
+        PyObject *v = import_attr("sqlite3", "SQLITE_OK");
+        if (!v) {
+            if (PyErr_Occurred()) PyErr_Print();
+            abort();
+        }
+        sqlite3_SQLITE_OK_val = PyLong_AsLong(v);
+        Py_DECREF(v);
+        // -1 is ambiguous for PyLong_AsLong; it is an error only when an
+        // exception is also set.
+        if (sqlite3_SQLITE_OK_val == -1 && PyErr_Occurred()) {
+            PyErr_Print();
+            abort();
+        }
+    }
+
+    // Aggregate class for sqlite3.
+    sqlite3_Aggregate_cls = define_helper_class(
+        "class _Agg:\n"
+        "    def __init__(self): self.vals = []\n"
+        "    def step(self, v): self.vals.append(v)\n"
+        "    def finalize(self): return len(self.vals)\n",
+        "_Agg");
+
+    // Suppress warnings.
+    PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
+
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Item sizes for array typecodes.
+// The array module stores each typecode as the matching C type, so the item
+// size is platform dependent: 'l'/'L' are C long, which is 8 bytes on LP64
+// platforms, not 4. Sizes are therefore taken from sizeof() instead of being
+// hard-coded; a wrong size makes frombytes() reject the buffer.
+// Unknown typecodes fall back to 1.
+static int typecode_itemsize(char tc) {
+    switch (tc) {
+    case 'b': case 'B': return (int)sizeof(char);
+    case 'h': case 'H': return (int)sizeof(short);
+    case 'i': case 'I': return (int)sizeof(int);
+    case 'l': case 'L': return (int)sizeof(long);
+    case 'q': case 'Q': return (int)sizeof(long long);
+    case 'f': return (int)sizeof(float);
+    case 'd': return (int)sizeof(double);
+    default: return 1;
+    }
+}
+
+// Create an array with the given typecode and aligned data.
+//
+// `data` is truncated to a whole number of items; when it holds less than
+// one item it is zero-padded to exactly one, so the result is never empty.
+// Returns a new reference, or NULL on failure. A frombytes() failure is
+// cleared before returning; a failed array() or bytes construction leaves
+// the Python error set for the caller.
+static PyObject *make_array(char tc, const std::string &data) {
+    int item_sz = typecode_itemsize(tc);
+    size_t aligned_len = (data.size() / item_sz) * item_sz;
+    if (aligned_len == 0) aligned_len = item_sz;
+
+    char tc_str[2] = {tc, '\0'};
+    PyObject *arr = PyObject_CallFunction(array_array, "s", tc_str);
+    if (!arr) return NULL;
+
+    // frombytes with aligned data.
+    std::string aligned = data.substr(0, aligned_len);
+    if (aligned.size() < (size_t)item_sz) {
+        aligned.resize(item_sz, '\0');
+    }
+    PyRef pydata = PyBytes_FromStringAndSize(aligned.data(), aligned.size());
+    if (!pydata) { Py_DECREF(arr); return NULL; }
+    PyRef r = PyObject_CallMethod(arr, "frombytes", "O", (PyObject *)pydata);
+    if (!r) { PyErr_Clear(); Py_DECREF(arr); return NULL; }
+    return arr;
+}
+
+// ---------------------------------------------------------------------------
+// Operations (9 ops).
+// ---------------------------------------------------------------------------
+
+// OP_ARRAY_FROMBYTES: FDP selects typecode, creates array from aligned fuzz
+// data, then calls tobytes/tolist/reverse/byteswap. Exercises the array C
+// module's core buffer and conversion operations.
+static void op_array_frombytes(FuzzedDataProvider &fdp) {
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    const char typecode = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    const size_t take = std::min(fdp.remaining_bytes(), (size_t)10000);
+    std::string payload = fdp.ConsumeBytesAsString(take);
+
+    PyRef arr(make_array(typecode, payload));
+    CHECK(arr);
+
+    // Zero-argument methods, called in this order; each result is dropped
+    // and any error is cleared before the next call.
+    const char *methods[] = {"tobytes", "tolist", "reverse", "byteswap"};
+    for (const char *method : methods) {
+        PyRef result = PyObject_CallMethod(arr, method, NULL);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+}
+
+// OP_ARRAY_METHODS: FDP selects typecode, creates array, then exercises
+// append/extend/pop/count/index/insert/remove/buffer_info/__sizeof__/
+// __contains__/__iter__/len. Exercises the array C module's element ops.
+static void op_array_methods(FuzzedDataProvider &fdp) {
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    const char typecode = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    const size_t take = std::min(fdp.remaining_bytes(), (size_t)10000);
+    std::string payload = fdp.ConsumeBytesAsString(take);
+
+    PyRef arr(make_array(typecode, payload));
+    CHECK(arr);
+
+    // append(0)
+    {
+        PyRef zero_obj = PyLong_FromLong(0);
+        CHECK(zero_obj);
+        PyRef appended =
+            PyObject_CallMethod(arr, "append", "O", (PyObject *)zero_obj);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // extend(arr[0:1])
+    {
+        PyRef head = PySequence_GetSlice(arr, 0, 1);
+        if (!head) {
+            PyErr_Clear();
+        } else {
+            PyRef extended =
+                PyObject_CallMethod(arr, "extend", "O", (PyObject *)head);
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+        }
+    }
+
+    // pop()
+    {
+        PyRef popped = PyObject_CallMethod(arr, "pop", NULL);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // count(arr[0]) and index(arr[0])
+    {
+        PyRef front = PySequence_GetItem(arr, 0);
+        if (!front) {
+            PyErr_Clear();
+        } else {
+            PyRef occurrences =
+                PyObject_CallMethod(arr, "count", "O", (PyObject *)front);
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+            PyRef position =
+                PyObject_CallMethod(arr, "index", "O", (PyObject *)front);
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+        }
+    }
+
+    // insert(0, 42) followed by remove(42)
+    {
+        PyRef marker = PyLong_FromLong(42);
+        CHECK(marker);
+        PyRef inserted =
+            PyObject_CallMethod(arr, "insert", "iO", 0, (PyObject *)marker);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+        PyRef removed =
+            PyObject_CallMethod(arr, "remove", "O", (PyObject *)marker);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // buffer_info() and __sizeof__()
+    {
+        PyRef info = PyObject_CallMethod(arr, "buffer_info", NULL);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+        PyRef mem_size = PyObject_CallMethod(arr, "__sizeof__", NULL);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // arr[0] in arr, then len(arr)
+    {
+        PyRef front = PySequence_GetItem(arr, 0);
+        if (!front) {
+            PyErr_Clear();
+        } else {
+            int found = PySequence_Contains(arr, front);
+            (void)found;
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+        }
+        Py_ssize_t count = PyObject_Length(arr);
+        (void)count;
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+}
+
+// OP_ARRAY_SLICE: FDP selects typecode, creates two arrays, does slice read,
+// slice assignment, concatenation, repetition, comparison. Exercises the
+// array C module's sequence protocol paths.
+static void op_array_slice(FuzzedDataProvider &fdp) {
+    static const char kTypecodes[] = "bBHiIlLfdqQ";
+    const char typecode = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
+    const size_t take = std::min(fdp.remaining_bytes(), (size_t)10000);
+    std::string payload = fdp.ConsumeBytesAsString(take);
+
+    // Two independent arrays built from the same bytes.
+    PyRef lhs(make_array(typecode, payload));
+    CHECK(lhs);
+    PyRef rhs(make_array(typecode, payload));
+    CHECK(rhs);
+
+    // Slice read: lhs[0:min(len, 4)].
+    {
+        Py_ssize_t count = PyObject_Length(lhs);
+        Py_ssize_t upto = count < 4 ? count : 4;
+        PyRef head = PySequence_GetSlice(lhs, 0, upto);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // Extended-slice assignment: lhs[::2] = zero-filled array of equal length.
+    {
+        Py_ssize_t count = PyObject_Length(lhs);
+        if (count > 0) {
+            // lhs[::2] selects ceil(count / 2) items.
+            Py_ssize_t picked = (count + 1) / 2;
+            std::string zero_bytes(picked * typecode_itemsize(typecode), '\0');
+            // make_array performs the same array() + frombytes() sequence on
+            // a buffer that is already a whole number of items.
+            PyRef zeros(make_array(typecode, zero_bytes));
+            if (zeros) {
+                PyRef stride = PyLong_FromLong(2);
+                PyRef every_other = PySlice_New(NULL, NULL, stride);
+                if (every_other) {
+                    int status = PyObject_SetItem(lhs, every_other, zeros);
+                    (void)status;
+                }
+            }
+            if (PyErr_Occurred()) {
+                PyErr_Clear();
+            }
+        }
+    }
+
+    // Concatenation: lhs + rhs.
+    {
+        PyRef joined = PySequence_Concat(lhs, rhs);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // Repetition: lhs * min(len, 3).
+    {
+        Py_ssize_t count = PyObject_Length(lhs);
+        int times = 3;
+        if (count < 3) {
+            times = (int)count;
+        }
+        PyRef repeated = PySequence_Repeat(lhs, times);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+
+    // Comparison: lhs == rhs.
+    {
+        PyRef equal = PyObject_RichCompare(lhs, rhs, Py_EQ);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
+}
+
+// OP_CTYPES: FDP selects sub-op for different ctypes from_buffer_copy calls.
+// Exercises the _ctypes C module's buffer copy and array creation paths.
+static void op_ctypes(FuzzedDataProvider &fdp) {
+    // `variant` picks the ctypes entry point; up to 10000 remaining bytes feed
+    // it. Python errors raised by any variant are cleared after the switch.
+    int variant = fdp.ConsumeIntegralInRange(0, 5);
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    switch (variant) {
+    case 0: {
+        // c_char.from_buffer_copy(1 byte)
+        std::string buf = data.substr(0, 1);
+        if (buf.empty()) buf.push_back('\0');
+        PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(ct_c_char, "from_buffer_copy", "O",
+                                      (PyObject *)pydata);
+        break;
+    }
+    case 1: {
+        // c_int.from_buffer_copy(4 bytes), zero-padded when input is short.
+        std::string buf = data.substr(0, 4);
+        buf.resize(4, '\0');
+        PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(ct_c_int, "from_buffer_copy", "O",
+                                      (PyObject *)pydata);
+        break;
+    }
+    case 2: {
+        // c_double.from_buffer_copy(8 bytes), zero-padded when input is short.
+        std::string buf = data.substr(0, 8);
+        buf.resize(8, '\0');
+        PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(ct_c_double, "from_buffer_copy", "O",
+                                      (PyObject *)pydata);
+        break;
+    }
+    case 3: {
+        // create_string_buffer(data[:256])
+        std::string buf = data.substr(0, 256);
+        PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+        CHECK(pydata);
+        PyRef r = PyObject_CallFunction(ct_create_string_buffer, "O",
+                                        (PyObject *)pydata);
+        break;
+    }
+    case 4: {
+        // (c_char * N).from_buffer_copy(data)
+        if (data.empty()) break;
+        PyRef n = PyLong_FromLong(data.size());
+        CHECK(n);
+        PyRef arr_type = PyNumber_Multiply(ct_c_char, n);
+        CHECK(arr_type);
+        PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(arr_type, "from_buffer_copy", "O",
+                                      (PyObject *)pydata);
+        break;
+    }
+    case 5: {
+        // Structure.from_buffer_copy(padded data)
+        PyRef sz = PyObject_CallFunction(ct_sizeof, "O", ct_Structure_cls);
+        CHECK(sz);
+        long struct_sz = PyLong_AsLong(sz);
+        // PyLong_AsLong() returns -1 on error; a negative size would turn
+        // into a huge size_t and make std::string::resize() throw.
+        if (struct_sz < 0) break;
+        std::string buf = data.substr(0, struct_sz);
+        buf.resize(struct_sz, '\0');
+        PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(ct_Structure_cls, "from_buffer_copy", "O",
+                                      (PyObject *)pydata);
+        break;
+    }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_MMAP: Create an anonymous mmap sized to the fuzz data, write the data,
+// seek back to 0, then run the one action group selected by FDP. The map is
+// closed explicitly on the normal path and on write/seek failure.
+// Exercises the mmap C module's core operations.
+static void op_mmap(FuzzedDataProvider &fdp) {
+    int action = fdp.ConsumeIntegralInRange(0, 5);
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    // A zero-length mapping is avoided by always having at least one byte.
+    if (data.empty()) data.push_back('\0');
+
+    // mmap(-1, size)
+    Py_ssize_t map_size = data.size();
+    PyRef mm = PyObject_CallFunction(mmap_mmap, "in", -1, map_size);
+    CHECK(mm);
+
+    // Write data.
+    PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+    CHECK(pydata);
+    {
+        PyRef r = PyObject_CallMethod(mm, "write", "O", (PyObject *)pydata);
+        if (!r) { PyErr_Clear(); goto cleanup; }
+    }
+
+    // Seek to 0.
+    {
+        PyRef r = PyObject_CallMethod(mm, "seek", "i", 0);
+        if (!r) { PyErr_Clear(); goto cleanup; }
+    }
+
+    // NOTE: no declarations may be added at function scope between the gotos
+    // above and the `cleanup` label; every case keeps its locals in a block.
+    switch (action) {
+    case 0: {
+        // find + rfind, using the first (up to) 4 data bytes as the pattern.
+        size_t pat_len = data.size() < 4 ? data.size() : 4;
+        PyRef pat = PyBytes_FromStringAndSize(data.data(), pat_len);
+        CHECK(pat);
+        {
+            PyRef r = PyObject_CallMethod(mm, "find", "O", (PyObject *)pat);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef r = PyObject_CallMethod(mm, "rfind", "O", (PyObject *)pat);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 1: {
+        // read + readline
+        {
+            long n = map_size < 4 ? map_size : 4;
+            PyRef r = PyObject_CallMethod(mm, "read", "l", n);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef sk = PyObject_CallMethod(mm, "seek", "i", 0);
+            if (PyErr_Occurred()) PyErr_Clear();
+            PyRef r = PyObject_CallMethod(mm, "readline", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 2: {
+        // resize + move
+        long new_size = map_size * 2;
+        if (new_size < 1) new_size = 1;
+        {
+            PyRef r = PyObject_CallMethod(mm, "resize", "l", new_size);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            // move(dest=0, src, count); src/count stay 0 for 1-byte maps.
+            long src = map_size < 2 ? 0 : 1;
+            long count = map_size < 2 ? 0 : (map_size / 2 < new_size / 2 ?
+                                             map_size / 2 : new_size / 2);
+            PyRef r = PyObject_CallMethod(mm, "move", "lll",
+                                          (long)0, src, count);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 3: {
+        // getitem + setitem
+        {
+            PyRef idx = PyLong_FromLong(0);
+            CHECK(idx);
+            PyRef r = PyObject_GetItem(mm, idx);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            Py_ssize_t n = map_size < 4 ? map_size : 4;
+            // PySlice_New() does not steal references, so the bounds are
+            // held in PyRef instead of being created inline and leaked.
+            PyRef lo = PyLong_FromLong(0);
+            CHECK(lo);
+            PyRef hi = PyLong_FromLong(n);
+            CHECK(hi);
+            PyRef sl = PySlice_New(lo, hi, NULL);
+            CHECK(sl);
+            PyRef r = PyObject_GetItem(mm, sl);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        if (data.size() > 0) {
+            PyRef idx = PyLong_FromLong(0);
+            CHECK(idx);
+            PyRef val = PyLong_FromLong((unsigned char)data[0]);
+            CHECK(val);
+            PyObject_SetItem(mm, idx, val);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 4: {
+        // flush + size + tell
+        {
+            PyRef r = PyObject_CallMethod(mm, "flush", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef r = PyObject_CallMethod(mm, "size", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef r = PyObject_CallMethod(mm, "tell", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 5: {
+        // read all
+        {
+            PyRef r = PyObject_CallMethod(mm, "read", NULL);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    }
+
+cleanup:
+    {
+        PyRef r = PyObject_CallMethod(mm, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+}
+
+// OP_LOCALE: FDP selects strxfrm or strcoll. Exercises the _locale C module.
+// For strcoll the decoded string is split at its midpoint and the two halves
+// are compared; for strxfrm the whole string is transformed.
+static void op_locale(FuzzedDataProvider &fdp) {
+    int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+    bool use_strcoll = fdp.ConsumeBool();
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+    CHECK(pystr);
+
+    if (use_strcoll) {
+        Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+        Py_ssize_t mid = slen / 2;
+        PyRef half1 = PyUnicode_Substring(pystr, 0, mid);
+        CHECK(half1);
+        PyRef half2 = PyUnicode_Substring(pystr, mid, slen);
+        CHECK(half2);
+        PyRef r = PyObject_CallFunction(locale_strcoll, "OO",
+                                        (PyObject *)half1, (PyObject *)half2);
+    } else {
+        PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr);
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// Best-effort removal of the on-disk files a dbm backend may have created
+// for `path`. Which suffixes exist depends on the backend in use, so every
+// commonly used one is tried and failures (e.g. file not present) are ignored.
+static void remove_dbm_files(const char *path) {
+    static const char *const kSuffixes[] = {
+        "", ".db", ".dir", ".pag", ".dat", ".bak",
+    };
+    for (const char *suffix : kSuffixes) {
+        std::string victim = std::string(path) + suffix;
+        (void)remove(victim.c_str());
+    }
+}
+
+// OP_DBM: Open a fresh file-backed dbm database under /tmp (flag "n"), write
+// up to 16 two-byte-key/two-byte-value pairs taken from the first 64 fuzz
+// bytes, read every key back, test membership, close, and delete the files.
+// Exercises the dbm C extension module's storage operations.
+static void op_dbm(FuzzedDataProvider &fdp) {
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+
+    // Use a unique filename based on the gc counter to avoid conflicts.
+    char dbpath[64];
+    snprintf(dbpath, sizeof(dbpath), "/tmp/_fuzz_dbm_%lu", gc_counter);
+
+    PyRef db = PyObject_CallFunction(dbm_open, "ss", dbpath, "n");
+    CHECK(db);
+
+    // Write key-value pairs from fuzz data: each 4-byte group is key[2]+val[2].
+    size_t limit = data.size() < 64 ? data.size() : 64;
+    for (size_t i = 0; i + 3 < limit; i += 4) {
+        PyRef key = PyBytes_FromStringAndSize(data.data() + i, 2);
+        if (!key) { PyErr_Clear(); continue; }
+        PyRef val = PyBytes_FromStringAndSize(data.data() + i + 2, 2);
+        if (!val) { PyErr_Clear(); continue; }
+        int r = PyObject_SetItem(db, key, val);
+        (void)r;
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Read keys.
+    {
+        PyRef keys = PyObject_CallMethod(db, "keys", NULL);
+        if (keys) {
+            PyRef it = PyObject_GetIter(keys);
+            if (it) {
+                PyObject *k;
+                while ((k = PyIter_Next(it)) != NULL) {
+                    PyRef val = PyObject_GetItem(db, k);
+                    Py_DECREF(k);
+                    if (PyErr_Occurred()) PyErr_Clear();
+                }
+            }
+        }
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Check membership.
+    {
+        PyRef test_key = PyBytes_FromStringAndSize("k", 1);
+        if (test_key) {
+            int r = PySequence_Contains(db, test_key);
+            (void)r;
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+    }
+
+    // Close.
+    {
+        PyRef r = PyObject_CallMethod(db, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+    }
+
+    // Every iteration uses a new path, so without this the database files
+    // would pile up in /tmp for the lifetime of the fuzzing run.
+    remove_dbm_files(dbpath);
+}
+
+// Helper: Create a memory connection with PRAGMA max_page_count=100.
+// Returns a new reference, or NULL (with the Python error cleared) when the
+// PRAGMA fails; when connect() itself fails NULL is returned with the error
+// still set.
+static PyObject *make_sqlite_conn() {
+    PyObject *conn = PyObject_CallFunction(sqlite3_connect, "s", ":memory:");
+    if (!conn) return NULL;
+    PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                  "PRAGMA max_page_count=100");
+    if (!r) {
+        PyErr_Clear();
+        Py_DECREF(conn);
+        return NULL;
+    }
+    return conn;
+}
+
+// OP_SQLITE3_BASIC: connect(':memory:'), then FDP selects: execute fuzz SQL,
+// parameterized queries, executemany, executescript, complete_statement.
+// Exercises the _sqlite3 C module's basic execution paths.
+static void op_sqlite3_basic(FuzzedDataProvider &fdp) {
+    int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+    int variant = fdp.ConsumeIntegralInRange(0, 4);
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+    CHECK(pystr);
+
+    PyRef conn(make_sqlite_conn());
+    CHECK(conn);
+
+    switch (variant) {
+    case 0: {
+        // Execute fuzz SQL.
+        PyRef r = PyObject_CallMethod(conn, "execute", "O", (PyObject *)pystr);
+        break;
+    }
+    case 1: {
+        // Parameterized INSERT and SELECT.
+        PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+        CHECK(pydata);
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a TEXT, b BLOB)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef params = PyTuple_Pack(2, (PyObject *)pystr, (PyObject *)pydata);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?, ?)",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            // SELECT, using the first 32 characters as the LIKE pattern.
+            PyRef sub = PyUnicode_Substring(pystr, 0, 32);
+            if (!sub) { PyErr_Clear(); break; }
+            PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "SELECT * FROM t WHERE a LIKE ?",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    case 2: {
+        // executemany.
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(v INTEGER)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            // One single-column row per byte, for at most 64 bytes.
+            PyRef rows = PyList_New(0);
+            CHECK(rows);
+            size_t limit = data.size() < 64 ? data.size() : 64;
+            for (size_t i = 0; i < limit; i++) {
+                PyRef val = PyLong_FromLong((unsigned char)data[i]);
+                // PyTuple_Pack() must not be handed a NULL item.
+                if (!val) { PyErr_Clear(); continue; }
+                PyRef tup = PyTuple_Pack(1, (PyObject *)val);
+                if (tup) PyList_Append(rows, tup);
+            }
+            PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)rows);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                "SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t");
+            if (cur) {
+                PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 3: {
+        // executescript; an empty fuzz string is replaced by "SELECT 1;".
+        Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+        PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
+        if (!sql) {
+            PyRef def = PyUnicode_FromString("SELECT 1;");
+            PyRef r = PyObject_CallMethod(conn, "executescript", "O",
+                                          (PyObject *)def);
+        } else {
+            PyRef r = PyObject_CallMethod(conn, "executescript", "O", sql);
+        }
+        break;
+    }
+    case 4: {
+        // complete_statement; an empty fuzz string is replaced by "SELECT 1;".
+        Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+        PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
+        if (!sql) {
+            PyRef def = PyUnicode_FromString("SELECT 1;");
+            PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O",
+                                            (PyObject *)def);
+        } else {
+            PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O", sql);
+        }
+        break;
+    }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef cl = PyObject_CallMethod(conn, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_SQLITE3_ADVANCED: connect(':memory:'), then FDP selects: create_function,
+// create_aggregate, set_authorizer, create_collation, Row factory, blobopen,
+// register_adapter. Exercises the _sqlite3 C module's advanced features.
+// Inside a case, `break` leads to the explicit conn.close() at the end of the
+// function.
+static void op_sqlite3_advanced(FuzzedDataProvider &fdp) {
+    int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+    int variant = fdp.ConsumeIntegralInRange(0, 6);
+    std::string data = fdp.ConsumeBytesAsString(
+        std::min(fdp.remaining_bytes(), (size_t)10000));
+    PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+    CHECK(pystr);
+
+    PyRef conn(make_sqlite_conn());
+    CHECK(conn);
+
+    switch (variant) {
+    case 0: {
+        // create_function + SELECT.
+        PyRef globals = PyDict_New();
+        CHECK(globals);
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyRef fn = PyRun_String("lambda x: x", Py_eval_input, globals, globals);
+        CHECK(fn);
+        {
+            PyRef r = PyObject_CallMethod(conn, "create_function", "siO",
+                                          "fuzzfn", 1, (PyObject *)fn);
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a TEXT)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef sub = PyUnicode_Substring(pystr, 0, 32);
+            if (!sub) { PyErr_Clear(); break; }
+            PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                            "SELECT fuzzfn(a) FROM t");
+            if (cur) {
+                PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 1: {
+        // create_aggregate + SELECT.
+        {
+            PyRef r = PyObject_CallMethod(conn, "create_aggregate", "siO",
+                                          "fuzzagg", 1,
+                                          sqlite3_Aggregate_cls);
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(v INTEGER)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            // One single-column row per byte, for at most 32 bytes.
+            PyRef rows = PyList_New(0);
+            CHECK(rows);
+            size_t limit = data.size() < 32 ? data.size() : 32;
+            for (size_t i = 0; i < limit; i++) {
+                PyRef val = PyLong_FromLong((unsigned char)data[i]);
+                // PyTuple_Pack() must not be handed a NULL item.
+                if (!val) { PyErr_Clear(); continue; }
+                PyRef tup = PyTuple_Pack(1, (PyObject *)val);
+                if (tup) PyList_Append(rows, tup);
+            }
+            PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)rows);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                            "SELECT fuzzagg(v) FROM t");
+            if (cur) {
+                PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 2: {
+        // set_authorizer + SELECT. The authorizer is a lambda that ignores
+        // its arguments and returns the cached SQLITE_OK value.
+        PyRef globals = PyDict_New();
+        CHECK(globals);
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyRef code_str = PyUnicode_FromFormat(
+            "lambda *a: %ld", sqlite3_SQLITE_OK_val);
+        CHECK(code_str);
+        // PyUnicode_AsUTF8() can fail; PyRun_String() must not receive NULL.
+        const char *code_utf8 = PyUnicode_AsUTF8(code_str);
+        if (!code_utf8) { PyErr_Clear(); break; }
+        PyRef auth_fn = PyRun_String(code_utf8,
+                                     Py_eval_input, globals, globals);
+        CHECK(auth_fn);
+        {
+            PyRef r = PyObject_CallMethod(conn, "set_authorizer", "O",
+                                          (PyObject *)auth_fn);
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a TEXT)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef sub = PyUnicode_Substring(pystr, 0, 16);
+            if (!sub) { PyErr_Clear(); break; }
+            PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                            "SELECT * FROM t");
+            if (cur) {
+                PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 3: {
+        // create_collation + ORDER BY.
+        PyRef globals = PyDict_New();
+        CHECK(globals);
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyRef coll_fn = PyRun_String(
+            "lambda a, b: (a > b) - (a < b)",
+            Py_eval_input, globals, globals);
+        CHECK(coll_fn);
+        {
+            PyRef r = PyObject_CallMethod(conn, "create_collation", "sO",
+                                          "fuzz", (PyObject *)coll_fn);
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a TEXT)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef params = PyTuple_Pack(1, (PyObject *)pystr);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                "SELECT * FROM t ORDER BY a COLLATE fuzz");
+            if (cur) {
+                PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 4: {
+        // Row factory + SELECT.
+        if (PyObject_SetAttrString(conn, "row_factory", sqlite3_Row) < 0) {
+            PyErr_Clear();
+            break;
+        }
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a TEXT, b INTEGER)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef sub = PyUnicode_Substring(pystr, 0, 8);
+            if (!sub) { PyErr_Clear(); break; }
+            // PyTuple_Pack() takes its own references, so the integer is
+            // owned by a PyRef rather than created inline and leaked.
+            PyRef forty_two = PyLong_FromLong(42);
+            if (!forty_two) { PyErr_Clear(); break; }
+            PyRef params = PyTuple_Pack(2, (PyObject *)sub,
+                                        (PyObject *)forty_two);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?, ?)",
+                                          (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                            "SELECT * FROM t");
+            if (cur) {
+                PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                if (row && row.p != Py_None) {
+                    // Column lookup by name. PyObject_GetItem() borrows the
+                    // key, so each key string is owned by a PyRef.
+                    PyRef key_a = PyUnicode_FromString("a");
+                    if (key_a) {
+                        PyRef a = PyObject_GetItem(row, key_a);
+                    }
+                    if (PyErr_Occurred()) PyErr_Clear();
+                    PyRef key_b = PyUnicode_FromString("b");
+                    if (key_b) {
+                        PyRef b = PyObject_GetItem(row, key_b);
+                    }
+                    if (PyErr_Occurred()) PyErr_Clear();
+                    PyRef keys = PyObject_CallMethod(row, "keys", NULL);
+                    if (PyErr_Occurred()) PyErr_Clear();
+                }
+                if (PyErr_Occurred()) PyErr_Clear();
+            } else {
+                PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 5: {
+        // blobopen + read/write.
+        {
+            PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                          "CREATE TABLE t(a BLOB)");
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            std::string blob_data = data.substr(0, 64);
+            PyRef pydata = PyBytes_FromStringAndSize(blob_data.data(),
+                                                     blob_data.size());
+            CHECK(pydata);
+            PyRef params = PyTuple_Pack(1, (PyObject *)pydata);
+            CHECK(params);
+            PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                          "INSERT INTO t VALUES(?)",
+                                          (PyObject *)params);
+            if (!r) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                            "SELECT rowid FROM t");
+            if (!cur) { PyErr_Clear(); break; }
+            PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+            if (!row || row.p == Py_None) { PyErr_Clear(); break; }
+            PyRef rid = PySequence_GetItem(row, 0);
+            CHECK(rid);
+            PyRef blob = PyObject_CallMethod(conn, "blobopen", "sssO",
+                                             "main", "t", "a", (PyObject *)rid);
+            if (!blob) { PyErr_Clear(); break; }
+            {
+                PyRef rd = PyObject_CallMethod(blob, "read", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            }
+            {
+                PyRef sk = PyObject_CallMethod(blob, "seek", "i", 0);
+                if (PyErr_Occurred()) PyErr_Clear();
+            }
+            {
+                size_t wr_len = data.size() < 64 ? data.size() : 64;
+                PyRef wr_data = PyBytes_FromStringAndSize(data.data(), wr_len);
+                if (wr_data) {
+                    PyRef wr = PyObject_CallMethod(blob, "write", "O",
+                                                   (PyObject *)wr_data);
+                    if (PyErr_Occurred()) PyErr_Clear();
+                }
+            }
+            {
+                PyRef cl = PyObject_CallMethod(blob, "close", NULL);
+                if (PyErr_Occurred()) PyErr_Clear();
+            }
+        }
+        break;
+    }
+    case 6: {
+        // register_adapter.
+        // NOTE(review): a fresh _AdaptMe class is registered on every call;
+        // presumably sqlite3 keeps each registration alive — confirm whether
+        // this growth matters for long fuzzing runs.
+        PyRef globals = PyDict_New();
+        CHECK(globals);
+        PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+        PyRef r = PyRun_String(
+            "class _AdaptMe:\n"
+            "    def __init__(self, v): self.v = v\n",
+            Py_file_input, globals, globals);
+        CHECK(r);
+        // PyDict_GetItemString() returns a borrowed reference or NULL, so the
+        // incref must tolerate NULL before CHECK() gets to test it.
+        PyRef adapt_cls = PyRef(PyDict_GetItemString(globals, "_AdaptMe"));
+        Py_XINCREF(adapt_cls.p);
+        CHECK(adapt_cls);
+
+        PyRef adapter_fn = PyRun_String(
+            "lambda a: str(a.v)", Py_eval_input, globals, globals);
+        CHECK(adapter_fn);
+
+        {
+            PyRef reg = PyObject_CallFunction(sqlite3_register_adapter, "OO",
+                                              (PyObject *)adapt_cls,
+                                              (PyObject *)adapter_fn);
+            if (!reg) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef r2 = PyObject_CallMethod(conn, "execute", "s",
+                                           "CREATE TABLE t(a TEXT)");
+            if (!r2) { PyErr_Clear(); break; }
+        }
+        {
+            PyRef sub = PyUnicode_Substring(pystr, 0, 8);
+            if (!sub) { PyErr_Clear(); break; }
+            PyRef obj = PyObject_CallFunction(adapt_cls, "O", (PyObject *)sub);
+            if (!obj) { PyErr_Clear(); break; }
+            PyRef params = PyTuple_Pack(1, (PyObject *)obj);
+            CHECK(params);
+            PyRef r3 = PyObject_CallMethod(conn, "execute", "sO",
+                                           "INSERT INTO t VALUES(?)",
+                                           (PyObject *)params);
+            if (PyErr_Occurred()) PyErr_Clear();
+        }
+        break;
+    }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef cl = PyObject_CallMethod(conn, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// --------------------------------------------------------------------------- + +enum Op { + OP_ARRAY_FROMBYTES, + OP_ARRAY_METHODS, + OP_ARRAY_SLICE, + OP_CTYPES, + OP_MMAP, + OP_LOCALE, + OP_DBM, + OP_SQLITE3_BASIC, + OP_SQLITE3_ADVANCED, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_dataops(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_ARRAY_FROMBYTES: + op_array_frombytes(fdp); + break; + case OP_ARRAY_METHODS: + op_array_methods(fdp); + break; + case OP_ARRAY_SLICE: + op_array_slice(fdp); + break; + case OP_CTYPES: + op_ctypes(fdp); + break; + case OP_MMAP: + op_mmap(fdp); + break; + case OP_LOCALE: + op_locale(fdp); + break; + case OP_DBM: + op_dbm(fdp); + break; + case OP_SQLITE3_BASIC: + op_sqlite3_basic(fdp); + break; + case OP_SQLITE3_ADVANCED: + op_sqlite3_advanced(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_decode.cpp b/module-fuzzers/fuzz_decode.cpp new file mode 100644 index 0000000..234a265 --- /dev/null +++ b/module-fuzzers/fuzz_decode.cpp @@ -0,0 +1,1029 @@ +// fuzz_decode.cpp — Fuzzer for CPython's compression, encoding, serialization, +// and certificate-parsing C extension modules. 
+// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// zlib — compress/decompress (one-shot and streaming via +// compressobj/decompressobj with wbits, zdict, copy, +// flush), crc32, adler32 +// _bz2 — BZ2Decompressor.decompress(), bz2.compress() +// _lzma — LZMADecompressor.decompress() with FORMAT_AUTO/XZ/ALONE +// and 16 MB memlimit, lzma.compress() +// binascii — 6 decoders: a2b_base64 (with strict_mode), a2b_hex, +// a2b_uu, a2b_qp, a2b_ascii85, a2b_base85 +// 6 encoders: b2a_base64 (with newline), b2a_hex, +// b2a_uu (clamped to 45 bytes), b2a_qp, +// b2a_ascii85 (with foldspaces/wrapcol), b2a_base85 +// Checksums: crc32, crc_hqx +// Round-trip: hexlify -> unhexlify +// _pickle — pickle.dumps() with 8 container types (bytes, str, +// list, tuple, set, frozenset, bytearray, dict) across +// protocols 0-5 and fix_imports flag. +// pickle.loads() via RestrictedUnpickler (blocks +// find_class), PersistentUnpickler (handles PERSID/ +// BINPERSID), and RestrictedUnpickler with +// encoding='bytes'. +// Pickler chain: dump, clear_memo, dump, getvalue. +// Round-trip: dumps then loads. +// _ssl — ssl.DER_cert_to_PEM_cert(), then optionally +// SSLContext(PROTOCOL_TLS_CLIENT).load_verify_locations() +// _multibytecodec, +// _codecs_jp, _codecs_cn, _codecs_kr, +// _codecs_hk, _codecs_tw, _codecs_iso2022 +// — codecs.decode() with 17 codecs including shift_jis, +// euc-jp, gb2312, big5, gb18030, iso-2022-jp, etc. +// codecs.encode() with 19 codecs. +// Incremental decoders (shift_jis, gb18030, utf-16): +// split input at midpoint, decode halves, getstate, reset. +// Incremental encoders (shift_jis, utf-8): +// split string at midpoint, encode, reset, getstate. +// StreamReader: codecs.getreader('utf-8')(BytesIO).read() +// +// The first byte of fuzz input selects one of 20 operation types. 
Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (algorithm/codec selection, compression level, protocol number, +// container type, wbits value, boolean flags, data splits). +// +// All module functions, constructors, and format constants are imported once +// during init and cached as static PyObject* and long pointers. Two pickle +// Unpickler subclasses (RestrictedUnpickler, PersistentUnpickler) are defined +// via PyRun_String at init time and cached as class objects. +// +// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200 +// iterations. Max input size: 1 MB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. +// --------------------------------------------------------------------------- + +// zlib +static PyObject *zlib_compress, *zlib_decompress; +static PyObject *zlib_decompressobj, *zlib_compressobj; +static PyObject *zlib_crc32, *zlib_adler32; + +// bz2 +static PyObject *bz2_compress, *bz2_BZ2Decompressor; + +// lzma +static PyObject *lzma_LZMADecompressor, *lzma_compress; +static long lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val; + +// binascii +static PyObject *ba_a2b_base64, *ba_a2b_hex, *ba_a2b_uu, *ba_a2b_qp; +static PyObject *ba_a2b_ascii85, *ba_a2b_base85; +static PyObject *ba_b2a_base64, *ba_b2a_hex, *ba_b2a_uu, *ba_b2a_qp; +static PyObject *ba_b2a_ascii85, *ba_b2a_base85; +static PyObject *ba_crc32, *ba_crc_hqx, *ba_hexlify, *ba_unhexlify; + +// pickle +static PyObject *pickle_dumps, *pickle_loads; + +// codecs +static PyObject *codecs_decode, *codecs_encode; +static PyObject *codecs_getincrementaldecoder, *codecs_getincrementalencoder; +static PyObject *codecs_getreader; + +// ssl +static PyObject *ssl_DER_cert_to_PEM_cert, *ssl_SSLContext; +static long ssl_PROTOCOL_TLS_CLIENT_val; + +// io +static PyObject *bytesio_ctor; + +// pickle helper classes +static 
PyObject *RestrictedUnpickler_cls, *PersistentUnpickler_cls; + +static unsigned long gc_counter = 0; + +static int initialized = 0; + +static void init_decode(void) { + if (initialized) return; + + // zlib + zlib_compress = import_attr("zlib", "compress"); + zlib_decompress = import_attr("zlib", "decompress"); + zlib_decompressobj = import_attr("zlib", "decompressobj"); + zlib_compressobj = import_attr("zlib", "compressobj"); + zlib_crc32 = import_attr("zlib", "crc32"); + zlib_adler32 = import_attr("zlib", "adler32"); + + // bz2 + bz2_compress = import_attr("bz2", "compress"); + bz2_BZ2Decompressor = import_attr("bz2", "BZ2Decompressor"); + + // lzma + lzma_LZMADecompressor = import_attr("lzma", "LZMADecompressor"); + lzma_compress = import_attr("lzma", "compress"); + { + PyObject *v; + v = import_attr("lzma", "FORMAT_AUTO"); + lzma_FORMAT_AUTO_val = PyLong_AsLong(v); + Py_DECREF(v); + v = import_attr("lzma", "FORMAT_XZ"); + lzma_FORMAT_XZ_val = PyLong_AsLong(v); + Py_DECREF(v); + v = import_attr("lzma", "FORMAT_ALONE"); + lzma_FORMAT_ALONE_val = PyLong_AsLong(v); + Py_DECREF(v); + } + + // binascii + ba_a2b_base64 = import_attr("binascii", "a2b_base64"); + ba_a2b_hex = import_attr("binascii", "a2b_hex"); + ba_a2b_uu = import_attr("binascii", "a2b_uu"); + ba_a2b_qp = import_attr("binascii", "a2b_qp"); + ba_a2b_ascii85 = import_attr("binascii", "a2b_ascii85"); + ba_a2b_base85 = import_attr("binascii", "a2b_base85"); + ba_b2a_base64 = import_attr("binascii", "b2a_base64"); + ba_b2a_hex = import_attr("binascii", "b2a_hex"); + ba_b2a_uu = import_attr("binascii", "b2a_uu"); + ba_b2a_qp = import_attr("binascii", "b2a_qp"); + ba_b2a_ascii85 = import_attr("binascii", "b2a_ascii85"); + ba_b2a_base85 = import_attr("binascii", "b2a_base85"); + ba_crc32 = import_attr("binascii", "crc32"); + ba_crc_hqx = import_attr("binascii", "crc_hqx"); + ba_hexlify = import_attr("binascii", "hexlify"); + ba_unhexlify = import_attr("binascii", "unhexlify"); + + // pickle + pickle_dumps = 
import_attr("pickle", "dumps"); + pickle_loads = import_attr("pickle", "loads"); + + // codecs + codecs_decode = import_attr("codecs", "decode"); + codecs_encode = import_attr("codecs", "encode"); + codecs_getincrementaldecoder = import_attr("codecs", + "getincrementaldecoder"); + codecs_getincrementalencoder = import_attr("codecs", + "getincrementalencoder"); + codecs_getreader = import_attr("codecs", "getreader"); + + // ssl + ssl_DER_cert_to_PEM_cert = import_attr("ssl", "DER_cert_to_PEM_cert"); + ssl_SSLContext = import_attr("ssl", "SSLContext"); + { + PyObject *v = import_attr("ssl", "PROTOCOL_TLS_CLIENT"); + ssl_PROTOCOL_TLS_CLIENT_val = PyLong_AsLong(v); + Py_DECREF(v); + } + + // io + bytesio_ctor = import_attr("io", "BytesIO"); + + // Suppress warnings. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + + // Pickle helper classes via PyRun_String. + { + PyObject *globals = PyDict_New(); + PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); + PyObject *r = PyRun_String( + "import pickle, io\n" + "class RestrictedUnpickler(pickle.Unpickler):\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n" + "class PersistentUnpickler(pickle.Unpickler):\n" + " def persistent_load(self, pid): return pid\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n", + Py_file_input, globals, globals); + if (!r) { + PyErr_Print(); + abort(); + } + Py_DECREF(r); + RestrictedUnpickler_cls = + PyDict_GetItemString(globals, "RestrictedUnpickler"); + Py_INCREF(RestrictedUnpickler_cls); + PersistentUnpickler_cls = + PyDict_GetItemString(globals, "PersistentUnpickler"); + Py_INCREF(PersistentUnpickler_cls); + Py_DECREF(globals); + } + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// --------------------------------------------------------------------------- +// Operations — Compression (6 ops) +// 
--------------------------------------------------------------------------- + +// OP_ZLIB_DECOMPRESS: Create a zlib.decompressobj with fuzz-chosen wbits +// from {-15 (raw), 0 (auto), 15 (zlib), 31 (gzip), 47 (auto-detect)} and +// an optional zdict (first 32 bytes of data). Call .decompress(data, 1MB), +// optionally .flush(), and optionally .copy() + decompress on the copy. +// Exercises Decomp_Type, zlib_Decompress_decompress, copy, flush paths. +static void op_zlib_decompress(FuzzedDataProvider &fdp) { + static const int kWbitsChoices[] = {-15, 0, 15, 31, 47}; + int wbits = kWbitsChoices[fdp.ConsumeIntegralInRange(0, 4)]; + bool use_zdict = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef wbits_obj = PyLong_FromLong(wbits); + CHECK(wbits_obj); + PyRef args_dobj = PyTuple_Pack(1, (PyObject *)wbits_obj); + CHECK(args_dobj); + + if (use_zdict && data.size() > 32) { + PyRef zdict = PyBytes_FromStringAndSize(data.data(), 32); + CHECK(zdict); + PyDict_SetItemString(kwargs, "zdict", zdict); + data = data.substr(32); + } + + PyRef dobj = PyObject_Call(zlib_decompressobj, args_dobj, kwargs); + CHECK(dobj); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (!r) { + PyErr_Clear(); + return; + } + + if (fdp.remaining_bytes() > 0 || data.size() % 2 == 0) { + PyRef flush_r = PyObject_CallMethod(dobj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + + if (data.size() % 3 == 0) { + PyRef copy_obj = PyObject_CallMethod(dobj, "copy", NULL); + if (copy_obj) { + PyRef r2 = PyObject_CallMethod(copy_obj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } +} + +// OP_ZLIB_COMPRESS: Either one-shot zlib.compress(data, level) or streaming +// via compressobj(level).compress(data).flush(), with 
optional .copy().flush(). +// Level is fuzz-chosen 0-9. Exercises Compress_Type and zlib_compress_impl. +static void op_zlib_compress(FuzzedDataProvider &fdp) { + int level = fdp.ConsumeIntegralInRange(0, 9); + bool use_obj = fdp.ConsumeBool(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + if (use_obj) { + PyRef cobj = PyObject_CallFunction(zlib_compressobj, "i", level); + CHECK(cobj); + PyRef r1 = PyObject_CallMethod(cobj, "compress", "O", + (PyObject *)pydata); + CHECK(r1); + if (data.size() % 2 == 0) { + PyRef copy_obj = PyObject_CallMethod(cobj, "copy", NULL); + if (copy_obj) { + PyRef r2 = PyObject_CallMethod(copy_obj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + PyRef r3 = PyObject_CallMethod(cobj, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyRef r = PyObject_CallFunction(zlib_compress, "Oi", + (PyObject *)pydata, level); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_ZLIB_CHECKSUM: Call either zlib.crc32(data) or zlib.adler32(data), +// fuzz-chosen. Exercises the checksum C implementations in zlibmodule.c. +static void op_zlib_checksum(FuzzedDataProvider &fdp) { + bool use_crc = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallFunction( + use_crc ? zlib_crc32 : zlib_adler32, "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_BZ2: Either bz2.compress(data) or BZ2Decompressor().decompress(data, 1MB), +// fuzz-chosen. Exercises the _bz2 C extension (BZ2Compressor/BZ2Decompressor). 
+static void op_bz2(FuzzedDataProvider &fdp) { + bool do_compress = fdp.ConsumeBool(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + if (do_compress) { + PyRef r = PyObject_CallFunction(bz2_compress, "O", + (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyRef dobj = PyObject_CallFunction(bz2_BZ2Decompressor, NULL); + CHECK(dobj); + PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_LZMA_DECOMPRESS: Create LZMADecompressor with fuzz-chosen format from +// {FORMAT_AUTO, FORMAT_XZ, FORMAT_ALONE} and 16 MB memlimit, then call +// .decompress(data, 1MB). Exercises the _lzma C extension decompressor. +static void op_lzma_decompress(FuzzedDataProvider &fdp) { + long fmt_vals[] = { + lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val, + }; + long fmt = fmt_vals[fdp.ConsumeIntegralInRange(0, 2)]; + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef fmt_obj = PyLong_FromLong(fmt); + CHECK(fmt_obj); + PyDict_SetItemString(kwargs, "format", fmt_obj); + PyRef memlimit = PyLong_FromLong(16 * 1024 * 1024); + CHECK(memlimit); + PyDict_SetItemString(kwargs, "memlimit", memlimit); + + PyRef empty_args = PyTuple_New(0); + CHECK(empty_args); + PyRef dobj = PyObject_Call(lzma_LZMADecompressor, empty_args, kwargs); + CHECK(dobj); + + PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", + (PyObject *)pydata, 1048576); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_LZMA_COMPRESS: One-shot lzma.compress(data). Exercises the _lzma +// C extension compressor with default settings. 
+static void op_lzma_compress(FuzzedDataProvider &fdp) { + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallFunction(lzma_compress, "O", (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Operations — Binascii (4 ops) +// --------------------------------------------------------------------------- + +// OP_BINASCII_DECODE: Call one of 6 binary-to-binary decoders from the +// binascii C module: a2b_base64 (with optional strict_mode=True), a2b_hex, +// a2b_uu, a2b_qp, a2b_ascii85, a2b_base85. Fuzz selects which decoder. +static void op_binascii_decode(FuzzedDataProvider &fdp) { + int which = fdp.ConsumeIntegralInRange(0, 5); + bool strict = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + PyObject *funcs[] = { + ba_a2b_base64, ba_a2b_hex, ba_a2b_uu, + ba_a2b_qp, ba_a2b_ascii85, ba_a2b_base85, + }; + + if (which == 0 && strict) { + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "strict_mode", Py_True); + PyRef args = PyTuple_Pack(1, (PyObject *)pydata); + CHECK(args); + PyRef r = PyObject_Call(ba_a2b_base64, args, kwargs); + } else { + PyRef r = PyObject_CallFunction(funcs[which], "O", + (PyObject *)pydata); + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_BINASCII_ENCODE: Call one of 6 binary-to-text encoders from the +// binascii C module: b2a_base64 (with optional newline kwarg), b2a_hex, +// b2a_uu (input clamped to 45 bytes), b2a_qp, b2a_ascii85 (with optional +// foldspaces and wrapcol=72), b2a_base85. Fuzz selects which encoder. 
+static void op_binascii_encode(FuzzedDataProvider &fdp) { + int which = fdp.ConsumeIntegralInRange(0, 5); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + // b2a_uu requires <= 45 bytes. + if (which == 2 && data.size() > 45) data.resize(45); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + PyObject *funcs[] = { + ba_b2a_base64, ba_b2a_hex, ba_b2a_uu, + ba_b2a_qp, ba_b2a_ascii85, ba_b2a_base85, + }; + + if (which == 0) { + // b2a_base64 with optional newline kwarg. + bool newline = fdp.ConsumeBool(); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "newline", newline ? Py_True : Py_False); + PyRef args = PyTuple_Pack(1, (PyObject *)pydata); + CHECK(args); + PyRef r = PyObject_Call(ba_b2a_base64, args, kwargs); + } else if (which == 4) { + // b2a_ascii85 with optional foldspaces/wrapcol. + bool foldspaces = fdp.ConsumeBool(); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + if (foldspaces) + PyDict_SetItemString(kwargs, "foldspaces", Py_True); + PyRef wrapcol = PyLong_FromLong(72); + CHECK(wrapcol); + PyDict_SetItemString(kwargs, "wrapcol", wrapcol); + PyRef args = PyTuple_Pack(1, (PyObject *)pydata); + CHECK(args); + PyRef r = PyObject_Call(ba_b2a_ascii85, args, kwargs); + } else { + PyRef r = PyObject_CallFunction(funcs[which], "O", + (PyObject *)pydata); + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_BINASCII_CHECKSUM: Call either binascii.crc32(data) or +// binascii.crc_hqx(data, 0), fuzz-chosen. 
+static void op_binascii_checksum(FuzzedDataProvider &fdp) { + bool use_crc32 = fdp.ConsumeBool(); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + if (use_crc32) { + PyRef r = PyObject_CallFunction(ba_crc32, "O", (PyObject *)pydata); + } else { + PyRef r = PyObject_CallFunction(ba_crc_hqx, "Oi", + (PyObject *)pydata, 0); + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_BINASCII_ROUNDTRIP: binascii.hexlify(data) then binascii.unhexlify() +// on the result. Exercises both directions of hex encoding. +static void op_binascii_roundtrip(FuzzedDataProvider &fdp) { + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef hexed = PyObject_CallFunction(ba_hexlify, "O", + (PyObject *)pydata); + CHECK(hexed); + PyRef r = PyObject_CallFunction(ba_unhexlify, "O", (PyObject *)hexed); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Operations — Pickle (4 ops) +// --------------------------------------------------------------------------- + +// Build a Python container from fuzz bytes for pickle.dumps operations. +// type selects: 0=bytes, 1=str, 2=list of ints, 3=tuple of ints, +// 4=set of ints, 5=frozenset of ints, 6=bytearray, 7=dict(int->None). +// Capped at 256 elements to keep serialization fast. +// str_enc selects the byte-to-str decoding (see fuzz_bytes_to_str). 
+static PyObject *build_pickle_container(int type, const uint8_t *buf, + size_t len, int str_enc) { + if (len > 256) len = 256; + switch (type) { + case 0: // raw bytes + return PyBytes_FromStringAndSize((const char *)buf, len); + case 1: { // str + std::string s((const char *)buf, len); + return fuzz_bytes_to_str(s, str_enc); + } + case 2: { // list of ints + PyObject *lst = PyList_New((Py_ssize_t)len); + if (!lst) return NULL; + for (size_t i = 0; i < len; i++) + PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); + return lst; + } + case 3: { // tuple of ints + PyObject *tup = PyTuple_New((Py_ssize_t)len); + if (!tup) return NULL; + for (size_t i = 0; i < len; i++) + PyTuple_SET_ITEM(tup, i, PyLong_FromLong(buf[i])); + return tup; + } + case 4: { // set + PyObject *lst = PyList_New((Py_ssize_t)len); + if (!lst) return NULL; + for (size_t i = 0; i < len; i++) + PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); + PyObject *s = PySet_New(lst); + Py_DECREF(lst); + return s; + } + case 5: { // frozenset + PyObject *lst = PyList_New((Py_ssize_t)len); + if (!lst) return NULL; + for (size_t i = 0; i < len; i++) + PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); + PyObject *s = PyFrozenSet_New(lst); + Py_DECREF(lst); + return s; + } + case 6: // bytearray + return PyByteArray_FromStringAndSize((const char *)buf, len); + case 7: { // dict.fromkeys + PyObject *d = PyDict_New(); + if (!d) return NULL; + for (size_t i = 0; i < len; i++) { + PyRef key = PyLong_FromLong(buf[i]); + if (key) PyDict_SetItem(d, key, Py_None); + } + return d; + } + default: + return PyBytes_FromStringAndSize((const char *)buf, len); + } +} + +// OP_PICKLE_DUMPS: Build a fuzz-chosen container type (see +// build_pickle_container; str containers use fuzz_bytes_to_str for +// fuzz-chosen byte-to-str decoding), then call pickle.dumps(obj, protocol=N, +// fix_imports=bool). 
Protocol is fuzz-chosen 0-5, exercising all pickle +// opcodes: MARK, SHORT_BINBYTES, BINUNICODE, EMPTY_SET, ADDITEMS, +// FROZENSET, BYTEARRAY8, SETITEMS, etc. +static void op_pickle_dumps(FuzzedDataProvider &fdp) { + int container_type = fdp.ConsumeIntegralInRange(0, 7); + int protocol = fdp.ConsumeIntegralInRange(0, 5); + bool fix_imports = fdp.ConsumeBool(); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef obj(build_pickle_container( + container_type, (const uint8_t *)data.data(), data.size(), str_enc)); + CHECK(obj); + + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef proto = PyLong_FromLong(protocol); + CHECK(proto); + PyDict_SetItemString(kwargs, "protocol", proto); + PyDict_SetItemString(kwargs, "fix_imports", + fix_imports ? Py_True : Py_False); + PyRef args = PyTuple_Pack(1, (PyObject *)obj); + CHECK(args); + PyRef r = PyObject_Call(pickle_dumps, args, kwargs); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_PICKLE_LOADS: Wrap fuzz data in BytesIO, then unpickle via one of 3 +// Unpickler subclass variants (fuzz-chosen): +// 0 — RestrictedUnpickler: blocks find_class (safe against arbitrary code) +// 1 — PersistentUnpickler: handles PERSID/BINPERSID opcodes, blocks find_class +// 2 — RestrictedUnpickler with fix_imports=True, encoding='bytes' (Py2 compat) +// Exercises the _pickle C extension's Unpickler_Type code paths. 
+static void op_pickle_loads(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 2); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", + (PyObject *)pydata); + CHECK(bio); + + PyObject *cls = nullptr; + PyRef kwargs_ref; + switch (variant) { + case 0: // RestrictedUnpickler + cls = RestrictedUnpickler_cls; + break; + case 1: // PersistentUnpickler + cls = PersistentUnpickler_cls; + break; + case 2: { // RestrictedUnpickler with fix_imports + encoding='bytes' + cls = RestrictedUnpickler_cls; + kwargs_ref = PyRef(PyDict_New()); + CHECK(kwargs_ref); + PyDict_SetItemString(kwargs_ref, "fix_imports", Py_True); + PyRef enc = PyUnicode_FromString("bytes"); + CHECK(enc); + PyDict_SetItemString(kwargs_ref, "encoding", enc); + break; + } + } + + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef unpickler = PyObject_Call( + cls, args, kwargs_ref.p ? (PyObject *)kwargs_ref : NULL); + CHECK(unpickler); + PyRef r = PyObject_CallMethod(unpickler, "load", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_PICKLE_PICKLER: Create pickle.Pickler(BytesIO, protocol=N), then chain: +// .dump(list_of_ints), .clear_memo(), .dump(str), .getvalue(). +// The str object for the second dump is built via fuzz_bytes_to_str with a +// fuzz-chosen decoding. Exercises the Pickler_Type, memo proxy clear, and +// multi-dump sequences in the _pickle C extension. Protocol is fuzz-chosen 0-5. +static void op_pickle_pickler(FuzzedDataProvider &fdp) { + int protocol = fdp.ConsumeIntegralInRange(0, 5); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef bio = PyObject_CallFunction(bytesio_ctor, NULL); + CHECK(bio); + + // Import pickle.Pickler (cached after first call). 
+ static PyObject *pickle_Pickler = nullptr; + if (!pickle_Pickler) { + pickle_Pickler = import_attr("pickle", "Pickler"); + } + + PyRef pickler = PyObject_CallFunction(pickle_Pickler, "Oi", + (PyObject *)bio, protocol); + CHECK(pickler); + + // Build first object: list of ints. + PyRef obj1(build_pickle_container( + 2, (const uint8_t *)data.data(), data.size(), str_enc)); + CHECK(obj1); + + PyRef r1 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj1); + if (!r1) { + PyErr_Clear(); + return; + } + + PyRef cm = PyObject_CallMethod(pickler, "clear_memo", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + + // Build second object: string. + PyRef obj2(fuzz_bytes_to_str(data, str_enc)); + CHECK(obj2); + PyRef r2 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj2); + if (PyErr_Occurred()) PyErr_Clear(); + + PyRef val = PyObject_CallMethod(bio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_PICKLE_ROUNDTRIP: Build a fuzz-chosen container (str containers use +// fuzz_bytes_to_str for fuzz-chosen byte-to-str decoding), pickle.dumps() it, +// then pickle.loads() the result. Exercises both Pickler and Unpickler in +// a single iteration, ensuring round-trip consistency. 
+static void op_pickle_roundtrip(FuzzedDataProvider &fdp) { + int container_type = fdp.ConsumeIntegralInRange(0, 7); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef obj(build_pickle_container( + container_type, (const uint8_t *)data.data(), data.size(), str_enc)); + CHECK(obj); + + PyRef dumped = PyObject_CallFunction(pickle_dumps, "O", (PyObject *)obj); + if (!dumped) { + PyErr_Clear(); + return; + } + PyRef loaded = PyObject_CallFunction(pickle_loads, "O", + (PyObject *)dumped); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Operations — Codecs (5 ops) +// +// These exercise the _multibytecodec C engine and per-language codec +// C modules (_codecs_jp, _codecs_cn, _codecs_kr, _codecs_hk, _codecs_tw, +// _codecs_iso2022) as well as built-in codecs (utf-7/8/16/32, ascii, +// latin-1, charmap, unicode_escape, raw_unicode_escape, cp1252). +// --------------------------------------------------------------------------- + +// Codec names for OP_CODECS_DECODE: 17 decoders covering multibyte CJK +// codecs plus single-byte and Unicode escape codecs. +static const char *kCodecDecoders[] = { + "utf-7", "shift_jis", "euc-jp", "gb2312", "big5", "iso-2022-jp", + "euc-kr", "gb18030", "big5hkscs", "charmap", "ascii", "latin-1", + "cp1252", "unicode_escape", "raw_unicode_escape", "utf-16", "utf-32", +}; +static constexpr int kNumCodecDecoders = + sizeof(kCodecDecoders) / sizeof(kCodecDecoders[0]); + +// Codec names for OP_CODECS_ENCODE: 19 encoders covering multibyte CJK +// codecs plus Unicode, UTF, and single-byte encoders. 
+static const char *kCodecEncoders[] = { + "shift_jis", "euc-jp", "gb2312", "big5", "iso-2022-jp", "euc-kr", + "gb18030", "big5hkscs", "unicode_escape", "raw_unicode_escape", + "utf-7", "utf-8", "utf-16", "utf-16-le", "utf-16-be", "utf-32", + "latin-1", "ascii", "charmap", +}; +static constexpr int kNumCodecEncoders = + sizeof(kCodecEncoders) / sizeof(kCodecEncoders[0]); + +// OP_CODECS_DECODE: Call codecs.decode(bytes, codec, 'replace') with a +// fuzz-chosen codec from 17 decoders. The 'replace' error handler ensures +// no UnicodeDecodeError is raised. Exercises the multibytecodec_decode and +// built-in codec decode paths. +static void op_codecs_decode(FuzzedDataProvider &fdp) { + int ci = fdp.ConsumeIntegralInRange(0, kNumCodecDecoders - 1); + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef r = PyObject_CallFunction(codecs_decode, "Oss", + (PyObject *)pydata, + kCodecDecoders[ci], "replace"); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_CODECS_ENCODE: Convert fuzz bytes to a Python str using a fuzz-chosen +// decoding (Latin-1, UTF-8, UTF-16-LE, or UTF-32-LE — see fuzz_bytes_to_str), +// then call codecs.encode(str, codec, 'replace') with a fuzz-chosen codec +// from 19 encoders. Exercises the multibytecodec_encode and built-in codec +// encode paths. 
+static void op_codecs_encode(FuzzedDataProvider &fdp) { + int ci = fdp.ConsumeIntegralInRange(0, kNumCodecEncoders - 1); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + PyRef r = PyObject_CallFunction(codecs_encode, "Oss", + (PyObject *)pystr, + kCodecEncoders[ci], "replace"); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_CODECS_INCREMENTAL_DECODE: Get an IncrementalDecoder for a fuzz-chosen +// codec from {shift_jis, gb18030, utf-16}, split the fuzz data at the +// midpoint, then: .decode(first_half), .decode(second_half, final=True), +// .getstate(), .reset(). Exercises the stateful incremental decoding path +// in _multibytecodec (MultibyteIncrementalDecoder_Type). +static void op_codecs_incremental_decode(FuzzedDataProvider &fdp) { + static const char *kIncCodecs[] = {"shift_jis", "gb18030", "utf-16"}; + int ci = fdp.ConsumeIntegralInRange(0, 2); + std::string data = fdp.ConsumeRemainingBytesAsString(); + size_t mid = data.size() / 2; + + PyRef codec_name = PyUnicode_FromString(kIncCodecs[ci]); + CHECK(codec_name); + PyRef decoder_factory = PyObject_CallFunction( + codecs_getincrementaldecoder, "O", (PyObject *)codec_name); + CHECK(decoder_factory); + + PyRef decoder = PyObject_CallFunction(decoder_factory, "s", "replace"); + CHECK(decoder); + + PyRef half1 = PyBytes_FromStringAndSize(data.data(), mid); + CHECK(half1); + PyRef r1 = PyObject_CallMethod(decoder, "decode", "O", + (PyObject *)half1); + if (!r1) { + PyErr_Clear(); + return; + } + + PyRef half2 = PyBytes_FromStringAndSize(data.data() + mid, + data.size() - mid); + CHECK(half2); + PyRef r2 = PyObject_CallMethod(decoder, "decode", "Oi", + (PyObject *)half2, 1); + if (PyErr_Occurred()) PyErr_Clear(); + + PyRef state = PyObject_CallMethod(decoder, "getstate", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef reset = 
PyObject_CallMethod(decoder, "reset", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_CODECS_INCREMENTAL_ENCODE: Get an IncrementalEncoder for a fuzz-chosen +// codec from {shift_jis, utf-8}. Convert fuzz bytes to str via fuzz-chosen +// decoding (see fuzz_bytes_to_str), split the resulting string at the +// midpoint, then: .encode(first_half), .reset(), .encode(second_half), +// .getstate(). Exercises the stateful incremental encoding path in +// _multibytecodec (MultibyteIncrementalEncoder_Type). +static void op_codecs_incremental_encode(FuzzedDataProvider &fdp) { + static const char *kIncCodecs[] = {"shift_jis", "utf-8"}; + int ci = fdp.ConsumeIntegralInRange(0, 1); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + Py_ssize_t mid = slen / 2; + + PyRef codec_name = PyUnicode_FromString(kIncCodecs[ci]); + CHECK(codec_name); + PyRef encoder_factory = PyObject_CallFunction( + codecs_getincrementalencoder, "O", (PyObject *)codec_name); + CHECK(encoder_factory); + + PyRef encoder = PyObject_CallFunction(encoder_factory, "s", "replace"); + CHECK(encoder); + + PyRef half1 = PyUnicode_Substring(pystr, 0, mid); + CHECK(half1); + PyRef r1 = PyObject_CallMethod(encoder, "encode", "O", + (PyObject *)half1); + if (!r1) { + PyErr_Clear(); + return; + } + + PyRef reset_r = PyObject_CallMethod(encoder, "reset", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + + PyRef half2 = PyUnicode_Substring(pystr, mid, slen); + CHECK(half2); + PyRef r2 = PyObject_CallMethod(encoder, "encode", "O", + (PyObject *)half2); + if (PyErr_Occurred()) PyErr_Clear(); + + PyRef state = PyObject_CallMethod(encoder, "getstate", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_CODECS_STREAM: Wrap fuzz data in BytesIO, create a UTF-8 StreamReader +// via 
codecs.getreader('utf-8')(bio, errors='replace'), then .read(). +// Exercises the StreamReader code path (MultibyteStreamReader_Type for +// multibyte codecs, or built-in StreamReader for UTF-8). +static void op_codecs_stream(FuzzedDataProvider &fdp) { + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", + (PyObject *)pydata); + CHECK(bio); + + PyRef reader_factory = PyObject_CallFunction( + codecs_getreader, "s", "utf-8"); + CHECK(reader_factory); + + PyRef reader = PyObject_CallFunction(reader_factory, "Os", + (PyObject *)bio, "replace"); + CHECK(reader); + + PyRef r = PyObject_CallMethod(reader, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Operations — SSL (1 op) +// --------------------------------------------------------------------------- + +// OP_SSL_CERT: Call ssl.DER_cert_to_PEM_cert(data) to attempt DER-to-PEM +// certificate conversion. If successful, create an SSLContext with +// PROTOCOL_TLS_CLIENT and call .load_verify_locations(cadata=pem_string) +// to exercise the OpenSSL certificate parsing path in the _ssl C module. +static void op_ssl_cert(FuzzedDataProvider &fdp) { + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + PyRef pem = PyObject_CallFunction(ssl_DER_cert_to_PEM_cert, "O", + (PyObject *)pydata); + if (!pem) { + PyErr_Clear(); + return; + } + + // Optionally try to load into SSLContext. 
+ PyRef ctx = PyObject_CallFunction(ssl_SSLContext, "l", + ssl_PROTOCOL_TLS_CLIENT_val); + if (!ctx) { + PyErr_Clear(); + return; + } + + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "cadata", pem); + PyRef empty_args = PyTuple_New(0); + CHECK(empty_args); + PyRef method = PyObject_GetAttrString(ctx, "load_verify_locations"); + if (!method) { + PyErr_Clear(); + return; + } + PyRef r = PyObject_Call(method, empty_args, kwargs); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Dispatch. +// --------------------------------------------------------------------------- + +enum Op { + OP_ZLIB_DECOMPRESS, + OP_ZLIB_COMPRESS, + OP_ZLIB_CHECKSUM, + OP_BZ2, + OP_LZMA_DECOMPRESS, + OP_LZMA_COMPRESS, + OP_BINASCII_DECODE, + OP_BINASCII_ENCODE, + OP_BINASCII_CHECKSUM, + OP_BINASCII_ROUNDTRIP, + OP_PICKLE_DUMPS, + OP_PICKLE_LOADS, + OP_PICKLE_PICKLER, + OP_PICKLE_ROUNDTRIP, + OP_CODECS_DECODE, + OP_CODECS_ENCODE, + OP_CODECS_INCREMENTAL_DECODE, + OP_CODECS_INCREMENTAL_ENCODE, + OP_CODECS_STREAM, + OP_SSL_CERT, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_decode(); + if (size < 1 || size > kMaxInputSize) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_ZLIB_DECOMPRESS: + op_zlib_decompress(fdp); + break; + case OP_ZLIB_COMPRESS: + op_zlib_compress(fdp); + break; + case OP_ZLIB_CHECKSUM: + op_zlib_checksum(fdp); + break; + case OP_BZ2: + op_bz2(fdp); + break; + case OP_LZMA_DECOMPRESS: + op_lzma_decompress(fdp); + break; + case OP_LZMA_COMPRESS: + op_lzma_compress(fdp); + break; + case OP_BINASCII_DECODE: + op_binascii_decode(fdp); + break; + case OP_BINASCII_ENCODE: + op_binascii_encode(fdp); + break; + case OP_BINASCII_CHECKSUM: + op_binascii_checksum(fdp); + break; + case 
OP_BINASCII_ROUNDTRIP: + op_binascii_roundtrip(fdp); + break; + case OP_PICKLE_DUMPS: + op_pickle_dumps(fdp); + break; + case OP_PICKLE_LOADS: + op_pickle_loads(fdp); + break; + case OP_PICKLE_PICKLER: + op_pickle_pickler(fdp); + break; + case OP_PICKLE_ROUNDTRIP: + op_pickle_roundtrip(fdp); + break; + case OP_CODECS_DECODE: + op_codecs_decode(fdp); + break; + case OP_CODECS_ENCODE: + op_codecs_encode(fdp); + break; + case OP_CODECS_INCREMENTAL_DECODE: + op_codecs_incremental_decode(fdp); + break; + case OP_CODECS_INCREMENTAL_ENCODE: + op_codecs_incremental_encode(fdp); + break; + case OP_CODECS_STREAM: + op_codecs_stream(fdp); + break; + case OP_SSL_CERT: + op_ssl_cert(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_helpers.h b/module-fuzzers/fuzz_helpers.h new file mode 100644 index 0000000..c9c270a --- /dev/null +++ b/module-fuzzers/fuzz_helpers.h @@ -0,0 +1,139 @@ +// fuzz_helpers.h — Shared infrastructure for CPython fuzz targets. +// +// Each CPython fuzzer binary (.cpp) includes this header. Since each binary +// compiles exactly one .cpp file, all definitions here are safe (no ODR +// issues across translation units). + +#ifndef FUZZ_HELPERS_H_ +#define FUZZ_HELPERS_H_ + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// LibFuzzer hooks +// --------------------------------------------------------------------------- + +// Disable LeakSanitizer. CPython's pymalloc allocator uses custom freelists +// and arenas that LSAN cannot track, causing thousands of false-positive leak +// reports on every fuzzer iteration. +extern "C" int __lsan_is_turned_off(void) { return 1; } + +// Initialize the CPython interpreter. Called once by libFuzzer before the +// main fuzzing loop begins. 
+extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { + PyConfig config; + PyConfig_InitPythonConfig(&config); + config.install_signal_handlers = 0; + config.int_max_str_digits = 8086; + PyStatus status; + status = + PyConfig_SetBytesString(&config, &config.program_name, *argv[0]); + if (PyStatus_Exception(status)) goto fail; + status = Py_InitializeFromConfig(&config); + if (PyStatus_Exception(status)) goto fail; + PyConfig_Clear(&config); + return 0; +fail: + PyConfig_Clear(&config); + Py_ExitStatusException(status); +} + +// --------------------------------------------------------------------------- +// RAII wrapper and macros +// --------------------------------------------------------------------------- + +// RAII wrapper for PyObject*. Prevents reference leaks by calling Py_XDECREF +// in the destructor. Non-copyable, move-enabled. +struct PyRef { + PyObject *p; + PyRef(PyObject *o = nullptr) : p(o) {} + ~PyRef() { Py_XDECREF(p); } + operator PyObject *() const { return p; } + explicit operator bool() const { return p != nullptr; } + + PyRef(const PyRef &) = delete; + PyRef &operator=(const PyRef &) = delete; + PyRef(PyRef &&o) : p(o.p) { o.p = nullptr; } + PyRef &operator=(PyRef &&o) { + Py_XDECREF(p); + p = o.p; + o.p = nullptr; + return *this; + } +}; + +// Bail out of the current operation if a Python call returns NULL/false. +// Clears the pending Python exception so the next iteration starts clean. +#define CHECK(x) \ + do { \ + if (!(x)) { \ + PyErr_Clear(); \ + return; \ + } \ + } while (0) + +// Expand a std::string into (const char*, Py_ssize_t) for "y#" format codes. +#define Y(s) (s).data(), (Py_ssize_t)(s).size() + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// Import mod.attr and return a new reference. Aborts on failure — called only +// during one-time init, so missing modules indicate a broken build. 
+static PyObject *import_attr(const char *mod, const char *attr) { + PyObject *m = PyImport_ImportModule(mod); + if (!m) { + PyErr_Print(); + abort(); + } + PyObject *a = PyObject_GetAttrString(m, attr); + Py_DECREF(m); + if (!a) { + PyErr_Print(); + abort(); + } + return a; +} + +// Convert raw fuzz bytes to a Python str using a fuzz-chosen decoding. +// Different decodings give the fuzzer control over different codepoint ranges: +// 0 — Latin-1: lossless 1:1 byte-to-codepoint (U+0000-U+00FF) +// 1 — UTF-8: variable-width, full Unicode (invalid bytes -> U+FFFD) +// 2 — UTF-16-LE: 2 bytes per codepoint, covers BMP including CJK ranges +// 3 — UTF-32-LE: 4 bytes per codepoint, full Unicode incl. supplementary +static PyObject *fuzz_bytes_to_str(const std::string &data, int method) { + switch (method & 3) { + case 0: + return PyUnicode_DecodeLatin1(Y(data), NULL); + case 1: + return PyUnicode_DecodeUTF8(Y(data), "replace"); + case 2: { + int order = -1; // little-endian + return PyUnicode_DecodeUTF16( + data.data(), data.size(), "replace", &order); + } + case 3: { + int order = -1; // little-endian + return PyUnicode_DecodeUTF32( + data.data(), data.size(), "replace", &order); + } + } + return PyUnicode_DecodeLatin1(Y(data), NULL); // unreachable +} + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +// How often (in iterations) to run PyGC_Collect(). +static constexpr int kGcInterval = 200; + +// Maximum fuzz input size (1 MB). +static constexpr size_t kMaxInputSize = 0x100000; + +#endif // FUZZ_HELPERS_H_ diff --git a/module-fuzzers/fuzz_ioops.cpp b/module-fuzzers/fuzz_ioops.cpp new file mode 100644 index 0000000..a8dbd49 --- /dev/null +++ b/module-fuzzers/fuzz_ioops.cpp @@ -0,0 +1,1015 @@ +// fuzz_ioops.cpp — Fuzzer for CPython's I/O C extension modules. 
+// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// _io/bytesio.c — BytesIO: write, seek, read, readline, readlines, +// readinto, read1, readinto1, getvalue, getbuffer, +// truncate, tell, iteration, peek (via BufferedReader) +// _io/textio.c — TextIOWrapper: write, read, readline, readlines, +// flush, seek, reconfigure, detach, properties +// (readable/writable/seekable/encoding/buffer), +// IncrementalNewlineDecoder +// _io/bufferedio.c — BufferedReader, BufferedWriter, BufferedRandom, +// BufferedRWPair: read, write, peek, read1, readline, +// seek, tell, truncate, flush, detach, raw +// _io/fileio.c — FileIO: read, readall, readinto, write, flush, +// tell, seek, truncate, fileno, isatty, name, mode, +// closefd, readable, writable, seekable +// _io/_iomodule.c — io.open() with various modes (r, rb, w, wb) +// _io/stringio.c — StringIO: write, seek, readline, readlines, +// truncate, tell, close +// +// The first byte of fuzz input selects one of 7 operation types. Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (encoding, error handler, newline mode, I/O variant). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. Temporary directory and test file +// are created once at init. PyRef (RAII) prevents reference leaks. +// PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. 
+// --------------------------------------------------------------------------- + +// io classes +static PyObject *io_BytesIO, *io_TextIOWrapper; +static PyObject *io_BufferedReader, *io_BufferedWriter; +static PyObject *io_BufferedRandom, *io_BufferedRWPair; +static PyObject *io_FileIO, *io_open, *io_StringIO; +static PyObject *io_IncrementalNewlineDecoder; + +// os +static PyObject *os_path_join, *os_open_fn, *os_unlink; +static PyObject *os_O_RDONLY; + +// Temp paths (as C strings). +static char tmpdir[256]; +static char tmpfile_path[256]; + +static unsigned long gc_counter = 0; + +static int initialized = 0; + +static void init_ioops(void) { + if (initialized) return; + + // io + io_BytesIO = import_attr("io", "BytesIO"); + io_TextIOWrapper = import_attr("io", "TextIOWrapper"); + io_BufferedReader = import_attr("io", "BufferedReader"); + io_BufferedWriter = import_attr("io", "BufferedWriter"); + io_BufferedRandom = import_attr("io", "BufferedRandom"); + io_BufferedRWPair = import_attr("io", "BufferedRWPair"); + io_FileIO = import_attr("io", "FileIO"); + io_open = import_attr("io", "open"); + io_StringIO = import_attr("io", "StringIO"); + io_IncrementalNewlineDecoder = import_attr("io", + "IncrementalNewlineDecoder"); + + // os + os_path_join = import_attr("os.path", "join"); + os_open_fn = import_attr("os", "open"); + os_unlink = import_attr("os", "unlink"); + os_O_RDONLY = import_attr("os", "O_RDONLY"); + + // Create temp directory and test file. 
+ { + PyObject *globals = PyDict_New(); + PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); + PyObject *r = PyRun_String( + "import tempfile, os\n" + "_tmpdir = tempfile.mkdtemp(prefix='fuzz_io_')\n" + "_tmpfile = os.path.join(_tmpdir, 'test')\n" + "with open(_tmpfile, 'wb') as f:\n" + " f.write(b'A' * 4096)\n", + Py_file_input, globals, globals); + if (!r) { PyErr_Print(); abort(); } + Py_DECREF(r); + PyObject *td = PyDict_GetItemString(globals, "_tmpdir"); + PyObject *tf = PyDict_GetItemString(globals, "_tmpfile"); + const char *td_str = PyUnicode_AsUTF8(td); + const char *tf_str = PyUnicode_AsUTF8(tf); + snprintf(tmpdir, sizeof(tmpdir), "%s", td_str); + snprintf(tmpfile_path, sizeof(tmpfile_path), "%s", tf_str); + Py_DECREF(globals); + } + + // Suppress warnings. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// Helper: Build a temp file path. +static PyObject *make_tmppath(const char *name) { + return PyObject_CallFunction(os_path_join, "ss", tmpdir, name); +} + +// Helper: Unlink a file (ignore errors). +static void unlink_path(PyObject *path) { + PyRef r = PyObject_CallFunction(os_unlink, "O", path); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Operations (7 ops). +// --------------------------------------------------------------------------- + +// OP_BYTESIO: BytesIO with fuzz data, then FDP selects actions. +// Exercises _io/bytesio.c paths. +static void op_bytesio(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 6); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + switch (variant) { + case 0: { + // Basic: write/seek/read/getvalue/tell. 
+      // (op_bytesio, variant 0) Start from an empty BytesIO; a failed write
+      // abandons the variant, later failures are only cleared.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, NULL);
+      CHECK(bio);
+      PyRef wr = PyObject_CallMethod(bio, "write", "O", (PyObject *)pydata);
+      if (!wr) { PyErr_Clear(); break; }
+      PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef rd = PyObject_CallMethod(bio, "read", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef tl = PyObject_CallMethod(bio, "tell", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef cl = PyObject_CallMethod(bio, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 1: {
+      // readline, readlines, readinto.
+      // The stream is rewound to offset 0 before readlines and readinto.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      {
+        PyRef r = PyObject_CallMethod(bio, "readline", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r = PyObject_CallMethod(bio, "readlines", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+        // 32-byte bytearray used as the destination buffer for readinto.
+        PyRef buf = PyByteArray_FromStringAndSize(NULL, 32);
+        CHECK(buf);
+        PyRef r = PyObject_CallMethod(bio, "readinto", "O", (PyObject *)buf);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      PyRef cl = PyObject_CallMethod(bio, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 2: {
+      // truncate + write + getvalue.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      long trunc_at = data.size() < 64 ?
data.size() : 64; + PyRef tr = PyObject_CallMethod(bio, "truncate", "l", trunc_at); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef wr = PyObject_CallMethod(bio, "write", "y#", "XX", 2); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 3: { + // getbuffer (memoryview). + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + PyRef mv = PyObject_CallMethod(bio, "getbuffer", NULL); + if (mv) { + PyRef bytes_val = PyObject_CallFunction( + (PyObject *)&PyBytes_Type, "O", (PyObject *)mv); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef rel = PyObject_CallMethod(mv, "release", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 4: { + // read1, readinto1. + PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); + CHECK(bio); + { + PyRef r = PyObject_CallMethod(bio, "read1", "i", 16); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef buf = PyByteArray_FromStringAndSize(NULL, 32); + CHECK(buf); + PyRef r = PyObject_CallMethod(bio, "readinto1", "O", (PyObject *)buf); + if (PyErr_Occurred()) PyErr_Clear(); + } + PyRef cl = PyObject_CallMethod(bio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 5: { + // Iteration. 
+      // (op_bytesio, variant 5) Drain the BytesIO through the iterator
+      // protocol, dropping each yielded item immediately.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef it = PyObject_GetIter(bio);
+      if (it) {
+        PyObject *line;
+        while ((line = PyIter_Next(it)) != NULL)
+          Py_DECREF(line);
+        // PyIter_Next returns NULL both at exhaustion and on error.
+        if (PyErr_Occurred()) PyErr_Clear();
+      } else {
+        PyErr_Clear();
+      }
+      PyRef cl = PyObject_CallMethod(bio, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 6: {
+      // Peek via BufferedReader wrapping.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef br = PyObject_CallFunction(io_BufferedReader, "O",
+                                       (PyObject *)bio);
+      CHECK(br);
+      {
+        PyRef r = PyObject_CallMethod(br, "peek", "i", 16);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "read", "i", 8);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "read1", "i", 8);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      PyRef cl = PyObject_CallMethod(br, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+  }
+}
+
+// OP_TEXTIOWRAPPER: FDP selects encoding, errors, newline. Create BytesIO +
+// TextIOWrapper. Exercises _io/textio.c paths.
+static void op_textiowrapper(FuzzedDataProvider &fdp) {
+  // Candidate constructor arguments; each table is indexed by a value drawn
+  // from the fuzz input.
+  static const char *kEncodings[] = {"utf-8", "latin-1", "ascii", "utf-16"};
+  static const char *kErrors[] = {
+      "strict", "replace", "xmlcharrefreplace", "backslashreplace",
+  };
+  // NULL = universal newline mode.
+  static const char *kNewlines[] = {NULL, "\n", "\r\n", ""};
+
+  // All selectors are drawn up front, before the payload bytes.
+  int enc_idx = fdp.ConsumeIntegralInRange(0, 3);
+  int err_idx = fdp.ConsumeIntegralInRange(0, 3);
+  int nl_idx = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 4);
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+
+  const char *encoding = kEncodings[enc_idx];
+  const char *errors = kErrors[err_idx];
+  const char *newline = kNewlines[nl_idx];
+
+  switch (variant) {
+    case 0: {
+      // Write mode: write string, flush, seek, read.
+      // This is the only variant that passes the fuzz-selected encoding,
+      // errors and newline together to the TextIOWrapper constructor.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, NULL);
+      CHECK(bio);
+      PyRef kwargs = PyDict_New();
+      CHECK(kwargs);
+      PyRef enc_str = PyUnicode_FromString(encoding);
+      CHECK(enc_str);
+      PyDict_SetItemString(kwargs, "encoding", enc_str);
+      PyRef err_str = PyUnicode_FromString(errors);
+      CHECK(err_str);
+      PyDict_SetItemString(kwargs, "errors", err_str);
+      // The newline keyword is omitted entirely when the table entry is NULL.
+      if (newline) {
+        PyRef nl_str = PyUnicode_FromString(newline);
+        CHECK(nl_str);
+        PyDict_SetItemString(kwargs, "newline", nl_str);
+      }
+      PyRef args = PyTuple_Pack(1, (PyObject *)bio);
+      CHECK(args);
+      PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs);
+      CHECK(tw);
+
+      PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+      CHECK(pystr);
+      {
+        PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "flush", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "read", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 1: {
+      // Read mode: BytesIO(data) + TextIOWrapper, read/readline/readlines.
+      // (op_textiowrapper, variant 1) Uses the fuzz-selected encoding but a
+      // fixed "replace" error handler, and passes no newline argument.
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef kwargs = PyDict_New();
+      CHECK(kwargs);
+      PyRef enc_str = PyUnicode_FromString(encoding);
+      CHECK(enc_str);
+      PyDict_SetItemString(kwargs, "encoding", enc_str);
+      PyRef err_str = PyUnicode_FromString("replace");
+      CHECK(err_str);
+      PyDict_SetItemString(kwargs, "errors", err_str);
+      PyRef args = PyTuple_Pack(1, (PyObject *)bio);
+      CHECK(args);
+      PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs);
+      CHECK(tw);
+
+      // readline x3.
+      for (int i = 0; i < 3; i++) {
+        PyRef r = PyObject_CallMethod(tw, "readline", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r = PyObject_CallMethod(tw, "readlines", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // Two bounded reads of 16 followed by an unbounded read of the rest.
+        PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r = PyObject_CallMethod(tw, "read", "i", 16);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r2 = PyObject_CallMethod(tw, "read", "i", 16);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef r3 = PyObject_CallMethod(tw, "read", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 2: {
+      // Reconfigure: write, reconfigure, write more, read.
+ PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); + CHECK(bio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef enc_str = PyUnicode_FromString("utf-8"); + CHECK(enc_str); + PyDict_SetItemString(kwargs, "encoding", enc_str); + PyRef args = PyTuple_Pack(1, (PyObject *)bio); + CHECK(args); + PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); + CHECK(tw); + + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + { + PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef kw = PyDict_New(); + CHECK(kw); + PyRef nl = PyUnicode_FromString("\n"); + CHECK(nl); + PyDict_SetItemString(kw, "newline", nl); + PyDict_SetItemString(kw, "line_buffering", Py_True); + PyRef empty = PyTuple_New(0); + CHECK(empty); + PyRef r = PyObject_Call( + PyObject_GetAttrString(tw, "reconfigure"), empty, kw); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sub = PyUnicode_Substring(pystr, 0, 32); + if (sub) { + PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)sub); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + { + PyRef r = PyObject_CallMethod(tw, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef rd = PyObject_CallMethod(tw, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(tw, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 3: { + // Detach. 
+      // (op_textiowrapper, variant 3) Fixed utf-8/"replace" wrapper over the
+      // fuzz bytes: read a little, detach, then keep using the detached
+      // stream.
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef kwargs = PyDict_New();
+      CHECK(kwargs);
+      PyRef enc_str = PyUnicode_FromString("utf-8");
+      CHECK(enc_str);
+      PyDict_SetItemString(kwargs, "encoding", enc_str);
+      PyRef err_str = PyUnicode_FromString("replace");
+      CHECK(err_str);
+      PyDict_SetItemString(kwargs, "errors", err_str);
+      PyRef args = PyTuple_Pack(1, (PyObject *)bio);
+      CHECK(args);
+      PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs);
+      CHECK(tw);
+      {
+        PyRef r = PyObject_CallMethod(tw, "read", "i", 4);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // The wrapper itself is not closed in this variant; only the object
+        // returned by detach() is read and closed.
+        PyRef raw = PyObject_CallMethod(tw, "detach", NULL);
+        if (raw) {
+          PyRef rd = PyObject_CallMethod(raw, "read", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+          PyRef cl = PyObject_CallMethod(raw, "close", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      break;
+    }
+    case 4: {
+      // Properties: writable/readable/seekable/encoding/buffer.
+      // (op_textiowrapper, variant 4) Fixed utf-8/"replace" wrapper; only
+      // capability queries and attribute reads, no data I/O.
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef kwargs = PyDict_New();
+      CHECK(kwargs);
+      PyRef enc_str = PyUnicode_FromString("utf-8");
+      CHECK(enc_str);
+      PyDict_SetItemString(kwargs, "encoding", enc_str);
+      PyRef err_str = PyUnicode_FromString("replace");
+      CHECK(err_str);
+      PyDict_SetItemString(kwargs, "errors", err_str);
+      PyRef args = PyTuple_Pack(1, (PyObject *)bio);
+      CHECK(args);
+      PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs);
+      CHECK(tw);
+      {
+        PyRef r = PyObject_CallMethod(tw, "writable", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "readable", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "seekable", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_GetAttrString(tw, "encoding");
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_GetAttrString(tw, "buffer");
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(tw, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+  }
+}
+
+// OP_BUFFERED_IO: FDP selects variant — BufferedReader, BufferedWriter,
+// BufferedRandom, BufferedRWPair. Exercises _io/bufferedio.c paths.
+static void op_buffered_io(FuzzedDataProvider &fdp) {
+  int variant = fdp.ConsumeIntegralInRange(0, 3);
+  // At most 10000 bytes of the remaining input become the payload.
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+  CHECK(pydata);
+
+  switch (variant) {
+    case 0: {
+      // BufferedReader wrapping BytesIO.
+      // (op_buffered_io, variant 0) Read-side calls on a BufferedReader,
+      // ending with detach() instead of close().
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef br = PyObject_CallFunction(io_BufferedReader, "O",
+                                       (PyObject *)bio);
+      CHECK(br);
+      {
+        PyRef r = PyObject_CallMethod(br, "read", "i", 64);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "peek", "i", 16);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "read1", "i", 16);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(br, "readline", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef raw = PyObject_GetAttrString(br, "raw");
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef det = PyObject_CallMethod(br, "detach", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 1: {
+      // BufferedWriter wrapping BytesIO.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, NULL);
+      CHECK(bio);
+      PyRef bw = PyObject_CallFunction(io_BufferedWriter, "O",
+                                       (PyObject *)bio);
+      CHECK(bw);
+      {
+        PyRef r = PyObject_CallMethod(bw, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(bw, "flush", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(bw, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 2: {
+      // BufferedRandom wrapping BytesIO.
+      // (op_buffered_io, variant 2) The BytesIO is pre-filled with the
+      // payload and the same payload is then written through the
+      // BufferedRandom.
+      PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata);
+      CHECK(bio);
+      PyRef brnd = PyObject_CallFunction(io_BufferedRandom, "O",
+                                         (PyObject *)bio);
+      CHECK(brnd);
+      {
+        PyRef r = PyObject_CallMethod(brnd, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(brnd, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(brnd, "read", "i", 64);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(brnd, "tell", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // Truncate to min(len(data), 64).
+        long trunc_at = data.size() < 64 ? data.size() : 64;
+        PyRef r = PyObject_CallMethod(brnd, "truncate", "l", trunc_at);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(brnd, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+    case 3: {
+      // BufferedRWPair.
+      // Reader side holds the payload; writer side starts empty.
+      PyRef r_bio = PyObject_CallFunction(io_BytesIO, "O",
+                                          (PyObject *)pydata);
+      CHECK(r_bio);
+      PyRef w_bio = PyObject_CallFunction(io_BytesIO, NULL);
+      CHECK(w_bio);
+      PyRef rw = PyObject_CallFunction(io_BufferedRWPair, "OO",
+                                       (PyObject *)r_bio, (PyObject *)w_bio);
+      CHECK(rw);
+      {
+        PyRef r = PyObject_CallMethod(rw, "read", "i", 32);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(rw, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(rw, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      break;
+    }
+  }
+}
+
+// OP_FILEIO: FDP selects mode — read, write, read+write.
+// Exercises _io/fileio.c paths.
+static void op_fileio(FuzzedDataProvider &fdp) {
+  int variant = fdp.ConsumeIntegralInRange(0, 2);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+
+  switch (variant) {
+    case 0: {
+      // Read from tmpfile.
+ PyRef fio = PyObject_CallFunction(io_FileIO, "ss", + tmpfile_path, "r"); + CHECK(fio); + { + PyRef r = PyObject_CallMethod(fio, "read", "i", 64); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "readall", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef sk = PyObject_CallMethod(fio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef buf = PyByteArray_FromStringAndSize(NULL, 64); + CHECK(buf); + PyRef r = PyObject_CallMethod(fio, "readinto", "O", (PyObject *)buf); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "fileno", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "isatty", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(fio, "name"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_GetAttrString(fio, "mode"); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "readable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "seekable", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(fio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 1: { + // Write to temp file. 
+      // (op_fileio, variant 1) Writes the payload to <tmpdir>/fio_w and
+      // removes the file afterwards.
+      PyRef path(make_tmppath("fio_w"));
+      CHECK(path);
+      PyRef fio = PyObject_CallFunction(io_FileIO, "Os",
+                                        (PyObject *)path, "w");
+      CHECK(fio);
+      {
+        PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "flush", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "tell", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      unlink_path(path);
+      break;
+    }
+    case 2: {
+      // Read+write mode.
+      PyRef path(make_tmppath("fio_rw"));
+      CHECK(path);
+      PyRef fio = PyObject_CallFunction(io_FileIO, "Os",
+                                        (PyObject *)path, "w+b");
+      CHECK(fio);
+      {
+        PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+        CHECK(pydata);
+        PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "seek", "i", 0);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "read", "i", 32);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        // Fixed size of 128, independent of how much was written.
+        PyRef r = PyObject_CallMethod(fio, "truncate", "i", 128);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      {
+        PyRef r = PyObject_CallMethod(fio, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      unlink_path(path);
+      break;
+    }
+  }
+}
+
+// OP_IO_OPEN: FDP selects mode — read text, read binary, write text, write binary.
+// Exercises _io/_iomodule.c open() paths.
+static void op_io_open(FuzzedDataProvider &fdp) {
+  int variant = fdp.ConsumeIntegralInRange(0, 3);
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+
+  switch (variant) {
+    case 0: {
+      // Read text from tmpfile.
+      // (op_io_open, variant 0) No encoding argument is passed to open().
+      PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "r");
+      CHECK(f);
+      PyRef r = PyObject_CallMethod(f, "read", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef cl = PyObject_CallMethod(f, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 1: {
+      // Read binary from tmpfile.
+      PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "rb");
+      CHECK(f);
+      PyRef r = PyObject_CallMethod(f, "read", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef cl = PyObject_CallMethod(f, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      break;
+    }
+    case 2: {
+      // Write text.
+      // Writes the fuzz-derived str to <tmpdir>/ioopen_w, then removes it.
+      PyRef path(make_tmppath("ioopen_w"));
+      CHECK(path);
+      PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+      CHECK(pystr);
+      PyRef f = PyObject_CallFunction(io_open, "Os", (PyObject *)path, "w");
+      CHECK(f);
+      PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pystr);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef cl = PyObject_CallMethod(f, "close", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+      unlink_path(path);
+      break;
+    }
+    case 3: {
+      // Write binary then read back.
+      PyRef path(make_tmppath("ioopen_wb"));
+      CHECK(path);
+      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+      CHECK(pydata);
+      {
+        // Scoped so the write handle is released before the file is
+        // reopened.
+        PyRef f = PyObject_CallFunction(io_open, "Os",
+                                        (PyObject *)path, "wb");
+        CHECK(f);
+        PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pydata);
+        if (PyErr_Occurred()) PyErr_Clear();
+        PyRef cl = PyObject_CallMethod(f, "close", NULL);
+        if (PyErr_Occurred()) PyErr_Clear();
+      }
+      // Read back.
+      {
+        // (op_io_open, variant 3) Reopen with the default mode and
+        // errors="replace"; only the path is passed positionally.
+        PyRef kwargs = PyDict_New();
+        CHECK(kwargs);
+        PyRef err = PyUnicode_FromString("replace");
+        CHECK(err);
+        PyDict_SetItemString(kwargs, "errors", err);
+        PyRef args = PyTuple_Pack(1, (PyObject *)path);
+        CHECK(args);
+        PyRef f = PyObject_Call(io_open, args, kwargs);
+        if (f) {
+          PyRef r = PyObject_CallMethod(f, "read", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+          PyRef cl = PyObject_CallMethod(f, "close", NULL);
+          if (PyErr_Occurred()) PyErr_Clear();
+        } else {
+          PyErr_Clear();
+        }
+      }
+      unlink_path(path);
+      break;
+    }
+  }
+}
+
+// OP_NEWLINE_DECODER: FDP selects translate mode. Create
+// IncrementalNewlineDecoder, split str at midpoint, decode halves.
+// Exercises _io/textio.c's newline decoder paths.
+static void op_newline_decoder(FuzzedDataProvider &fdp) {
+  bool translate = fdp.ConsumeBool();
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  // The decoder argument is None, so the object is fed str input directly.
+  PyRef dec = PyObject_CallFunction(io_IncrementalNewlineDecoder, "OO",
+                                    Py_None,
+                                    translate ?
Py_True : Py_False);
+  CHECK(dec);
+
+  // Split point is in code points, not bytes.
+  Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+  Py_ssize_t mid = slen / 2;
+
+  PyRef half1 = PyUnicode_Substring(pystr, 0, mid);
+  CHECK(half1);
+  PyRef half2 = PyUnicode_Substring(pystr, mid, slen);
+  CHECK(half2);
+
+  {
+    PyRef r = PyObject_CallMethod(dec, "decode", "O", (PyObject *)half1);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+  {
+    // Second positional argument 1 is the decoder's "final" flag.
+    PyRef r = PyObject_CallMethod(dec, "decode", "Oi",
+                                  (PyObject *)half2, 1);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+  {
+    // Snapshot the state, reset, then restore the snapshot if one was
+    // obtained and it is not None.
+    PyRef state = PyObject_CallMethod(dec, "getstate", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    PyRef reset = PyObject_CallMethod(dec, "reset", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    if (state && state.p != Py_None) {
+      PyRef ss = PyObject_CallMethod(dec, "setstate", "O",
+                                     (PyObject *)state);
+      if (PyErr_Occurred()) PyErr_Clear();
+    }
+  }
+}
+
+// OP_STRINGIO: StringIO write/readline/readlines/truncate/close.
+// Exercises _io/stringio.c paths.
+static void op_stringio(FuzzedDataProvider &fdp) {
+  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+  int variant = fdp.ConsumeIntegralInRange(0, 1);
+  std::string data = fdp.ConsumeBytesAsString(
+      std::min(fdp.remaining_bytes(), (size_t)10000));
+  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+  CHECK(pystr);
+
+  PyRef sio = PyObject_CallFunction(io_StringIO, NULL);
+  CHECK(sio);
+
+  {
+    // A failed initial write ends the operation; later failures are only
+    // cleared.
+    PyRef r = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr);
+    if (!r) { PyErr_Clear(); return; }
+  }
+  {
+    PyRef r = PyObject_CallMethod(sio, "seek", "i", 0);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  if (variant == 0) {
+    // readline x3 + readlines.
+    for (int i = 0; i < 3; i++) {
+      PyRef r = PyObject_CallMethod(sio, "readline", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+    }
+    {
+      PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0);
+      if (PyErr_Occurred()) PyErr_Clear();
+      PyRef r = PyObject_CallMethod(sio, "readlines", NULL);
+      if (PyErr_Occurred()) PyErr_Clear();
+    }
+  } else {
+    // readlines on initial content.
+    PyRef r = PyObject_CallMethod(sio, "readlines", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+
+  {
+    // Truncate to min(len(str), 64) code points.
+    Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+    long trunc_at = slen < 64 ? slen : 64;
+    PyRef r = PyObject_CallMethod(sio, "truncate", "l", trunc_at);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+  {
+    PyRef r = PyObject_CallMethod(sio, "tell", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+  {
+    PyRef r = PyObject_CallMethod(sio, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch.
+// ---------------------------------------------------------------------------
+
+// Operation selectors. NUM_OPS must stay last: it is the count used to bound
+// the selector drawn in LLVMFuzzerTestOneInput.
+enum Op {
+  OP_BYTESIO,
+  OP_TEXTIOWRAPPER,
+  OP_BUFFERED_IO,
+  OP_FILEIO,
+  OP_IO_OPEN,
+  OP_NEWLINE_DECODER,
+  OP_STRINGIO,
+  NUM_OPS
+};
+
+// libFuzzer entry point. Inputs that are empty or larger than 0x10000 bytes
+// are ignored. Otherwise one operation is selected and run, and a full GC
+// pass is forced every kGcInterval accepted inputs. Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  assert(Py_IsInitialized());
+  init_ioops();
+  if (size < 1 || size > 0x10000) return 0;
+  // Start each iteration with a clean error indicator.
+  if (PyErr_Occurred()) PyErr_Clear();
+
+  FuzzedDataProvider fdp(data, size);
+  switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) {
+    case OP_BYTESIO:
+      op_bytesio(fdp);
+      break;
+    case OP_TEXTIOWRAPPER:
+      op_textiowrapper(fdp);
+      break;
+    case OP_BUFFERED_IO:
+      op_buffered_io(fdp);
+      break;
+    case OP_FILEIO:
+      op_fileio(fdp);
+      break;
+    case OP_IO_OPEN:
+      op_io_open(fdp);
+      break;
+    case OP_NEWLINE_DECODER:
+      op_newline_decoder(fdp);
+      break;
+    case OP_STRINGIO:
+      op_stringio(fdp);
+      break;
+  }
+
+  if (++gc_counter % kGcInterval == 0) PyGC_Collect();
+  return 0;
+}
diff --git a/module-fuzzers/fuzz_parsers.cpp
b/module-fuzzers/fuzz_parsers.cpp new file mode 100644 index 0000000..2e83878 --- /dev/null +++ b/module-fuzzers/fuzz_parsers.cpp @@ -0,0 +1,744 @@ +// fuzz_parsers.cpp — Fuzzer for CPython's parser and text processing C +// extension modules. +// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// _json — json.dumps(), JSONEncoder with various options +// _csv — csv.Sniffer.sniff/has_header, csv.writer, +// csv.DictWriter with quoting modes +// pyexpat — ParserCreate with encodings/namespace_separator, +// Parse, ParseFile, handlers, GetInputContext +// time — strftime with fuzz format, strptime with fuzz input +// _operator — lt, gt, eq, ne, contains, countOf, indexOf, +// length_hint, concat, getitem, methodcaller +// _locale — strxfrm, strcoll, getlocale +// _opcode (via dis) — dis.dis() on compiled code +// +// The first byte of fuzz input selects one of 7 operation types. Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (encoder options, parser encoding, operator selection). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. PyRef (RAII) prevents reference +// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. 
+// ---------------------------------------------------------------------------
+
+// json
+static PyObject *json_dumps, *json_JSONEncoder;
+
+// csv
+static PyObject *csv_Sniffer, *csv_writer, *csv_DictWriter;
+static PyObject *csv_QUOTE_ALL, *csv_QUOTE_NONNUMERIC;
+
+// expat
+static PyObject *expat_ParserCreate;
+
+// io
+static PyObject *bytesio_ctor, *stringio_ctor;
+
+// time
+static PyObject *time_strftime, *time_strptime, *time_localtime;
+
+// operator
+static PyObject *op_lt, *op_gt, *op_eq, *op_ne;
+static PyObject *op_contains, *op_countOf, *op_indexOf, *op_length_hint;
+static PyObject *op_concat, *op_getitem, *op_methodcaller;
+
+// dis
+static PyObject *dis_dis;
+
+// locale
+static PyObject *locale_strxfrm, *locale_strcoll, *locale_getlocale;
+
+// Handler lambdas (for expat).
+static PyObject *noop_handler, *noop_handler_noargs;
+
+static unsigned long gc_counter = 0;
+
+static int initialized = 0;
+
+// One-time setup: caches every callable and constant the parser operations
+// use and builds the two no-op Python callables kept for expat handlers.
+// Aborts the process if the bootstrap snippet fails or its results are
+// missing.
+static void init_parsers(void) {
+  if (initialized) return;
+
+  // json
+  json_dumps = import_attr("json", "dumps");
+  json_JSONEncoder = import_attr("json", "JSONEncoder");
+
+  // csv
+  csv_Sniffer = import_attr("csv", "Sniffer");
+  csv_writer = import_attr("csv", "writer");
+  csv_DictWriter = import_attr("csv", "DictWriter");
+  csv_QUOTE_ALL = import_attr("csv", "QUOTE_ALL");
+  csv_QUOTE_NONNUMERIC = import_attr("csv", "QUOTE_NONNUMERIC");
+
+  // expat
+  expat_ParserCreate = import_attr("xml.parsers.expat", "ParserCreate");
+
+  // io
+  bytesio_ctor = import_attr("io", "BytesIO");
+  stringio_ctor = import_attr("io", "StringIO");
+
+  // time
+  time_strftime = import_attr("time", "strftime");
+  time_strptime = import_attr("time", "strptime");
+  time_localtime = import_attr("time", "localtime");
+
+  // operator
+  op_lt = import_attr("operator", "lt");
+  op_gt = import_attr("operator", "gt");
+  op_eq = import_attr("operator", "eq");
+  op_ne = import_attr("operator", "ne");
+  op_contains = import_attr("operator", "contains");
+  op_countOf = import_attr("operator", "countOf");
+  op_indexOf = import_attr("operator", "indexOf");
+  op_length_hint = import_attr("operator", "length_hint");
+  op_concat = import_attr("operator", "concat");
+  op_getitem = import_attr("operator", "getitem");
+  op_methodcaller = import_attr("operator", "methodcaller");
+
+  // dis
+  dis_dis = import_attr("dis", "dis");
+
+  // locale
+  locale_strxfrm = import_attr("locale", "strxfrm");
+  locale_strcoll = import_attr("locale", "strcoll");
+  locale_getlocale = import_attr("locale", "getlocale");
+
+  // No-op handler lambdas for expat.
+  {
+    PyObject *globals = PyDict_New();
+    if (!globals) { PyErr_Print(); abort(); }
+    PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
+    PyObject *r = PyRun_String(
+        "_noop = lambda *a: None\n"
+        "_noop_noargs = lambda: None\n",
+        Py_file_input, globals, globals);
+    if (!r) { PyErr_Print(); abort(); }
+    Py_DECREF(r);
+    // PyDict_GetItemString returns borrowed references (or NULL). Validate
+    // both before taking ownership; Py_INCREF on NULL would crash.
+    noop_handler = PyDict_GetItemString(globals, "_noop");
+    noop_handler_noargs = PyDict_GetItemString(globals, "_noop_noargs");
+    if (!noop_handler || !noop_handler_noargs) {
+      if (PyErr_Occurred()) PyErr_Print();
+      abort();
+    }
+    // Promote to owned references so they outlive `globals`.
+    Py_INCREF(noop_handler);
+    Py_INCREF(noop_handler_noargs);
+    Py_DECREF(globals);
+  }
+
+  // Suppress warnings.
+  PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
+
+  assert(!PyErr_Occurred());
+  initialized = 1;
+}
+
+// ---------------------------------------------------------------------------
+// Operations (7 ops).
+// ---------------------------------------------------------------------------
+
+// OP_JSON_ENCODE: FDP selects variant — json.dumps(str), json.dumps({str:str}),
+// json.dumps([str,str]), or JSONEncoder with options. Exercises the _json
+// C acceleration module's encoding paths.
+static void op_json_encode(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 5); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + switch (variant) { + case 0: { + // json.dumps(str) + PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)pystr); + break; + } + case 1: { + // json.dumps({str: str}) + PyRef d = PyDict_New(); + CHECK(d); + PyDict_SetItem(d, pystr, pystr); + PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)d); + break; + } + case 2: { + // json.dumps([str, str]) + PyRef lst = PyList_New(2); + CHECK(lst); + Py_INCREF((PyObject *)pystr); + Py_INCREF((PyObject *)pystr); + PyList_SET_ITEM((PyObject *)lst, 0, (PyObject *)pystr); + PyList_SET_ITEM((PyObject *)lst, 1, (PyObject *)pystr); + PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)lst); + break; + } + case 3: { + // JSONEncoder(ensure_ascii=False).encode(str) + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "ensure_ascii", Py_False); + PyRef empty = PyTuple_New(0); + CHECK(empty); + PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); + CHECK(enc); + PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)pystr); + break; + } + case 4: { + // JSONEncoder(ensure_ascii=True).encode(str) + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "ensure_ascii", Py_True); + PyRef empty = PyTuple_New(0); + CHECK(empty); + PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); + CHECK(enc); + PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)pystr); + break; + } + case 5: { + // JSONEncoder(sort_keys=True, indent=2, ensure_ascii=False).encode({s:s}) + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "sort_keys", Py_True); + PyRef indent = PyLong_FromLong(2); + CHECK(indent); + 
PyDict_SetItemString(kwargs, "indent", indent); + PyDict_SetItemString(kwargs, "ensure_ascii", Py_False); + PyRef empty = PyTuple_New(0); + CHECK(empty); + PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); + CHECK(enc); + PyRef d = PyDict_New(); + CHECK(d); + PyDict_SetItem(d, pystr, pystr); + PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)d); + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_CSV_SNIFFER: Call csv.Sniffer().sniff() and .has_header() on fuzz str. +// Exercises the _csv C module's dialect detection paths. +static void op_csv_sniffer(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)1024)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef sniffer = PyObject_CallFunction(csv_Sniffer, NULL); + CHECK(sniffer); + + { + PyRef r = PyObject_CallMethod(sniffer, "sniff", "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(sniffer, "has_header", "O", + (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_CSV_WRITER: FDP selects variant — basic writerow, writerows, tab-delimited, +// DictWriter, QUOTE_ALL, QUOTE_NONNUMERIC. All write to StringIO. +// Exercises the _csv C module's writer paths. +static void op_csv_writer(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 5); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); + CHECK(sio); + + // Split string into words for row data. + PyRef words = PyObject_CallMethod(pystr, "split", NULL); + if (!words) { PyErr_Clear(); return; } + + // Ensure non-empty. 
+ if (PyList_Size(words) == 0) { + PyRef empty = PyUnicode_FromString(""); + PyList_Append(words, empty); + } + + switch (variant) { + case 0: { + // Basic writerow. + PyRef w = PyObject_CallFunction(csv_writer, "O", (PyObject *)sio); + CHECK(w); + PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); + break; + } + case 1: { + // writerows with lines. + PyRef lines = PyObject_CallMethod(pystr, "splitlines", NULL); + if (!lines) { PyErr_Clear(); break; } + PyRef rows = PyList_New(0); + CHECK(rows); + Py_ssize_t nlines = PyList_Size(lines); + for (Py_ssize_t i = 0; i < nlines && i < 20; i++) { + PyObject *line = PyList_GetItem(lines, i); + PyRef lwords = PyObject_CallMethod(line, "split", NULL); + if (!lwords) { PyErr_Clear(); continue; } + if (PyList_Size(lwords) == 0) { + PyRef e = PyUnicode_FromString(""); + PyList_Append(lwords, e); + } + PyList_Append(rows, lwords); + } + PyRef w = PyObject_CallFunction(csv_writer, "O", (PyObject *)sio); + CHECK(w); + PyRef r = PyObject_CallMethod(w, "writerows", "O", (PyObject *)rows); + break; + } + case 2: { + // Tab-delimited. + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef delim = PyUnicode_FromString("\t"); + CHECK(delim); + PyDict_SetItemString(kwargs, "delimiter", delim); + PyRef args = PyTuple_Pack(1, (PyObject *)sio); + CHECK(args); + PyRef w = PyObject_Call(csv_writer, args, kwargs); + CHECK(w); + PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); + break; + } + case 3: { + // DictWriter. + Py_ssize_t nwords = PyList_Size(words); + Py_ssize_t nfields = nwords < 8 ? 
nwords : 8; + if (nfields == 0) nfields = 1; + PyRef fieldnames = PyList_GetSlice(words, 0, nfields); + CHECK(fieldnames); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "fieldnames", fieldnames); + PyRef args = PyTuple_Pack(1, (PyObject *)sio); + CHECK(args); + PyRef dw = PyObject_Call(csv_DictWriter, args, kwargs); + CHECK(dw); + PyRef wh = PyObject_CallMethod(dw, "writeheader", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + // Build row dict. + PyRef row = PyDict_New(); + CHECK(row); + for (Py_ssize_t i = 0; i < nfields; i++) { + PyObject *fn = PyList_GetItem(fieldnames, i); + PyDict_SetItem(row, fn, pystr); + } + PyRef wr = PyObject_CallMethod(dw, "writerow", "O", (PyObject *)row); + break; + } + case 4: { + // QUOTE_ALL. + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "quoting", csv_QUOTE_ALL); + PyRef args = PyTuple_Pack(1, (PyObject *)sio); + CHECK(args); + PyRef w = PyObject_Call(csv_writer, args, kwargs); + CHECK(w); + PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); + break; + } + case 5: { + // QUOTE_NONNUMERIC. + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "quoting", csv_QUOTE_NONNUMERIC); + PyRef args = PyTuple_Pack(1, (PyObject *)sio); + CHECK(args); + PyRef w = PyObject_Call(csv_writer, args, kwargs); + CHECK(w); + PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); + + // Read result. + PyRef val = PyObject_CallMethod(sio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_EXPAT: FDP selects encoding and handler setup, then Parse or ParseFile. +// Exercises the pyexpat C module's XML parsing paths. 
+static void op_expat(FuzzedDataProvider &fdp) { + static const char *kEncodings[] = {"utf-8", "iso-8859-1", NULL}; + int enc_idx = fdp.ConsumeIntegralInRange(0, 2); + bool use_ns = fdp.ConsumeBool(); + bool set_handlers = fdp.ConsumeBool(); + bool use_parsefile = fdp.ConsumeBool(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)4096)); + + // Create parser. + PyRef parser; + if (use_ns) { + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyRef ns_sep = PyUnicode_FromString(" "); + CHECK(ns_sep); + PyDict_SetItemString(kwargs, "namespace_separator", ns_sep); + PyRef empty = PyTuple_New(0); + CHECK(empty); + parser = PyRef(PyObject_Call(expat_ParserCreate, empty, kwargs)); + } else if (kEncodings[enc_idx]) { + parser = PyRef(PyObject_CallFunction(expat_ParserCreate, "s", + kEncodings[enc_idx])); + } else { + parser = PyRef(PyObject_CallFunction(expat_ParserCreate, NULL)); + } + CHECK(parser); + + // Set handlers. + if (set_handlers) { + PyObject_SetAttrString(parser, "StartElementHandler", noop_handler); + PyObject_SetAttrString(parser, "EndElementHandler", noop_handler); + PyObject_SetAttrString(parser, "CharacterDataHandler", noop_handler); + PyObject_SetAttrString(parser, "ProcessingInstructionHandler", + noop_handler); + PyObject_SetAttrString(parser, "CommentHandler", noop_handler); + PyObject_SetAttrString(parser, "StartCdataSectionHandler", + noop_handler_noargs); + PyObject_SetAttrString(parser, "EndCdataSectionHandler", + noop_handler_noargs); + } + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + if (use_parsefile) { + // ParseFile(BytesIO(data)). + PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", (PyObject *)pydata); + CHECK(bio); + PyRef r = PyObject_CallMethod(parser, "ParseFile", "O", (PyObject *)bio); + } else { + // Parse(data, True). 
+ PyRef r = PyObject_CallMethod(parser, "Parse", "Oi", + (PyObject *)pydata, 1); + } + if (PyErr_Occurred()) PyErr_Clear(); + + // Optionally GetInputContext. + if (data.size() % 2 == 0) { + PyRef ctx = PyObject_CallMethod(parser, "GetInputContext", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_TIME: FDP selects variant — strftime with fuzz format, strptime with +// fuzz input, or strptime with fuzz format. Exercises the time C module. +static void op_time(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 2); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + switch (variant) { + case 0: { + // time.strftime(str, time.localtime()) + PyRef lt = PyObject_CallFunction(time_localtime, NULL); + CHECK(lt); + // Use non-empty format. + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + PyObject *fmt = slen > 0 ? (PyObject *)pystr : NULL; + if (!fmt) { + PyRef def_fmt = PyUnicode_FromString("%Y"); + CHECK(def_fmt); + PyRef r = PyObject_CallFunction(time_strftime, "OO", + (PyObject *)def_fmt, (PyObject *)lt); + } else { + PyRef r = PyObject_CallFunction(time_strftime, "OO", + fmt, (PyObject *)lt); + } + break; + } + case 1: { + // time.strptime(str, '%Y-%m-%d %H:%M:%S') + PyRef r = PyObject_CallFunction(time_strptime, "Os", + (PyObject *)pystr, + "%Y-%m-%d %H:%M:%S"); + break; + } + case 2: { + // time.strptime('2024-01-15 12:30:00', str) + // Use non-empty format. + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + PyObject *fmt = slen > 0 ? 
(PyObject *)pystr : NULL; + if (!fmt) { + PyRef def_fmt = PyUnicode_FromString("%Y-%m-%d %H:%M:%S"); + CHECK(def_fmt); + PyRef r = PyObject_CallFunction(time_strptime, "sO", + "2024-01-15 12:30:00", + (PyObject *)def_fmt); + } else { + PyRef r = PyObject_CallFunction(time_strptime, "sO", + "2024-01-15 12:30:00", fmt); + } + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_OPERATOR: FDP selects operator variant — comparisons, sequence ops, +// concat, getitem, methodcaller. Exercises the _operator C module. +static void op_operator(FuzzedDataProvider &fdp) { + int variant = fdp.ConsumeIntegralInRange(0, 5); + std::string data = fdp.ConsumeRemainingBytesAsString(); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + switch (variant) { + case 0: { + // Comparisons: lt/gt/eq/ne(data, data[::-1]) + PyRef rev = PyObject_CallMethod(pydata, "__class__", NULL); + // Build reversed bytes. + std::string rdata(data.rbegin(), data.rend()); + PyRef pyrev = PyBytes_FromStringAndSize(Y(rdata)); + CHECK(pyrev); + { + PyRef r = PyObject_CallFunction(op_lt, "OO", + (PyObject *)pydata, (PyObject *)pyrev); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_gt, "OO", + (PyObject *)pydata, (PyObject *)pyrev); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_eq, "OO", + (PyObject *)pydata, (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef empty = PyBytes_FromStringAndSize("", 0); + CHECK(empty); + PyRef r = PyObject_CallFunction(op_ne, "OO", + (PyObject *)pydata, (PyObject *)empty); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 1: { + // Sequence ops: contains, countOf, indexOf, length_hint + if (data.empty()) break; + PyRef byte_val = PyLong_FromLong((unsigned char)data[0]); + CHECK(byte_val); + { + PyRef r = PyObject_CallFunction(op_contains, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } 
+ { + PyRef r = PyObject_CallFunction(op_countOf, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_indexOf, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_length_hint, "O", + (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 2: { + // concat(data, data) + PyRef r = PyObject_CallFunction(op_concat, "OO", + (PyObject *)pydata, (PyObject *)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 3: { + // getitem(data, 0) + getitem(data, slice) + if (data.empty()) break; + PyRef zero = PyLong_FromLong(0); + CHECK(zero); + { + PyRef r = PyObject_CallFunction(op_getitem, "OO", + (PyObject *)pydata, (PyObject *)zero); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef half = PyLong_FromLong(data.size() / 2); + CHECK(half); + PyRef sl = PySlice_New(zero, half, NULL); + CHECK(sl); + PyRef r = PyObject_CallFunction(op_getitem, "OO", + (PyObject *)pydata, (PyObject *)sl); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 4: { + // methodcaller('upper')(str) + methodcaller('encode', 'utf-8')(str) + int str_enc = data.size() > 0 ? 
data[0] & 3 : 0; + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + { + PyRef mc = PyObject_CallFunction(op_methodcaller, "s", "upper"); + CHECK(mc); + PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef mc = PyObject_CallFunction(op_methodcaller, "ss", + "encode", "utf-8"); + CHECK(mc); + PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case 5: { + // contains on bytes with slice + if (data.empty()) break; + PyRef first = PyBytes_FromStringAndSize(data.data(), 1); + CHECK(first); + PyRef r = PyObject_CallFunction(op_contains, "OO", + (PyObject *)pydata, (PyObject *)first); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + } +} + +// OP_DIS_LOCALE: FDP selects — dis.dis(compile(str)), locale.strxfrm(str), +// locale.strcoll(str), or locale.getlocale(). Exercises _opcode via dis +// and _locale C module. +static void op_dis_locale(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + switch (variant) { + case 0: { + // dis.dis(compile(str, '', 'exec')) + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + PyObject *src = slen > 0 ? (PyObject *)pystr : NULL; + if (!src) { + PyRef def_src = PyUnicode_FromString("pass"); + CHECK(def_src); + src = def_src; + Py_INCREF(src); + } else { + Py_INCREF(src); + } + PyRef code = PyRef(Py_CompileString( + PyUnicode_AsUTF8(src), "", Py_file_input)); + Py_DECREF(src); + if (!code) { PyErr_Clear(); break; } + // Capture dis output to StringIO. 
+ PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); + CHECK(sio); + PyRef kwargs = PyDict_New(); + CHECK(kwargs); + PyDict_SetItemString(kwargs, "file", sio); + PyRef args = PyTuple_Pack(1, (PyObject *)code); + CHECK(args); + PyRef r = PyObject_Call(dis_dis, args, kwargs); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 1: { + // locale.strxfrm(str) + PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 2: { + // locale.strcoll(str[:mid], str[mid:]) + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + Py_ssize_t mid = slen / 2; + PyRef half1 = PyUnicode_Substring(pystr, 0, mid); + CHECK(half1); + PyRef half2 = PyUnicode_Substring(pystr, mid, slen); + CHECK(half2); + PyRef r = PyObject_CallFunction(locale_strcoll, "OO", + (PyObject *)half1, (PyObject *)half2); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 3: { + // locale.getlocale() + PyRef r = PyObject_CallFunction(locale_getlocale, NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + } +} + +// --------------------------------------------------------------------------- +// Dispatch. 
+// --------------------------------------------------------------------------- + +enum Op { + OP_JSON_ENCODE, + OP_CSV_SNIFFER, + OP_CSV_WRITER, + OP_EXPAT, + OP_TIME, + OP_OPERATOR, + OP_DIS_LOCALE, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_parsers(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_JSON_ENCODE: + op_json_encode(fdp); + break; + case OP_CSV_SNIFFER: + op_csv_sniffer(fdp); + break; + case OP_CSV_WRITER: + op_csv_writer(fdp); + break; + case OP_EXPAT: + op_expat(fdp); + break; + case OP_TIME: + op_time(fdp); + break; + case OP_OPERATOR: + op_operator(fdp); + break; + case OP_DIS_LOCALE: + op_dis_locale(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} diff --git a/module-fuzzers/fuzz_textops.cpp b/module-fuzzers/fuzz_textops.cpp new file mode 100644 index 0000000..03551e2 --- /dev/null +++ b/module-fuzzers/fuzz_textops.cpp @@ -0,0 +1,467 @@ +// fuzz_textops.cpp — Fuzzer for CPython's text-processing C extension modules. +// +// This fuzzer exercises the following CPython C extension modules via +// their Python API, called through the Python C API from C++: +// +// datetime — date/time/datetime.fromisoformat(), strptime(), +// strftime(), format() +// collections — _count_elements (Counter internals) +// unicodedata — category, bidirectional, numeric, decimal, +// combining, east_asian_width, mirrored, name, +// decomposition, normalize, is_normalized, lookup, +// ucd_3_2_0.normalize +// _io (StringIO) — write, seek, read, getvalue, readline, readlines, +// truncate, iteration +// +// The first byte of fuzz input selects one of 6 operation types. 
Each +// operation consumes further bytes via FuzzedDataProvider to parameterize +// the call (format selection, character range, normalization form). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. PyRef (RAII) prevents reference +// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +// --------------------------------------------------------------------------- +// Cached module objects, initialized once. +// --------------------------------------------------------------------------- + +// datetime +static PyObject *dt_date, *dt_time, *dt_datetime; + +// collections +static PyObject *collections_count_elements; + +// unicodedata +static PyObject *ud_category, *ud_bidirectional, *ud_normalize, *ud_numeric; +static PyObject *ud_lookup, *ud_name, *ud_decomposition, *ud_is_normalized; +static PyObject *ud_east_asian_width, *ud_mirrored, *ud_decimal, *ud_combining; +static PyObject *ud_ucd_3_2_0; + +// io +static PyObject *stringio_ctor; + +// struct +static PyObject *struct_unpack; + +static unsigned long gc_counter = 0; + +static int initialized = 0; + +static void init_textops(void) { + if (initialized) return; + + // datetime + dt_date = import_attr("datetime", "date"); + dt_time = import_attr("datetime", "time"); + dt_datetime = import_attr("datetime", "datetime"); + + // collections + collections_count_elements = import_attr("collections", "_count_elements"); + + // unicodedata + ud_category = import_attr("unicodedata", "category"); + ud_bidirectional = import_attr("unicodedata", "bidirectional"); + ud_normalize = import_attr("unicodedata", "normalize"); + ud_numeric = import_attr("unicodedata", "numeric"); + ud_lookup = import_attr("unicodedata", "lookup"); + ud_name = import_attr("unicodedata", "name"); + ud_decomposition = import_attr("unicodedata", "decomposition"); + ud_is_normalized = import_attr("unicodedata", 
"is_normalized"); + ud_east_asian_width = import_attr("unicodedata", "east_asian_width"); + ud_mirrored = import_attr("unicodedata", "mirrored"); + ud_decimal = import_attr("unicodedata", "decimal"); + ud_combining = import_attr("unicodedata", "combining"); + ud_ucd_3_2_0 = import_attr("unicodedata", "ucd_3_2_0"); + + // io + stringio_ctor = import_attr("io", "StringIO"); + + // struct + struct_unpack = import_attr("struct", "unpack"); + + // Suppress warnings. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// --------------------------------------------------------------------------- +// Operations (6 ops). +// --------------------------------------------------------------------------- + +// OP_DATETIME_PARSE: FDP selects variant — date/time/datetime.fromisoformat() +// or datetime.strptime() with a fuzz-chosen format string. Exercises the +// datetime C module's parsing paths. +static void op_datetime_parse(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 4); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + switch (variant) { + case 0: { + PyRef r = PyObject_CallMethod(dt_date, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 1: { + PyRef r = PyObject_CallMethod(dt_time, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 2: { + PyRef r = PyObject_CallMethod(dt_datetime, "fromisoformat", "O", + (PyObject *)pystr); + break; + } + case 3: { + PyRef fmt = PyUnicode_FromString("%Y-%m-%d %H:%M:%S"); + CHECK(fmt); + PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", + (PyObject *)pystr, (PyObject *)fmt); + break; + } + case 4: { + PyRef fmt = PyUnicode_FromString("%Y/%m/%dT%H:%M"); + CHECK(fmt); + PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", + 
(PyObject *)pystr, (PyObject *)fmt); + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_DATETIME_FORMAT: Unpack 6 shorts from first 12 bytes to build a valid +// datetime, then call strftime() with the remaining fuzz data as the format +// string. Exercises datetime formatting code paths. +static void op_datetime_format(FuzzedDataProvider &fdp) { + // Need at least 12 bytes for the datetime fields. + std::string header = fdp.ConsumeBytesAsString(12); + if (header.size() < 12) return; + + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string fmt_data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef fmt_str(fuzz_bytes_to_str(fmt_data, str_enc)); + CHECK(fmt_str); + + // Unpack 6 unsigned shorts via struct.unpack. + PyRef hdr_bytes = PyBytes_FromStringAndSize(header.data(), 12); + CHECK(hdr_bytes); + PyRef vals = PyObject_CallFunction(struct_unpack, "sO", "6H", + (PyObject *)hdr_bytes); + CHECK(vals); + + // Extract fields and clamp to valid ranges. + long v[6]; + for (int i = 0; i < 6; i++) { + PyObject *item = PyTuple_GetItem(vals, i); + v[i] = PyLong_AsLong(item); + } + long year = (v[0] % 9999) + 1; + long month = (v[1] % 12) + 1; + long day = (v[2] % 28) + 1; + long hour = v[3] % 24; + long minute = v[4] % 60; + long second = v[5] % 60; + + PyRef dt = PyObject_CallFunction(dt_datetime, "llllll", + year, month, day, hour, minute, second); + CHECK(dt); + + // strftime on datetime. + { + PyRef r = PyObject_CallMethod(dt, "strftime", "O", (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } + + // strftime on date. + { + PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); + if (date_obj) { + PyRef r = PyObject_CallMethod(date_obj, "strftime", "O", + (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + + // strftime on time. 
+ { + PyRef time_obj = PyObject_CallMethod(dt, "time", NULL); + if (time_obj) { + PyRef r = PyObject_CallMethod(time_obj, "strftime", "O", + (PyObject *)fmt_str); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } + + // format(date, str[:16]). + { + PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); + if (date_obj) { + // Cap format spec to 16 chars. + Py_ssize_t flen = PyUnicode_GET_LENGTH(fmt_str); + PyRef short_fmt = PyUnicode_Substring(fmt_str, 0, + flen < 16 ? flen : 16); + if (short_fmt) { + PyRef r = PyObject_Format(date_obj, short_fmt); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + } else { + PyErr_Clear(); + } + } +} + +// OP_COLLECTIONS_COUNT: Build a dict and call collections._count_elements() +// with a fuzz-generated string. Exercises the Counter internals C path. +static void op_collections_count(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef d = PyDict_New(); + CHECK(d); + PyRef r = PyObject_CallFunction(collections_count_elements, "OO", + (PyObject *)d, (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// OP_UNICODEDATA_CHARINFO: Convert data to str (cap 200 chars), then call +// per-character unicodedata functions. FDP selects which functions to call. +// Exercises the unicodedata C module character-info paths. 
+static void op_unicodedata_charinfo(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + uint8_t func_mask = fdp.ConsumeIntegral(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)800)); // ~200 chars max + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + Py_ssize_t len = PyUnicode_GET_LENGTH(pystr); + if (len > 200) len = 200; + + PyRef neg_one = PyLong_FromLong(-1); + CHECK(neg_one); + PyRef empty_str = PyUnicode_FromString(""); + CHECK(empty_str); + + for (Py_ssize_t i = 0; i < len; i++) { + PyRef ch = PyUnicode_Substring(pystr, i, i + 1); + if (!ch) { PyErr_Clear(); continue; } + + if (func_mask & 0x01) { + PyRef r = PyObject_CallFunction(ud_category, "O", (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x02) { + PyRef r = PyObject_CallFunction(ud_bidirectional, "O", (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x04) { + PyRef r = PyObject_CallFunction(ud_numeric, "OO", + (PyObject *)ch, (PyObject *)neg_one); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x08) { + PyRef r = PyObject_CallFunction(ud_decimal, "OO", + (PyObject *)ch, (PyObject *)neg_one); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x10) { + PyRef r = PyObject_CallFunction(ud_combining, "O", (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x20) { + PyRef r = PyObject_CallFunction(ud_east_asian_width, "O", + (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x40) { + PyRef r = PyObject_CallFunction(ud_mirrored, "O", (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (func_mask & 0x80) { + PyRef r = PyObject_CallFunction(ud_name, "OO", + (PyObject *)ch, (PyObject *)empty_str); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r2 = PyObject_CallFunction(ud_decomposition, "O", + (PyObject *)ch); + if (PyErr_Occurred()) PyErr_Clear(); + } + } +} + +// 
OP_UNICODEDATA_NORMALIZE: FDP selects normalization form from +// {NFC, NFD, NFKC, NFKD}, calls normalize() and is_normalized(). +// Optionally calls ucd_3_2_0.normalize() and lookup(). +static void op_unicodedata_normalize(FuzzedDataProvider &fdp) { + static const char *kForms[] = {"NFC", "NFD", "NFKC", "NFKD"}; + int form_idx = fdp.ConsumeIntegralInRange(0, 3); + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + bool try_ucd = fdp.ConsumeBool(); + bool try_lookup = fdp.ConsumeBool(); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + const char *form = kForms[form_idx]; + + // normalize(form, str) + { + PyRef r = PyObject_CallFunction(ud_normalize, "sO", + form, (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + + // is_normalized(form, str) + { + PyRef r = PyObject_CallFunction(ud_is_normalized, "sO", + form, (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + + // ucd_3_2_0.normalize('NFC', str) + if (try_ucd) { + PyRef r = PyObject_CallMethod(ud_ucd_3_2_0, "normalize", "sO", + "NFC", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + + // lookup(str) + if (try_lookup) { + PyRef r = PyObject_CallFunction(ud_lookup, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +// OP_STRINGIO: Create io.StringIO(), write fuzz str, then exercise +// read/readline/readlines/truncate/iteration. Exercises _io/stringio.c. +static void op_stringio(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + int variant = fdp.ConsumeIntegralInRange(0, 2); + std::string data = fdp.ConsumeBytesAsString( + std::min(fdp.remaining_bytes(), (size_t)10000)); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + + PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); + CHECK(sio); + + // Write the fuzz string. 
+ PyRef wr = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr); + if (!wr) { PyErr_Clear(); return; } + + // Seek to start. + PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0); + if (!sk) { PyErr_Clear(); return; } + + switch (variant) { + case 0: { + // read + getvalue + PyRef r1 = PyObject_CallMethod(sio, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r2 = PyObject_CallMethod(sio, "getvalue", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 1: { + // readline x3 + readlines + for (int i = 0; i < 3; i++) { + PyRef r = PyObject_CallMethod(sio, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(sio, "readlines", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case 2: { + // truncate + tell + iteration + Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); + long trunc_at = slen < 64 ? slen : 64; + PyRef tr = PyObject_CallMethod(sio, "truncate", "l", trunc_at); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef tl = PyObject_CallMethod(sio, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0); + if (PyErr_Occurred()) PyErr_Clear(); + // Iterate. + PyRef it = PyObject_GetIter(sio); + if (it) { + PyObject *line; + while ((line = PyIter_Next(it)) != NULL) + Py_DECREF(line); + if (PyErr_Occurred()) PyErr_Clear(); + } else { + PyErr_Clear(); + } + break; + } + } + + PyRef cl = PyObject_CallMethod(sio, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); +} + +// --------------------------------------------------------------------------- +// Dispatch. 
+// --------------------------------------------------------------------------- + +enum Op { + OP_DATETIME_PARSE, + OP_DATETIME_FORMAT, + OP_COLLECTIONS_COUNT, + OP_UNICODEDATA_CHARINFO, + OP_UNICODEDATA_NORMALIZE, + OP_STRINGIO, + NUM_OPS +}; + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_textops(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { + case OP_DATETIME_PARSE: + op_datetime_parse(fdp); + break; + case OP_DATETIME_FORMAT: + op_datetime_format(fdp); + break; + case OP_COLLECTIONS_COUNT: + op_collections_count(fdp); + break; + case OP_UNICODEDATA_CHARINFO: + op_unicodedata_charinfo(fdp); + break; + case OP_UNICODEDATA_NORMALIZE: + op_unicodedata_normalize(fdp); + break; + case OP_STRINGIO: + op_stringio(fdp); + break; + } + + if (++gc_counter % kGcInterval == 0) PyGC_Collect(); + return 0; +} From 22b7db38f090a8374a0cdeac14cafbca68f4df55 Mon Sep 17 00:00:00 2001 From: Adam Korczynski Date: Tue, 3 Mar 2026 15:57:53 +0000 Subject: [PATCH 2/2] break up fuzzers Signed-off-by: Adam Korczynski --- module-fuzzers/fuzz_crypto.cpp | 503 ------------ module-fuzzers/fuzz_dataops.cpp | 1166 --------------------------- module-fuzzers/fuzz_decode.cpp | 1029 ----------------------- module-fuzzers/fuzz_helpers.h | 27 +- module-fuzzers/fuzz_ioops.cpp | 1015 ----------------------- module-fuzzers/fuzz_locale.cpp | 73 ++ module-fuzzers/fuzz_mmap.cpp | 194 +++++ module-fuzzers/fuzz_operator.cpp | 190 +++++ module-fuzzers/fuzz_parsers.cpp | 744 ----------------- module-fuzzers/fuzz_pickle.cpp | 491 +++++++++++ module-fuzzers/fuzz_sqlite3.cpp | 530 ++++++++++++ module-fuzzers/fuzz_ssl.cpp | 84 ++ module-fuzzers/fuzz_textops.cpp | 467 ----------- module-fuzzers/fuzz_time.cpp | 97 +++ module-fuzzers/fuzz_unicodedata.cpp | 155 ++++ 15 files changed, 1836 insertions(+), 4929 
deletions(-) delete mode 100644 module-fuzzers/fuzz_crypto.cpp delete mode 100644 module-fuzzers/fuzz_dataops.cpp delete mode 100644 module-fuzzers/fuzz_decode.cpp delete mode 100644 module-fuzzers/fuzz_ioops.cpp create mode 100644 module-fuzzers/fuzz_locale.cpp create mode 100644 module-fuzzers/fuzz_mmap.cpp create mode 100644 module-fuzzers/fuzz_operator.cpp delete mode 100644 module-fuzzers/fuzz_parsers.cpp create mode 100644 module-fuzzers/fuzz_pickle.cpp create mode 100644 module-fuzzers/fuzz_sqlite3.cpp create mode 100644 module-fuzzers/fuzz_ssl.cpp delete mode 100644 module-fuzzers/fuzz_textops.cpp create mode 100644 module-fuzzers/fuzz_time.cpp create mode 100644 module-fuzzers/fuzz_unicodedata.cpp diff --git a/module-fuzzers/fuzz_crypto.cpp b/module-fuzzers/fuzz_crypto.cpp deleted file mode 100644 index 1ce4b9e..0000000 --- a/module-fuzzers/fuzz_crypto.cpp +++ /dev/null @@ -1,503 +0,0 @@ -// fuzz_crypto.cpp — Fuzzer for CPython's hash and HMAC C extension modules. -// -// This fuzzer exercises the following CPython C extension modules via -// their Python API, called through the Python C API from C++: -// -// _md5, _sha1, _sha2 — MD5, SHA-1, SHA-224/256/384/512 -// _sha3 — SHA3-224/256/384/512, SHAKE-128/256 -// _blake2 — BLAKE2b (64-byte key/16-byte salt/person), -// BLAKE2s (32-byte key/8-byte salt/person) -// _hmac — Low-level compute_md5/sha1/sha256/sha512 -// hmac (Python module) — hmac.new(), hmac.digest(), hmac.compare_digest() -// hashlib (Python module) — hashlib.new(), hashlib.pbkdf2_hmac(), -// hashlib.file_digest() -// -// The first byte of fuzz input selects one of 13 operation types. Each -// operation consumes further bytes via FuzzedDataProvider to parameterize -// the call (algorithm choice, key/salt/data sizes, action sequences). 
-//
-// Operations fall into two categories:
-//
-//   Chained — Create a hash/HMAC object, then loop up to 100 actions
-//   chosen from: .update(data), .digest(), .hexdigest(), .copy().digest(),
-//   and reading .name/.digest_size/.block_size attributes. Used for
-//   standard hashes, SHAKE (variable-length digest), BLAKE2 (keyed +
-//   variable digest_size), hmac.new(), and hashlib.new().
-//
-//   One-shot — A single function call: _hmac.compute_*(key, msg),
-//   hmac.digest(key, msg, algo), hmac.compare_digest(a, b),
-//   hashlib.file_digest(BytesIO, algo), hashlib.pbkdf2_hmac(algo, pw, salt, 1).
-//
-// All module functions and constructors are imported once during init and
-// cached as static PyObject* pointers. PyRef (RAII) prevents reference leaks.
-// PyGC_Collect() runs every 200 iterations. Max input size: 1 MB.
-
-#include "fuzz_helpers.h"
-
-// ---------------------------------------------------------------------------
-// Cached module objects, initialized once.
-// ---------------------------------------------------------------------------
-
-// Hash constructors, filled in by init_crypto().
-static PyObject *ctor_md5, *ctor_sha1;
-static PyObject *ctor_sha224, *ctor_sha256, *ctor_sha384, *ctor_sha512;
-static PyObject *ctor_sha3_224, *ctor_sha3_256, *ctor_sha3_384, *ctor_sha3_512;
-static PyObject *ctor_shake_128, *ctor_shake_256;
-static PyObject *ctor_blake2b, *ctor_blake2s;
-
-// Fixed-output constructors selectable by OP_HASH_CHAIN. Stored as pointers to
-// the statics above because those are still NULL when this table is built.
-static PyObject **all_hash_ctors[] = {
-    &ctor_md5, &ctor_sha1, &ctor_sha224, &ctor_sha256,
-    &ctor_sha384, &ctor_sha512, &ctor_sha3_224, &ctor_sha3_256,
-    &ctor_sha3_384, &ctor_sha3_512, &ctor_blake2b, &ctor_blake2s,
-};
-static constexpr int kNumHashCtors =
-    sizeof(all_hash_ctors) / sizeof(all_hash_ctors[0]);
-
-// Variable-output (XOF) constructors selectable by OP_SHAKE_CHAIN.
-static PyObject **shake_ctors[] = {&ctor_shake_128, &ctor_shake_256};
-static constexpr int kNumShakeCtors =
-    sizeof(shake_ctors) / sizeof(shake_ctors[0]);
-
-// _hmac.compute_* functions that were found at init time; only the first
-// num_hmac_compute_funcs slots are valid.
-static PyObject *hmac_compute_funcs[4];
-static int num_hmac_compute_funcs = 0;
-
-static PyObject *hashlib_new, *hashlib_pbkdf2_hmac, *hashlib_file_digest;
-static PyObject *py_hmac_new, *py_hmac_digest, *py_hmac_compare_digest;
-static PyObject *bytesio_ctor;
-
-// Digest names offered to hmac.new().
-static const char *kHmacAlgos[] = {
-    "md5", "sha224", "sha256", "sha384", "sha512", "sha3_256", "blake2s",
-};
-static constexpr int kNumHmacAlgos =
-    sizeof(kHmacAlgos) / sizeof(kHmacAlgos[0]);
-
-// Digest names offered to hashlib.pbkdf2_hmac().
-static const char *kPbkdf2Algos[] = {"sha1", "sha256", "sha512"};
-static constexpr int kNumPbkdf2Algos =
-    sizeof(kPbkdf2Algos) / sizeof(kPbkdf2Algos[0]);
-
-// Digest names offered to hashlib.new() and hashlib.file_digest().
-static const char *kHashlibAlgos[] = {"md5", "sha256", "sha3_256", "sha512"};
-static constexpr int kNumHashlibAlgos =
-    sizeof(kHashlibAlgos) / sizeof(kHashlibAlgos[0]);
-
-// Number of fuzz iterations so far; drives the periodic PyGC_Collect().
-static unsigned long gc_counter = 0;
-
-// Non-zero once init_crypto() has completed.
-static int initialized = 0;
-
-// Imports every constructor/function this fuzzer calls and caches it in the
-// statics above. Runs its body once; later calls return immediately.
-// import_attr() comes from fuzz_helpers.h; presumably it returns a new
-// reference to `module.attr` — TODO confirm its failure behaviour.
-static void init_crypto(void) {
-  if (initialized) return;
-
-  struct {
-    PyObject **dest;
-    const char *mod, *attr;
-  } inits[] = {
-      {&ctor_md5, "_md5", "md5"},
-      {&ctor_sha1, "_sha1", "sha1"},
-      {&ctor_sha224, "_sha2", "sha224"},
-      {&ctor_sha256, "_sha2", "sha256"},
-      {&ctor_sha384, "_sha2", "sha384"},
-      {&ctor_sha512, "_sha2", "sha512"},
-      {&ctor_sha3_224, "_sha3", "sha3_224"},
-      {&ctor_sha3_256, "_sha3", "sha3_256"},
-      {&ctor_sha3_384, "_sha3", "sha3_384"},
-      {&ctor_sha3_512, "_sha3", "sha3_512"},
-      {&ctor_shake_128, "_sha3", "shake_128"},
-      {&ctor_shake_256, "_sha3", "shake_256"},
-      {&ctor_blake2b, "_blake2", "blake2b"},
-      {&ctor_blake2s, "_blake2", "blake2s"},
-  };
-  for (auto &i : inits)
-    *i.dest = import_attr(i.mod, i.attr);
-
-  // _hmac is optional: a missing module or a missing compute_* function is
-  // tolerated and simply leaves fewer entries in hmac_compute_funcs.
-  PyObject *hmac_mod = PyImport_ImportModule("_hmac");
-  if (hmac_mod) {
-    const char *names[] = {
-        "compute_md5", "compute_sha1", "compute_sha256", "compute_sha512",
-    };
-    for (auto name : names) {
-      PyObject *fn = PyObject_GetAttrString(hmac_mod, name);
-      if (fn)
-        hmac_compute_funcs[num_hmac_compute_funcs++] = fn;
-      else
-        PyErr_Clear();
-    }
-    Py_DECREF(hmac_mod);
-  } else {
-    PyErr_Clear();
-  }
-
-  hashlib_new = import_attr("hashlib", "new");
-  hashlib_pbkdf2_hmac = import_attr("hashlib", "pbkdf2_hmac");
-  hashlib_file_digest = import_attr("hashlib", "file_digest");
-  py_hmac_new = import_attr("hmac", "new");
-  py_hmac_digest = import_attr("hmac", "digest");
-  py_hmac_compare_digest = import_attr("hmac", "compare_digest");
-  bytesio_ctor = import_attr("io", "BytesIO");
-
-  assert(!PyErr_Occurred());
-  initialized = 1;
-}
-
-// ---------------------------------------------------------------------------
-// Chained action loop — shared by OP_HASH_CHAIN, OP_SHAKE_CHAIN,
-// OP_BLAKE2*_KEYED, OP_BLAKE2*_VARDIGEST, OP_PYHMAC_CHAIN, and
-// OP_HASHLIB_CHAIN.
-//
-// Takes a borrowed reference to a hash-like object and loops up to 100
-// fuzz-driven actions: .update(data), .digest(), .hexdigest(),
-// .copy().digest(), and attribute reads (.name, .digest_size, .block_size).
-//
-// The loop also stops when the input is exhausted. CHECK() comes from
-// fuzz_helpers.h; presumably it leaves this function when its argument is
-// NULL — TODO confirm.
-// ---------------------------------------------------------------------------
-
-static void chain_hash_actions(PyObject *h, FuzzedDataProvider &fdp) {
-  for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
-    switch (fdp.ConsumeIntegralInRange<int>(0, 4)) {
-      case 0: {  // .update(data)
-        // The bounds are an int literal and a size_t, so T cannot be deduced
-        // for ConsumeIntegralInRange; name it explicitly.
-        std::string data = fdp.ConsumeBytesAsString(
-            fdp.ConsumeIntegralInRange<size_t>(
-                0, std::min(fdp.remaining_bytes(), (size_t)10000)));
-        PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
-        CHECK(r);
-        break;
-      }
-      case 1: {
-        PyRef d = PyObject_CallMethod(h, "digest", NULL);
-        CHECK(d);
-        break;
-      }
-      case 2: {
-        PyRef d = PyObject_CallMethod(h, "hexdigest", NULL);
-        CHECK(d);
-        break;
-      }
-      case 3: {  // .copy().digest()
-        PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
-        CHECK(h2);
-        PyRef d = PyObject_CallMethod(h2, "digest", NULL);
-        CHECK(d);
-        break;
-      }
-      case 4: {  // .name, .digest_size, .block_size
-        PyRef n = PyObject_GetAttrString(h, "name");
-        CHECK(n);
-        PyRef ds = PyObject_GetAttrString(h, "digest_size");
-        CHECK(ds);
-        PyRef bs = PyObject_GetAttrString(h, "block_size");
-        CHECK(bs);
-        break;
-      }
-    }
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// ---------------------------------------------------------------------------
-// Operations (13 ops).
-// ---------------------------------------------------------------------------
-
-// OP_HASH_CHAIN: Create a hash object from one of 12 C module constructors
-// (_md5.md5, _sha1.sha1, _sha2.sha224/256/384/512, _sha3.sha3_224/256/384/512,
-// _blake2.blake2b/s) with fuzz-chosen initial data, then run chained actions.
-//
-// ctor: constructor to call with the initial bytes (chosen by the caller).
-// fdp:  fuzz input; up to 10000 bytes become the initial data, the rest is
-//       consumed by chain_hash_actions().
-static void op_hash_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
-  std::string init = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange<size_t>(0, 10000));
-  PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
-  CHECK(h);
-  chain_hash_actions(h, fdp);
-}
-
-// OP_SHAKE_CHAIN: Create a SHAKE-128 or SHAKE-256 XOF object, then loop
-// up to 100 actions: .update(data), .digest(variable_length), or
-// .copy().digest(variable_length). Exercises the variable-output-length
-// code paths in _sha3.
-//
-// ctor: SHAKE constructor to call with the initial bytes.
-// fdp:  fuzz input; the loop also stops once it is exhausted. Requested
-//       digest lengths are in [1, 10000].
-static void op_shake_chain(PyObject *ctor, FuzzedDataProvider &fdp) {
-  std::string init = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange<size_t>(0, 10000));
-  PyRef h = PyObject_CallFunction(ctor, "y#", Y(init));
-  CHECK(h);
-  for (int i = 0; fdp.remaining_bytes() > 0 && i < 100; i++) {
-    switch (fdp.ConsumeIntegralInRange<int>(0, 2)) {
-      case 0: {  // .update(data)
-        // The bounds are an int literal and a size_t, so T cannot be deduced
-        // for ConsumeIntegralInRange; name it explicitly.
-        std::string data = fdp.ConsumeBytesAsString(
-            fdp.ConsumeIntegralInRange<size_t>(
-                0, std::min(fdp.remaining_bytes(), (size_t)10000)));
-        PyRef r = PyObject_CallMethod(h, "update", "y#", Y(data));
-        CHECK(r);
-        break;
-      }
-      case 1: {  // .digest(len)
-        int len = fdp.ConsumeIntegralInRange<int>(1, 10000);
-        PyRef d = PyObject_CallMethod(h, "digest", "i", len);
-        CHECK(d);
-        break;
-      }
-      case 2: {  // .copy().digest(len)
-        PyRef h2 = PyObject_CallMethod(h, "copy", NULL);
-        CHECK(h2);
-        int len = fdp.ConsumeIntegralInRange<int>(1, 10000);
-        PyRef d = PyObject_CallMethod(h2, "digest", "i", len);
-        CHECK(d);
-        break;
-      }
-    }
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_BLAKE2B_KEYED / OP_BLAKE2S_KEYED: Create a BLAKE2 object with
-// fuzz-chosen key, salt, and person parameters (up to max_key/max_salt/
-// max_person bytes respectively), then run chained hash actions.
-// BLAKE2b: key<=64, salt<=16, person<=16. BLAKE2s: key<=32, salt<=8, person<=8.
-//
-// ctor:       BLAKE2 constructor, called as ctor(data, key=, salt=, person=).
-// max_key, max_salt, max_person: upper bounds for the lengths drawn from fdp.
-// fdp:        fuzz input; key, salt, person and data are consumed in that
-//             order, the rest goes to chain_hash_actions().
-static void op_blake2_keyed(PyObject *ctor, int max_key, int max_salt,
-                            int max_person, FuzzedDataProvider &fdp) {
-  std::string key = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, max_key));
-  std::string salt = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, max_salt));
-  std::string person = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, max_person));
-  std::string data = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, 10000));
-
-  PyRef kwargs = PyDict_New();
-  CHECK(kwargs);
-  PyRef k = PyBytes_FromStringAndSize(Y(key));
-  CHECK(k);
-  PyRef s = PyBytes_FromStringAndSize(Y(salt));
-  CHECK(s);
-  PyRef p = PyBytes_FromStringAndSize(Y(person));
-  CHECK(p);
-  // A failed insert leaves an exception set; bail out instead of calling the
-  // constructor with an error already pending.
-  if (PyDict_SetItemString(kwargs, "key", k) < 0 ||
-      PyDict_SetItemString(kwargs, "salt", s) < 0 ||
-      PyDict_SetItemString(kwargs, "person", p) < 0) {
-    PyErr_Clear();
-    return;
-  }
-
-  PyRef d = PyBytes_FromStringAndSize(Y(data));
-  CHECK(d);
-  PyRef args = PyTuple_Pack(1, (PyObject *)d);
-  CHECK(args);
-  PyRef h = PyObject_Call(ctor, args, kwargs);
-  CHECK(h);
-  chain_hash_actions(h, fdp);
-}
-
-// OP_BLAKE2B_VARDIGEST / OP_BLAKE2S_VARDIGEST: Create a BLAKE2 object with
-// a fuzz-chosen digest_size (1 to max_ds bytes), then run chained actions.
-// Exercises the variable output length code path in _blake2.
-// ctor:   BLAKE2 constructor, called as ctor(data, digest_size=ds).
-// max_ds: largest digest_size that may be requested.
-// fdp:    fuzz input; digest_size and data are consumed in that order, the
-//         rest goes to chain_hash_actions().
-static void op_blake2_vardigest(PyObject *ctor, int max_ds,
-                                FuzzedDataProvider &fdp) {
-  int ds = fdp.ConsumeIntegralInRange(1, max_ds);
-  std::string data = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, 10000));
-
-  PyRef kwargs = PyDict_New();
-  CHECK(kwargs);
-  PyRef dsobj = PyLong_FromLong(ds);
-  CHECK(dsobj);
-  // A failed insert leaves an exception set; bail out instead of calling the
-  // constructor with an error already pending.
-  if (PyDict_SetItemString(kwargs, "digest_size", dsobj) < 0) {
-    PyErr_Clear();
-    return;
-  }
-
-  PyRef d = PyBytes_FromStringAndSize(Y(data));
-  CHECK(d);
-  PyRef args = PyTuple_Pack(1, (PyObject *)d);
-  CHECK(args);
-  PyRef h = PyObject_Call(ctor, args, kwargs);
-  CHECK(h);
-  chain_hash_actions(h, fdp);
-}
-
-// OP_HMAC_COMPUTE: One-shot call to one of _hmac.compute_md5/sha1/sha256/sha512
-// with fuzz-chosen key and message. These are the low-level C implementations
-// of HMAC in the _hmac module (not the Python hmac wrapper).
-//
-// func: the compute_* function to call (chosen by the caller).
-// fdp:  fuzz input; the key is consumed first (never empty: a single zero
-//       byte is substituted), everything left is the message.
-static void op_hmac_compute(PyObject *func, FuzzedDataProvider &fdp) {
-  std::string key = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(1, 10000));
-  if (key.empty()) key.push_back('\x00');
-  std::string msg = fdp.ConsumeRemainingBytesAsString();
-  PyRef r = PyObject_CallFunction(func, "y#y#", Y(key), Y(msg));
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_PYHMAC_CHAIN: Create an HMAC object via hmac.new(key, digestmod=algo)
-// where algo is fuzz-chosen from {md5, sha224, sha256, sha384, sha512,
-// sha3_256, blake2s}, then run chained hash actions (update/digest/copy/etc).
-// Exercises the Python hmac module which delegates to C hash constructors.
-// algo: digest name passed as digestmod (chosen by the caller).
-// fdp:  fuzz input; the key is consumed first (never empty: a single zero
-//       byte is substituted), the rest goes to chain_hash_actions().
-static void op_pyhmac_chain(const char *algo, FuzzedDataProvider &fdp) {
-  std::string key = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(1, 10000));
-  if (key.empty()) key.push_back('\x00');
-
-  PyRef kwargs = PyDict_New();
-  CHECK(kwargs);
-  PyRef dm = PyUnicode_FromString(algo);
-  CHECK(dm);
-  // A failed insert leaves an exception set; bail out instead of calling
-  // hmac.new() with an error already pending.
-  if (PyDict_SetItemString(kwargs, "digestmod", dm) < 0) {
-    PyErr_Clear();
-    return;
-  }
-  PyRef kb = PyBytes_FromStringAndSize(Y(key));
-  CHECK(kb);
-  PyRef args = PyTuple_Pack(1, (PyObject *)kb);
-  CHECK(args);
-  PyRef h = PyObject_Call(py_hmac_new, args, kwargs);
-  CHECK(h);
-  chain_hash_actions(h, fdp);
-}
-
-// OP_HMAC_DIGEST: One-shot call to hmac.digest(key, msg, "sha256").
-// Exercises the fast single-call HMAC path without creating an HMAC object.
-//
-// fdp: fuzz input; the key is consumed first (never empty), everything left
-//      is the message.
-static void op_hmac_digest(FuzzedDataProvider &fdp) {
-  std::string key = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(1, 10000));
-  if (key.empty()) key.push_back('\x00');
-  std::string msg = fdp.ConsumeRemainingBytesAsString();
-  PyRef r = PyObject_CallFunction(py_hmac_digest, "y#y#s",
-                                  Y(key), Y(msg), "sha256");
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_HMAC_COMPARE: Compute HMAC-SHA256 of fuzz data, then call
-// hmac.compare_digest() against a zero-padded 32-byte buffer derived from
-// the same data. Exercises the constant-time comparison code path.
-//
-// fdp: fuzz input; all remaining bytes are used as the message. The HMAC key
-//      is the fixed one-byte value b"k".
-static void op_hmac_compare(FuzzedDataProvider &fdp) {
-  std::string data = fdp.ConsumeRemainingBytesAsString();
-  // The key must be bytes: hmac.new() raises TypeError for a str key, which
-  // would end this operation before compare_digest() is ever reached. Format
-  // unit "y" builds a bytes object from the C string.
-  PyRef h = PyObject_CallFunction(py_hmac_new, "yy#s",
-                                  "k", Y(data), "sha256");
-  CHECK(h);
-  PyRef dig = PyObject_CallMethod(h, "digest", NULL);
-  CHECK(dig);
-  // First 32 bytes of the input, zero-filled when the input is shorter.
-  char padded[32] = {};
-  memcpy(padded, data.data(), data.size() < 32 ? data.size() : 32);
-  PyRef padobj = PyBytes_FromStringAndSize(padded, 32);
-  CHECK(padobj);
-  PyRef r = PyObject_CallFunction(py_hmac_compare_digest, "OO",
-                                  (PyObject *)dig, (PyObject *)padobj);
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_HASHLIB_CHAIN: Create a hash object via hashlib.new(algo, data,
-// usedforsecurity=False) where algo is fuzz-chosen from {md5, sha256,
-// sha3_256, sha512}, then run chained actions. Unlike OP_HASH_CHAIN which
-// uses the C module constructors directly, this goes through hashlib's
-// dispatch logic (OpenSSL vs builtin).
-//
-// algo: digest name (chosen by the caller).
-// fdp:  fuzz input; up to 10000 bytes become the initial data, the rest goes
-//       to chain_hash_actions().
-static void op_hashlib_chain(const char *algo, FuzzedDataProvider &fdp) {
-  std::string init = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(0, 10000));
-  PyRef kwargs = PyDict_New();
-  CHECK(kwargs);
-  // A failed insert leaves an exception set; bail out instead of calling
-  // hashlib.new() with an error already pending.
-  if (PyDict_SetItemString(kwargs, "usedforsecurity", Py_False) < 0) {
-    PyErr_Clear();
-    return;
-  }
-  PyRef name = PyUnicode_FromString(algo);
-  CHECK(name);
-  PyRef d = PyBytes_FromStringAndSize(Y(init));
-  CHECK(d);
-  PyRef args = PyTuple_Pack(2, (PyObject *)name, (PyObject *)d);
-  CHECK(args);
-  PyRef h = PyObject_Call(hashlib_new, args, kwargs);
-  CHECK(h);
-  chain_hash_actions(h, fdp);
-}
-
-// OP_HASHLIB_FILE_DIGEST: One-shot call to hashlib.file_digest(BytesIO(data),
-// algo) with fuzz-chosen algorithm, then .hexdigest(). Exercises the
-// file-based hashing path that reads from a file-like object.
-//
-// algo: digest name (chosen by the caller).
-// fdp:  fuzz input; all remaining bytes become the BytesIO contents.
-static void op_hashlib_file_digest(const char *algo, FuzzedDataProvider &fdp) {
-  std::string data = fdp.ConsumeRemainingBytesAsString();
-  PyRef bio = PyObject_CallFunction(bytesio_ctor, "y#", Y(data));
-  CHECK(bio);
-  PyRef h = PyObject_CallFunction(hashlib_file_digest, "Os",
-                                  (PyObject *)bio, algo);
-  CHECK(h);
-  PyRef r = PyObject_CallMethod(h, "hexdigest", NULL);
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_PBKDF2: One-shot call to hashlib.pbkdf2_hmac(algo, password, salt, 1)
-// with fuzz-chosen algorithm from {sha1, sha256, sha512}.
-// Uses 1 iteration
-// to keep execution fast while still exercising the PBKDF2 code path.
-//
-// algo: digest name (chosen by the caller).
-// fdp:  fuzz input; the salt is consumed first (never empty: a single zero
-//       byte is substituted), everything left is the password.
-static void op_pbkdf2(const char *algo, FuzzedDataProvider &fdp) {
-  std::string salt = fdp.ConsumeBytesAsString(
-      fdp.ConsumeIntegralInRange(1, 10000));
-  if (salt.empty()) salt.push_back('\x00');
-  std::string pw = fdp.ConsumeRemainingBytesAsString();
-  PyRef r = PyObject_CallFunction(hashlib_pbkdf2_hmac, "sy#y#i",
-                                  algo, Y(pw), Y(salt), 1);
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// ---------------------------------------------------------------------------
-// Dispatch.
-// ---------------------------------------------------------------------------
-
-// Operation selectors. LLVMFuzzerTestOneInput maps the first integer it
-// consumes onto [0, NUM_OPS - 1]; NUM_OPS is the count sentinel and has no
-// case of its own.
-enum Op {
-  OP_HASH_CHAIN,
-  OP_SHAKE_CHAIN,
-  OP_BLAKE2B_KEYED,
-  OP_BLAKE2S_KEYED,
-  OP_BLAKE2B_VARDIGEST,
-  OP_BLAKE2S_VARDIGEST,
-  OP_HMAC_COMPUTE,
-  OP_PYHMAC_CHAIN,
-  OP_HMAC_DIGEST,
-  OP_HMAC_COMPARE,
-  OP_HASHLIB_CHAIN,
-  OP_HASHLIB_FILE_DIGEST,
-  OP_PBKDF2,
-  NUM_OPS
-};
-
-// libFuzzer entry point: picks one operation (and, where applicable, one
-// constructor or algorithm) from the input and runs it.
-//
-// data/size: raw fuzz input. Inputs that are empty or longer than
-//            kMaxInputSize (declared outside this file section) are ignored.
-// Always returns 0.
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
-  assert(Py_IsInitialized());
-  init_crypto();
-  if (size < 1 || size > kMaxInputSize) return 0;
-  // Start from a clean error indicator so an exception left pending by an
-  // earlier call cannot leak into this one.
-  if (PyErr_Occurred()) PyErr_Clear();
-
-  FuzzedDataProvider fdp(data, size);
-  switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) {
-    case OP_HASH_CHAIN: {
-      int ci = fdp.ConsumeIntegralInRange(0, kNumHashCtors - 1);
-      op_hash_chain(*all_hash_ctors[ci], fdp);
-      break;
-    }
-    case OP_SHAKE_CHAIN: {
-      int ci = fdp.ConsumeIntegralInRange(0, kNumShakeCtors - 1);
-      op_shake_chain(*shake_ctors[ci], fdp);
-      break;
-    }
-    // Length limits below are (key, salt, person) for the keyed variants and
-    // the maximum digest_size for the vardigest variants.
-    case OP_BLAKE2B_KEYED:
-      op_blake2_keyed(ctor_blake2b, 64, 16, 16, fdp);
-      break;
-    case OP_BLAKE2S_KEYED:
-      op_blake2_keyed(ctor_blake2s, 32, 8, 8, fdp);
-      break;
-    case OP_BLAKE2B_VARDIGEST:
-      op_blake2_vardigest(ctor_blake2b, 64, fdp);
-      break;
-    case OP_BLAKE2S_VARDIGEST:
-      op_blake2_vardigest(ctor_blake2s, 32, fdp);
-      break;
-    case OP_HMAC_COMPUTE:
-      // Nothing to call when no _hmac.compute_* function was found at init.
-      if (num_hmac_compute_funcs > 0) {
-        int fi = fdp.ConsumeIntegralInRange(
-            0, num_hmac_compute_funcs - 1);
-        op_hmac_compute(hmac_compute_funcs[fi], fdp);
-      }
-      break;
-    case OP_PYHMAC_CHAIN: {
-      int ai = fdp.ConsumeIntegralInRange(0, kNumHmacAlgos - 1);
-      op_pyhmac_chain(kHmacAlgos[ai], fdp);
-      break;
-    }
-    case OP_HMAC_DIGEST:
-      op_hmac_digest(fdp);
-      break;
-    case OP_HMAC_COMPARE:
-      op_hmac_compare(fdp);
-      break;
-    case OP_HASHLIB_CHAIN: {
-      int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
-      op_hashlib_chain(kHashlibAlgos[ai], fdp);
-      break;
-    }
-    case OP_HASHLIB_FILE_DIGEST: {
-      int ai = fdp.ConsumeIntegralInRange(0, kNumHashlibAlgos - 1);
-      op_hashlib_file_digest(kHashlibAlgos[ai], fdp);
-      break;
-    }
-    case OP_PBKDF2: {
-      int ai = fdp.ConsumeIntegralInRange(0, kNumPbkdf2Algos - 1);
-      op_pbkdf2(kPbkdf2Algos[ai], fdp);
-      break;
-    }
-  }
-
-  // Force a garbage collection on every kGcInterval-th call.
-  if (++gc_counter % kGcInterval == 0) PyGC_Collect();
-  return 0;
-}
diff --git a/module-fuzzers/fuzz_dataops.cpp b/module-fuzzers/fuzz_dataops.cpp
deleted file mode 100644
index 10afe04..0000000
--- a/module-fuzzers/fuzz_dataops.cpp
+++ /dev/null
@@ -1,1166 +0,0 @@
-// fuzz_dataops.cpp — Fuzzer for CPython's data-structure C extension modules.
-// -// This fuzzer exercises the following CPython C extension modules via -// their Python API, called through the Python C API from C++: -// -// array — array(typecode) with frombytes, tobytes, tolist, -// reverse, byteswap, append, extend, pop, count, -// index, insert, remove, buffer_info, __sizeof__, -// __contains__, __iter__, slice ops, comparison, -// concatenation, repetition, fromlist -// _ctypes — c_char/c_int/c_double.from_buffer_copy(), -// create_string_buffer, (c_char*N).from_buffer_copy, -// Structure.from_buffer_copy -// mmap — anonymous mmap: write, find, rfind, read, readline, -// seek, resize, move, getitem, setitem, flush, size, -// tell, close, context manager -// _locale — strxfrm, strcoll -// _dbm — dbm.open, write, read, keys, delete, iteration -// _sqlite3 — connect(':memory:'), execute, executemany, -// executescript, complete_statement, create_function, -// create_aggregate, set_authorizer, create_collation, -// Row factory, blobopen, register_adapter -// -// The first byte of fuzz input selects one of 9 operation types. Each -// operation consumes further bytes via FuzzedDataProvider to parameterize -// the call (typecode, sub-operation, SQL, key/value splits). -// -// All module functions and class constructors are imported once during init -// and cached as static PyObject* pointers. Two helper classes (Structure -// subclass, Aggregate class) are defined via PyRun_String at init time. -// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200 -// iterations. Max input size: 64 KB. - -#include "fuzz_helpers.h" - -// --------------------------------------------------------------------------- -// Cached module objects, initialized once. 
-// ---------------------------------------------------------------------------
-
-// array
-static PyObject *array_array;
-
-// ctypes
-static PyObject *ct_c_char, *ct_c_int, *ct_c_double;
-static PyObject *ct_create_string_buffer, *ct_sizeof;
-static PyObject *ct_Structure_cls;
-
-// mmap
-static PyObject *mmap_mmap;
-
-// locale
-static PyObject *locale_strxfrm, *locale_strcoll;
-
-// dbm
-static PyObject *dbm_open;
-
-// sqlite3
-static PyObject *sqlite3_connect, *sqlite3_complete_statement;
-static PyObject *sqlite3_register_adapter, *sqlite3_Row;
-static long sqlite3_SQLITE_OK_val;
-static PyObject *sqlite3_Aggregate_cls;
-
-// Number of fuzz iterations so far; drives the periodic PyGC_Collect().
-static unsigned long gc_counter = 0;
-
-// Non-zero once init_dataops() has completed.
-static int initialized = 0;
-
-// Executes `source` (as Py_file_input) in a fresh globals dict that only has
-// __builtins__ set, and returns a new reference to the object the code bound
-// to `name`. Any failure aborts the process, since the fuzzer cannot run
-// without its helper classes.
-static PyObject *exec_and_fetch(const char *source, const char *name) {
-  PyObject *globals = PyDict_New();
-  if (!globals ||
-      PyDict_SetItemString(globals, "__builtins__",
-                           PyEval_GetBuiltins()) < 0) {
-    PyErr_Print();
-    abort();
-  }
-  PyObject *r = PyRun_String(source, Py_file_input, globals, globals);
-  if (!r) { PyErr_Print(); abort(); }
-  Py_DECREF(r);
-  // Borrowed reference; take our own before the dict is released.
-  PyObject *obj = PyDict_GetItemString(globals, name);
-  if (!obj) abort();
-  Py_INCREF(obj);
-  Py_DECREF(globals);
-  return obj;
-}
-
-// Imports every constructor/function this fuzzer calls, defines the two
-// helper classes, and caches everything in the statics above. Runs its body
-// once; later calls return immediately.
-// import_attr() comes from fuzz_helpers.h; presumably it returns a new
-// reference to `module.attr` — TODO confirm its failure behaviour.
-static void init_dataops(void) {
-  if (initialized) return;
-
-  // array
-  array_array = import_attr("array", "array");
-
-  // ctypes
-  ct_c_char = import_attr("ctypes", "c_char");
-  ct_c_int = import_attr("ctypes", "c_int");
-  ct_c_double = import_attr("ctypes", "c_double");
-  ct_create_string_buffer = import_attr("ctypes", "create_string_buffer");
-  ct_sizeof = import_attr("ctypes", "sizeof");
-
-  // ctypes Structure subclass.
-  ct_Structure_cls = exec_and_fetch(
-      "import ctypes\n"
-      "class _S(ctypes.Structure):\n"
-      " _fields_ = [('a', ctypes.c_int), ('b', ctypes.c_double)]\n",
-      "_S");
-
-  // mmap
-  mmap_mmap = import_attr("mmap", "mmap");
-
-  // locale
-  locale_strxfrm = import_attr("locale", "strxfrm");
-  locale_strcoll = import_attr("locale", "strcoll");
-
-  // dbm
-  dbm_open = import_attr("dbm", "open");
-
-  // sqlite3
-  sqlite3_connect = import_attr("sqlite3", "connect");
-  sqlite3_complete_statement = import_attr("sqlite3", "complete_statement");
-  sqlite3_register_adapter = import_attr("sqlite3", "register_adapter");
-  sqlite3_Row = import_attr("sqlite3", "Row");
-  {
-    PyObject *v = import_attr("sqlite3", "SQLITE_OK");
-    // Fail loudly rather than dereferencing NULL below.
-    if (!v) { PyErr_Print(); abort(); }
-    sqlite3_SQLITE_OK_val = PyLong_AsLong(v);
-    Py_DECREF(v);
-  }
-
-  // Aggregate class for sqlite3.
-  sqlite3_Aggregate_cls = exec_and_fetch(
-      "class _Agg:\n"
-      " def __init__(self): self.vals = []\n"
-      " def step(self, v): self.vals.append(v)\n"
-      " def finalize(self): return len(self.vals)\n",
-      "_Agg");
-
-  // Suppress warnings.
-  PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
-
-  assert(!PyErr_Occurred());
-  initialized = 1;
-}
-
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-
-// Item sizes for array typecodes.
-// Returns the size in bytes of one element for array typecode `tc`, taken
-// from the C type the typecode stands for. 'l'/'L' map to C long, whose size
-// is platform dependent (8 bytes on LP64), so it must not be hard-coded.
-// Unknown typecodes yield 1.
-static int typecode_itemsize(char tc) {
-  switch (tc) {
-    case 'b': case 'B': return 1;
-    case 'H': return (int)sizeof(unsigned short);
-    case 'i': case 'I': return (int)sizeof(int);
-    case 'l': case 'L': return (int)sizeof(long);
-    case 'f': return (int)sizeof(float);
-    case 'd': return (int)sizeof(double);
-    case 'q': case 'Q': return (int)sizeof(long long);
-    default: return 1;
-  }
-}
-
-// Create an array with the given typecode and aligned data.
-//
-// tc:   array typecode.
-// data: raw bytes; truncated to a whole number of items and zero-padded up to
-//       one item when shorter than that, so the array holds at least one
-//       element.
-// Returns a new reference, or NULL if the array could not be created or
-// filled. A frombytes() failure is cleared here; the other failure paths
-// leave the Python exception set.
-static PyObject *make_array(char tc, const std::string &data) {
-  int item_sz = typecode_itemsize(tc);
-  size_t aligned_len = (data.size() / item_sz) * item_sz;
-  if (aligned_len == 0) aligned_len = item_sz;
-
-  char tc_str[2] = {tc, '\0'};
-  PyObject *arr = PyObject_CallFunction(array_array, "s", tc_str);
-  if (!arr) return NULL;
-
-  // frombytes with aligned data.
-  std::string aligned = data.substr(0, aligned_len);
-  if (aligned.size() < (size_t)item_sz) {
-    aligned.resize(item_sz, '\0');
-  }
-  PyRef pydata = PyBytes_FromStringAndSize(aligned.data(), aligned.size());
-  if (!pydata) { Py_DECREF(arr); return NULL; }
-  PyRef r = PyObject_CallMethod(arr, "frombytes", "O", (PyObject *)pydata);
-  if (!r) { PyErr_Clear(); Py_DECREF(arr); return NULL; }
-  return arr;
-}
-
-// ---------------------------------------------------------------------------
-// Operations (9 ops).
-// ---------------------------------------------------------------------------
-
-// OP_ARRAY_FROMBYTES: FDP selects typecode, creates array from aligned fuzz
-// data, then calls tobytes/tolist/reverse/byteswap. Exercises the array C
-// module's core buffer and conversion operations.
-// fdp: fuzz input; one selector picks the typecode, then up to 10000 bytes
-//      become the array contents.
-static void op_array_frombytes(FuzzedDataProvider &fdp) {
-  // Eleven typecodes, indexed 0..10 by the selector below.
-  static const char kTypecodes[] = "bBHiIlLfdqQ";
-  char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-
-  PyRef arr(make_array(tc, data));
-  CHECK(arr);
-
-  // Each call is independent: its result is dropped and any exception is
-  // cleared so the next call still runs.
-  {
-    PyRef r = PyObject_CallMethod(arr, "tobytes", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-  {
-    PyRef r = PyObject_CallMethod(arr, "tolist", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-  {
-    PyRef r = PyObject_CallMethod(arr, "reverse", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-  {
-    PyRef r = PyObject_CallMethod(arr, "byteswap", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-}
-
-// OP_ARRAY_METHODS: FDP selects typecode, creates array, then exercises
-// append/extend/pop/count/index/insert/remove/buffer_info/__sizeof__/
-// __contains__/len. Exercises the array C module's element ops.
-// NOTE(review): earlier wording also listed __iter__, but this function never
-// iterates over the array.
-//
-// fdp: fuzz input; one selector picks the typecode, then up to 10000 bytes
-//      become the array contents.
-static void op_array_methods(FuzzedDataProvider &fdp) {
-  // Eleven typecodes, indexed 0..10 by the selector below.
-  static const char kTypecodes[] = "bBHiIlLfdqQ";
-  char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-
-  PyRef arr(make_array(tc, data));
-  CHECK(arr);
-
-  // append(0)
-  {
-    PyRef zero = PyLong_FromLong(0);
-    CHECK(zero);
-    PyRef r = PyObject_CallMethod(arr, "append", "O", (PyObject *)zero);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // extend with a slice.
-  {
-    PyRef slice = PySequence_GetSlice(arr, 0, 1);
-    if (slice) {
-      PyRef r = PyObject_CallMethod(arr, "extend", "O", (PyObject *)slice);
-      if (PyErr_Occurred()) PyErr_Clear();
-    } else {
-      PyErr_Clear();
-    }
-  }
-
-  // pop()
-  {
-    PyRef r = PyObject_CallMethod(arr, "pop", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // count(first_element) and index(first_element)
-  {
-    PyRef first = PySequence_GetItem(arr, 0);
-    if (first) {
-      PyRef c = PyObject_CallMethod(arr, "count", "O", (PyObject *)first);
-      if (PyErr_Occurred()) PyErr_Clear();
-      PyRef idx = PyObject_CallMethod(arr, "index", "O", (PyObject *)first);
-      if (PyErr_Occurred()) PyErr_Clear();
-    } else {
-      PyErr_Clear();
-    }
-  }
-
-  // insert(0, 42) + remove(42)
-  {
-    PyRef val = PyLong_FromLong(42);
-    CHECK(val);
-    PyRef r = PyObject_CallMethod(arr, "insert", "iO", 0, (PyObject *)val);
-    if (PyErr_Occurred()) PyErr_Clear();
-    PyRef r2 = PyObject_CallMethod(arr, "remove", "O", (PyObject *)val);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // buffer_info, __sizeof__
-  {
-    PyRef bi = PyObject_CallMethod(arr, "buffer_info", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-    PyRef sz = PyObject_CallMethod(arr, "__sizeof__", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // __contains__ and len (no iteration is performed here).
-  {
-    PyRef first = PySequence_GetItem(arr, 0);
-    if (first) {
-      int r = PySequence_Contains(arr, first);
-      (void)r;
-      if (PyErr_Occurred()) PyErr_Clear();
-    } else {
-      PyErr_Clear();
-    }
-    Py_ssize_t len = PyObject_Length(arr);
-    (void)len;
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-}
-
-// OP_ARRAY_SLICE: FDP selects typecode, creates two arrays, does slice read,
-// slice assignment, concatenation, repetition, comparison. Exercises the
-// array C module's sequence protocol paths.
-// fdp: fuzz input; one selector picks the typecode, then up to 10000 bytes
-//      become the contents of both arrays (a1 and a2 start out identical).
-static void op_array_slice(FuzzedDataProvider &fdp) {
-  // Eleven typecodes, indexed 0..10 by the selector below.
-  static const char kTypecodes[] = "bBHiIlLfdqQ";
-  char tc = kTypecodes[fdp.ConsumeIntegralInRange(0, 10)];
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-
-  PyRef a1(make_array(tc, data));
-  CHECK(a1);
-  PyRef a2(make_array(tc, data));
-  CHECK(a2);
-
-  // Slice read a1[0:N].
-  {
-    Py_ssize_t len = PyObject_Length(a1);
-    Py_ssize_t n = len < 4 ? len : 4;
-    PyRef sl = PySequence_GetSlice(a1, 0, n);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // Slice assignment a1[::2] = array of zeros.
-  {
-    Py_ssize_t len = PyObject_Length(a1);
-    if (len > 0) {
-      // Count elements in a1[::2].
-      Py_ssize_t slice_len = (len + 1) / 2;
-      // Build array of zeros with same typecode.
-      char tc_str[2] = {tc, '\0'};
-      PyRef zeros_arr = PyObject_CallFunction(array_array, "s", tc_str);
-      if (zeros_arr) {
-        // The byte count relies on typecode_itemsize() matching the array's
-        // real item size; otherwise the replacement has the wrong element
-        // count and the assignment below fails.
-        std::string zero_data(slice_len * typecode_itemsize(tc), '\0');
-        PyRef pydata = PyBytes_FromStringAndSize(zero_data.data(),
-                                                 zero_data.size());
-        if (pydata) {
-          PyRef fb = PyObject_CallMethod(zeros_arr, "frombytes", "O",
-                                         (PyObject *)pydata);
-          if (fb) {
-            // slice(None, None, 2), i.e. [::2].
-            PyRef step = PyLong_FromLong(2);
-            PyRef sl = PySlice_New(NULL, NULL, step);
-            if (sl) {
-              int r = PyObject_SetItem(a1, sl, zeros_arr);
-              (void)r;
-            }
-          }
-        }
-      }
-      // One clear covers every failure path of the nested block above.
-      if (PyErr_Occurred()) PyErr_Clear();
-    }
-  }
-
-  // Concatenation a1 + a2.
-  {
-    PyRef r = PySequence_Concat(a1, a2);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // Repetition a1 * min(len, 3).
-  {
-    Py_ssize_t len = PyObject_Length(a1);
-    int rep = len < 3 ? (int)len : 3;
-    PyRef r = PySequence_Repeat(a1, rep);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // Comparison a1 == a2.
-  {
-    PyRef r = PyObject_RichCompare(a1, a2, Py_EQ);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-}
-
-// OP_CTYPES: FDP selects sub-op for different ctypes from_buffer_copy calls.
-// Exercises the _ctypes C module's buffer copy and array creation paths.
-static void op_ctypes(FuzzedDataProvider &fdp) {
-  // One selector picks the ctypes entry point; the rest of the input (capped
-  // at 10000 bytes) supplies the buffer handed to it. Any Python error raised
-  // by the selected call is cleared once, after the switch.
-  int variant = fdp.ConsumeIntegralInRange(0, 5);
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-
-  switch (variant) {
-    case 0: {
-      // c_char.from_buffer_copy(1 byte)
-      std::string buf = data.substr(0, 1);
-      if (buf.empty()) buf.push_back('\0');
-      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
-      CHECK(pydata);
-      PyRef r = PyObject_CallMethod(ct_c_char, "from_buffer_copy", "O",
-                                    (PyObject *)pydata);
-      break;
-    }
-    case 1: {
-      // c_int.from_buffer_copy(4 bytes)
-      std::string buf = data.substr(0, 4);
-      buf.resize(4, '\0');
-      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
-      CHECK(pydata);
-      PyRef r = PyObject_CallMethod(ct_c_int, "from_buffer_copy", "O",
-                                    (PyObject *)pydata);
-      break;
-    }
-    case 2: {
-      // c_double.from_buffer_copy(8 bytes)
-      std::string buf = data.substr(0, 8);
-      buf.resize(8, '\0');
-      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
-      CHECK(pydata);
-      PyRef r = PyObject_CallMethod(ct_c_double, "from_buffer_copy", "O",
-                                    (PyObject *)pydata);
-      break;
-    }
-    case 3: {
-      // create_string_buffer(data[:256])
-      std::string buf = data.substr(0, 256);
-      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
-      CHECK(pydata);
-      PyRef r = PyObject_CallFunction(ct_create_string_buffer, "O",
-                                      (PyObject *)pydata);
-      break;
-    }
-    case 4: {
-      // (c_char * N).from_buffer_copy(data)
-      if (data.empty()) break;
-      PyRef n = PyLong_FromLong(data.size());
-      CHECK(n);
-      PyRef arr_type = PyNumber_Multiply(ct_c_char, n);
-      CHECK(arr_type);
-      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
-      CHECK(pydata);
-      PyRef r = PyObject_CallMethod(arr_type, "from_buffer_copy", "O",
-                                    (PyObject *)pydata);
-      break;
-    }
-    case 5: {
-      // Structure.from_buffer_copy(padded data)
-      PyRef sz = PyObject_CallFunction(ct_sizeof, "O", ct_Structure_cls);
-      CHECK(sz);
-      long struct_sz = PyLong_AsLong(sz);
-      // PyLong_AsLong() signals failure with -1; a negative value would be
-      // converted to a huge size_t by resize() below. The pending error, if
-      // any, is cleared after the switch.
-      if (struct_sz < 0) break;
-      std::string buf = data.substr(0, struct_sz);
-      buf.resize(struct_sz, '\0');
-      PyRef pydata = PyBytes_FromStringAndSize(buf.data(), buf.size());
-      CHECK(pydata);
-      PyRef r = PyObject_CallMethod(ct_Structure_cls, "from_buffer_copy", "O",
-                                    (PyObject *)pydata);
-      break;
-    }
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_MMAP: Create anonymous mmap, write data, then FDP selects actions.
-// Exercises the mmap C module's core operations.
-static void op_mmap(FuzzedDataProvider &fdp) {
-  int action = fdp.ConsumeIntegralInRange(0, 5);
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-  // The map is sized from the payload, so pad an empty payload to one byte.
-  if (data.empty()) data.push_back('\0');
-
-  // mmap(-1, size)
-  Py_ssize_t map_size = data.size();
-  PyRef mm = PyObject_CallFunction(mmap_mmap, "in", -1, map_size);
-  CHECK(mm);
-
-  // Write data.
-  PyRef pydata = PyBytes_FromStringAndSize(Y(data));
-  CHECK(pydata);
-  {
-    PyRef r = PyObject_CallMethod(mm, "write", "O", (PyObject *)pydata);
-    if (!r) { PyErr_Clear(); goto cleanup; }
-  }
-
-  // Seek to 0.
-  {
-    PyRef r = PyObject_CallMethod(mm, "seek", "i", 0);
-    if (!r) { PyErr_Clear(); goto cleanup; }
-  }
-
-  switch (action) {
-    case 0: {
-      // find + rfind, using the first (up to) 4 payload bytes as the pattern.
-      size_t pat_len = data.size() < 4 ? data.size() : 4;
-      PyRef pat = PyBytes_FromStringAndSize(data.data(), pat_len);
-      CHECK(pat);
-      {
-        PyRef r = PyObject_CallMethod(mm, "find", "O", (PyObject *)pat);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef r = PyObject_CallMethod(mm, "rfind", "O", (PyObject *)pat);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 1: {
-      // read + readline
-      {
-        long n = map_size < 4 ? map_size : 4;
-        PyRef r = PyObject_CallMethod(mm, "read", "l", n);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef sk = PyObject_CallMethod(mm, "seek", "i", 0);
-        if (PyErr_Occurred()) PyErr_Clear();
-        PyRef r = PyObject_CallMethod(mm, "readline", NULL);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 2: {
-      // resize + move
-      long new_size = map_size * 2;
-      if (new_size < 1) new_size = 1;
-      {
-        PyRef r = PyObject_CallMethod(mm, "resize", "l", new_size);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        // move(dest=0, src, count): count is min(map_size, new_size) / 2,
-        // or 0 when the map is shorter than 2 bytes.
-        long src = map_size < 2 ? 0 : 1;
-        long count = map_size < 2 ? 0 : (map_size / 2 < new_size / 2 ?
-                                         map_size / 2 : new_size / 2);
-        PyRef r = PyObject_CallMethod(mm, "move", "lll",
-                                      (long)0, src, count);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 3: {
-      // getitem + setitem
-      {
-        PyRef idx = PyLong_FromLong(0);
-        CHECK(idx);
-        PyRef r = PyObject_GetItem(mm, idx);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        Py_ssize_t n = map_size < 4 ? map_size : 4;
-        // PySlice_New() takes its own references to the bounds, so the
-        // bounds are owned here and released when this scope ends.
-        PyRef lo = PyLong_FromLong(0);
-        CHECK(lo);
-        PyRef hi = PyLong_FromLong(n);
-        CHECK(hi);
-        PyRef sl = PySlice_New(lo, hi, NULL);
-        CHECK(sl);
-        PyRef r = PyObject_GetItem(mm, sl);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      if (data.size() > 0) {
-        PyRef idx = PyLong_FromLong(0);
-        CHECK(idx);
-        PyRef val = PyLong_FromLong((unsigned char)data[0]);
-        CHECK(val);
-        PyObject_SetItem(mm, idx, val);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 4: {
-      // flush + size + tell
-      {
-        PyRef r = PyObject_CallMethod(mm, "flush", NULL);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef r = PyObject_CallMethod(mm, "size", NULL);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef r = PyObject_CallMethod(mm, "tell", NULL);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 5: {
-      // read all
-      {
-        PyRef r = PyObject_CallMethod(mm, "read", NULL);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-  }
-
-cleanup:
-  {
-    PyRef r = PyObject_CallMethod(mm, "close", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-}
-
-// OP_LOCALE: FDP selects strxfrm or strcoll. Exercises the _locale C module.
-static void op_locale(FuzzedDataProvider &fdp) {
-  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
-  bool use_strcoll = fdp.ConsumeBool();
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
-  CHECK(pystr);
-
-  if (use_strcoll) {
-    // strcoll(first half, second half) of the same string.
-    Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
-    Py_ssize_t mid = slen / 2;
-    PyRef half1 = PyUnicode_Substring(pystr, 0, mid);
-    CHECK(half1);
-    PyRef half2 = PyUnicode_Substring(pystr, mid, slen);
-    CHECK(half2);
-    PyRef r = PyObject_CallFunction(locale_strcoll, "OO",
-                                    (PyObject *)half1, (PyObject *)half2);
-  } else {
-    PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr);
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_DBM: Open a dbm database file under /tmp with flag "n", write up to 16
-// two-byte key/value pairs, read them back via keys(), test membership, close.
-// Exercises the _dbm C extension module's storage operations.
-// NOTE(review): the database file is not removed by this function.
-static void op_dbm(FuzzedDataProvider &fdp) {
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-
-  // Use a unique filename based on the gc counter to avoid conflicts.
-  char dbpath[64];
-  snprintf(dbpath, sizeof(dbpath), "/tmp/_fuzz_dbm_%lu", gc_counter);
-
-  PyRef db = PyObject_CallFunction(dbm_open, "ss", dbpath, "n");
-  CHECK(db);
-
-  // Write key-value pairs from fuzz data: each 4-byte group in the first 64
-  // bytes yields a 2-byte key followed by a 2-byte value.
-  size_t limit = data.size() < 64 ? data.size() : 64;
-  for (size_t i = 0; i + 3 < limit; i += 4) {
-    PyRef key = PyBytes_FromStringAndSize(data.data() + i, 2);
-    if (!key) { PyErr_Clear(); continue; }
-    PyRef val = PyBytes_FromStringAndSize(data.data() + i + 2, 2);
-    if (!val) { PyErr_Clear(); continue; }
-    int r = PyObject_SetItem(db, key, val);
-    (void)r;
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // Read keys.
-  {
-    PyRef keys = PyObject_CallMethod(db, "keys", NULL);
-    if (keys) {
-      PyRef it = PyObject_GetIter(keys);
-      if (it) {
-        PyObject *k;
-        // PyIter_Next() returns a new reference; release it per iteration.
-        while ((k = PyIter_Next(it)) != NULL) {
-          PyRef val = PyObject_GetItem(db, k);
-          Py_DECREF(k);
-          if (PyErr_Occurred()) PyErr_Clear();
-        }
-      }
-    }
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-
-  // Check membership.
-  {
-    PyRef test_key = PyBytes_FromStringAndSize("k", 1);
-    if (test_key) {
-      int r = PySequence_Contains(db, test_key);
-      (void)r;
-      if (PyErr_Occurred()) PyErr_Clear();
-    }
-  }
-
-  // Close.
-  {
-    PyRef r = PyObject_CallMethod(db, "close", NULL);
-    if (PyErr_Occurred()) PyErr_Clear();
-  }
-}
-
-// Helper: Create a memory connection with PRAGMA max_page_count=100.
-// Returns a new reference, or NULL (with the error cleared if the PRAGMA
-// failed; an error from connect() itself is left pending for the caller).
-static PyObject *make_sqlite_conn() {
-  PyObject *conn = PyObject_CallFunction(sqlite3_connect, "s", ":memory:");
-  if (!conn) return NULL;
-  PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                "PRAGMA max_page_count=100");
-  if (!r) {
-    PyErr_Clear();
-    Py_DECREF(conn);
-    return NULL;
-  }
-  return conn;
-}
-
-// OP_SQLITE3_BASIC: connect(':memory:'), then FDP selects: execute fuzz SQL,
-// parameterized queries, executemany, executescript, complete_statement.
-// Exercises the _sqlite3 C module's basic execution paths.
-static void op_sqlite3_basic(FuzzedDataProvider &fdp) {
-  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
-  int variant = fdp.ConsumeIntegralInRange(0, 4);
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
-  CHECK(pystr);
-
-  PyRef conn(make_sqlite_conn());
-  CHECK(conn);
-
-  switch (variant) {
-    case 0: {
-      // Execute fuzz SQL.
-      PyRef r = PyObject_CallMethod(conn, "execute", "O", (PyObject *)pystr);
-      break;
-    }
-    case 1: {
-      // Parameterized INSERT/SELECT/UPDATE/DELETE.
-      PyRef pydata = PyBytes_FromStringAndSize(Y(data));
-      CHECK(pydata);
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a TEXT, b BLOB)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef params = PyTuple_Pack(2, (PyObject *)pystr, (PyObject *)pydata);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?, ?)",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        // SELECT.
-        PyRef sub = PyUnicode_Substring(pystr, 0, 32);
-        if (!sub) { PyErr_Clear(); break; }
-        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "SELECT * FROM t WHERE a LIKE ?",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-    case 2: {
-      // executemany.
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(v INTEGER)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        // One single-column row per payload byte, at most 64 rows.
-        PyRef rows = PyList_New(0);
-        CHECK(rows);
-        size_t limit = data.size() < 64 ? data.size() : 64;
-        for (size_t i = 0; i < limit; i++) {
-          PyRef val = PyLong_FromLong((unsigned char)data[i]);
-          PyRef tup = PyTuple_Pack(1, (PyObject *)val);
-          if (tup) PyList_Append(rows, tup);
-        }
-        PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)rows);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-            "SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t");
-        if (cur) {
-          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 3: {
-      // executescript; an empty fuzz string is replaced by "SELECT 1;".
-      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
-      PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
-      if (!sql) {
-        PyRef def = PyUnicode_FromString("SELECT 1;");
-        PyRef r = PyObject_CallMethod(conn, "executescript", "O",
-                                      (PyObject *)def);
-      } else {
-        PyRef r = PyObject_CallMethod(conn, "executescript", "O", sql);
-      }
-      break;
-    }
-    case 4: {
-      // complete_statement; an empty fuzz string is replaced by "SELECT 1;".
-      Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
-      PyObject *sql = slen > 0 ? (PyObject *)pystr : NULL;
-      if (!sql) {
-        PyRef def = PyUnicode_FromString("SELECT 1;");
-        PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O",
-                                        (PyObject *)def);
-      } else {
-        PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O", sql);
-      }
-      break;
-    }
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-
-  PyRef cl = PyObject_CallMethod(conn, "close", NULL);
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// OP_SQLITE3_ADVANCED: connect(':memory:'), then FDP selects: create_function,
-// create_aggregate, set_authorizer, create_collation, Row factory, blobopen,
-// register_adapter. Exercises the _sqlite3 C module's advanced features.
-static void op_sqlite3_advanced(FuzzedDataProvider &fdp) {
-  int str_enc = fdp.ConsumeIntegralInRange(0, 3);
-  int variant = fdp.ConsumeIntegralInRange(0, 6);
-  std::string data = fdp.ConsumeBytesAsString(
-      std::min(fdp.remaining_bytes(), (size_t)10000));
-  PyRef pystr(fuzz_bytes_to_str(data, str_enc));
-  CHECK(pystr);
-
-  PyRef conn(make_sqlite_conn());
-  CHECK(conn);
-
-  switch (variant) {
-    case 0: {
-      // create_function + SELECT.
-      PyRef globals = PyDict_New();
-      CHECK(globals);
-      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
-      PyRef fn = PyRun_String("lambda x: x", Py_eval_input, globals, globals);
-      CHECK(fn);
-      {
-        PyRef r = PyObject_CallMethod(conn, "create_function", "siO",
-                                      "fuzzfn", 1, (PyObject *)fn);
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a TEXT)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef sub = PyUnicode_Substring(pystr, 0, 32);
-        if (!sub) { PyErr_Clear(); break; }
-        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT fuzzfn(a) FROM t");
-        if (cur) {
-          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 1: {
-      // create_aggregate + SELECT.
-      {
-        PyRef r = PyObject_CallMethod(conn, "create_aggregate", "siO",
-                                      "fuzzagg", 1,
-                                      sqlite3_Aggregate_cls);
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(v INTEGER)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        // One single-column row per payload byte, at most 32 rows.
-        PyRef rows = PyList_New(0);
-        CHECK(rows);
-        size_t limit = data.size() < 32 ? data.size() : 32;
-        for (size_t i = 0; i < limit; i++) {
-          PyRef val = PyLong_FromLong((unsigned char)data[i]);
-          PyRef tup = PyTuple_Pack(1, (PyObject *)val);
-          if (tup) PyList_Append(rows, tup);
-        }
-        PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)rows);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT fuzzagg(v) FROM t");
-        if (cur) {
-          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 2: {
-      // set_authorizer + SELECT. The authorizer is a lambda that returns the
-      // cached SQLITE_OK value for every call.
-      PyRef globals = PyDict_New();
-      CHECK(globals);
-      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
-      PyRef code_str = PyUnicode_FromFormat(
-          "lambda *a: %ld", sqlite3_SQLITE_OK_val);
-      CHECK(code_str);
-      PyRef auth_fn = PyRun_String(PyUnicode_AsUTF8(code_str),
-                                   Py_eval_input, globals, globals);
-      CHECK(auth_fn);
-      {
-        PyRef r = PyObject_CallMethod(conn, "set_authorizer", "O",
-                                      (PyObject *)auth_fn);
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a TEXT)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef sub = PyUnicode_Substring(pystr, 0, 16);
-        if (!sub) { PyErr_Clear(); break; }
-        PyRef params = PyTuple_Pack(1, (PyObject *)sub);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT * FROM t");
-        if (cur) {
-          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 3: {
-      // create_collation + ORDER BY.
-      PyRef globals = PyDict_New();
-      CHECK(globals);
-      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
-      PyRef coll_fn = PyRun_String(
-          "lambda a, b: (a > b) - (a < b)",
-          Py_eval_input, globals, globals);
-      CHECK(coll_fn);
-      {
-        PyRef r = PyObject_CallMethod(conn, "create_collation", "sO",
-                                      "fuzz", (PyObject *)coll_fn);
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a TEXT)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef params = PyTuple_Pack(1, (PyObject *)pystr);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT * FROM t ORDER BY a COLLATE fuzz");
-        if (cur) {
-          PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 4: {
-      // Row factory + SELECT.
-      // Do not continue with an exception pending if the setattr failed.
-      if (PyObject_SetAttrString(conn, "row_factory", sqlite3_Row) < 0) {
-        PyErr_Clear();
-        break;
-      }
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a TEXT, b INTEGER)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef sub = PyUnicode_Substring(pystr, 0, 8);
-        if (!sub) { PyErr_Clear(); break; }
-        // PyTuple_Pack() does not steal references, so the int must be owned
-        // here; passing a temporary would leak it.
-        PyRef forty_two = PyLong_FromLong(42);
-        CHECK(forty_two);
-        PyRef params = PyTuple_Pack(2, (PyObject *)sub,
-                                    (PyObject *)forty_two);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?, ?)",
-                                      (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT * FROM t");
-        if (cur) {
-          PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
-          if (row && row.p != Py_None) {
-            // PyObject_GetItem() does not steal the key either; own each key
-            // so it is released when its scope ends.
-            {
-              PyRef key_a = PyUnicode_FromString("a");
-              if (key_a) {
-                PyRef a = PyObject_GetItem(row, key_a);
-              }
-              if (PyErr_Occurred()) PyErr_Clear();
-            }
-            {
-              PyRef key_b = PyUnicode_FromString("b");
-              if (key_b) {
-                PyRef b = PyObject_GetItem(row, key_b);
-              }
-              if (PyErr_Occurred()) PyErr_Clear();
-            }
-            PyRef keys = PyObject_CallMethod(row, "keys", NULL);
-            if (PyErr_Occurred()) PyErr_Clear();
-          }
-          if (PyErr_Occurred()) PyErr_Clear();
-        } else {
-          PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 5: {
-      // blobopen + read/write.
-      {
-        PyRef r = PyObject_CallMethod(conn, "execute", "s",
-                                      "CREATE TABLE t(a BLOB)");
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        std::string blob_data = data.substr(0, 64);
-        PyRef pydata = PyBytes_FromStringAndSize(blob_data.data(),
-                                                 blob_data.size());
-        CHECK(pydata);
-        PyRef params = PyTuple_Pack(1, (PyObject *)pydata);
-        CHECK(params);
-        PyRef r = PyObject_CallMethod(conn, "execute", "sO",
-                                      "INSERT INTO t VALUES(?)",
-                                      (PyObject *)params);
-        if (!r) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef cur = PyObject_CallMethod(conn, "execute", "s",
-                                        "SELECT rowid FROM t");
-        if (!cur) { PyErr_Clear(); break; }
-        PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
-        if (!row || row.p == Py_None) { PyErr_Clear(); break; }
-        PyRef rid = PySequence_GetItem(row, 0);
-        CHECK(rid);
-        PyRef blob = PyObject_CallMethod(conn, "blobopen", "sssO",
-                                         "main", "t", "a", (PyObject *)rid);
-        if (!blob) { PyErr_Clear(); break; }
-        {
-          PyRef rd = PyObject_CallMethod(blob, "read", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        }
-        {
-          PyRef sk = PyObject_CallMethod(blob, "seek", "i", 0);
-          if (PyErr_Occurred()) PyErr_Clear();
-        }
-        {
-          size_t wr_len = data.size() < 64 ? data.size() : 64;
-          PyRef wr_data = PyBytes_FromStringAndSize(data.data(), wr_len);
-          if (wr_data) {
-            PyRef wr = PyObject_CallMethod(blob, "write", "O",
-                                           (PyObject *)wr_data);
-            if (PyErr_Occurred()) PyErr_Clear();
-          }
-        }
-        {
-          PyRef cl = PyObject_CallMethod(blob, "close", NULL);
-          if (PyErr_Occurred()) PyErr_Clear();
-        }
-      }
-      break;
-    }
-    case 6: {
-      // register_adapter.
-      PyRef globals = PyDict_New();
-      CHECK(globals);
-      PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
-      PyRef r = PyRun_String(
-          "class _AdaptMe:\n"
-          " def __init__(self, v): self.v = v\n",
-          Py_file_input, globals, globals);
-      CHECK(r);
-      // PyDict_GetItemString() returns a borrowed reference (or NULL without
-      // setting an error). Check for NULL before taking ownership.
-      PyObject *cls_borrowed = PyDict_GetItemString(globals, "_AdaptMe");
-      if (!cls_borrowed) break;
-      Py_INCREF(cls_borrowed);
-      PyRef adapt_cls(cls_borrowed);
-
-      PyRef adapter_fn = PyRun_String(
-          "lambda a: str(a.v)", Py_eval_input, globals, globals);
-      CHECK(adapter_fn);
-
-      {
-        PyRef reg = PyObject_CallFunction(sqlite3_register_adapter, "OO",
-                                          (PyObject *)adapt_cls,
-                                          (PyObject *)adapter_fn);
-        if (!reg) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef r2 = PyObject_CallMethod(conn, "execute", "s",
-                                       "CREATE TABLE t(a TEXT)");
-        if (!r2) { PyErr_Clear(); break; }
-      }
-      {
-        PyRef sub = PyUnicode_Substring(pystr, 0, 8);
-        if (!sub) { PyErr_Clear(); break; }
-        PyRef obj = PyObject_CallFunction(adapt_cls, "O", (PyObject *)sub);
-        if (!obj) { PyErr_Clear(); break; }
-        PyRef params = PyTuple_Pack(1, (PyObject *)obj);
-        CHECK(params);
-        PyRef r3 = PyObject_CallMethod(conn, "execute", "sO",
-                                       "INSERT INTO t VALUES(?)",
-                                       (PyObject *)params);
-        if (PyErr_Occurred()) PyErr_Clear();
-      }
-      break;
-    }
-  }
-  if (PyErr_Occurred()) PyErr_Clear();
-
-  PyRef cl = PyObject_CallMethod(conn, "close", NULL);
-  if (PyErr_Occurred()) PyErr_Clear();
-}
-
-// ---------------------------------------------------------------------------
-// Dispatch.
-// --------------------------------------------------------------------------- - -enum Op { - OP_ARRAY_FROMBYTES, - OP_ARRAY_METHODS, - OP_ARRAY_SLICE, - OP_CTYPES, - OP_MMAP, - OP_LOCALE, - OP_DBM, - OP_SQLITE3_BASIC, - OP_SQLITE3_ADVANCED, - NUM_OPS -}; - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - assert(Py_IsInitialized()); - init_dataops(); - if (size < 1 || size > 0x10000) return 0; - if (PyErr_Occurred()) PyErr_Clear(); - - FuzzedDataProvider fdp(data, size); - switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { - case OP_ARRAY_FROMBYTES: - op_array_frombytes(fdp); - break; - case OP_ARRAY_METHODS: - op_array_methods(fdp); - break; - case OP_ARRAY_SLICE: - op_array_slice(fdp); - break; - case OP_CTYPES: - op_ctypes(fdp); - break; - case OP_MMAP: - op_mmap(fdp); - break; - case OP_LOCALE: - op_locale(fdp); - break; - case OP_DBM: - op_dbm(fdp); - break; - case OP_SQLITE3_BASIC: - op_sqlite3_basic(fdp); - break; - case OP_SQLITE3_ADVANCED: - op_sqlite3_advanced(fdp); - break; - } - - if (++gc_counter % kGcInterval == 0) PyGC_Collect(); - return 0; -} diff --git a/module-fuzzers/fuzz_decode.cpp b/module-fuzzers/fuzz_decode.cpp deleted file mode 100644 index 234a265..0000000 --- a/module-fuzzers/fuzz_decode.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -// fuzz_decode.cpp — Fuzzer for CPython's compression, encoding, serialization, -// and certificate-parsing C extension modules. 
-// -// This fuzzer exercises the following CPython C extension modules via -// their Python API, called through the Python C API from C++: -// -// zlib — compress/decompress (one-shot and streaming via -// compressobj/decompressobj with wbits, zdict, copy, -// flush), crc32, adler32 -// _bz2 — BZ2Decompressor.decompress(), bz2.compress() -// _lzma — LZMADecompressor.decompress() with FORMAT_AUTO/XZ/ALONE -// and 16 MB memlimit, lzma.compress() -// binascii — 6 decoders: a2b_base64 (with strict_mode), a2b_hex, -// a2b_uu, a2b_qp, a2b_ascii85, a2b_base85 -// 6 encoders: b2a_base64 (with newline), b2a_hex, -// b2a_uu (clamped to 45 bytes), b2a_qp, -// b2a_ascii85 (with foldspaces/wrapcol), b2a_base85 -// Checksums: crc32, crc_hqx -// Round-trip: hexlify -> unhexlify -// _pickle — pickle.dumps() with 8 container types (bytes, str, -// list, tuple, set, frozenset, bytearray, dict) across -// protocols 0-5 and fix_imports flag. -// pickle.loads() via RestrictedUnpickler (blocks -// find_class), PersistentUnpickler (handles PERSID/ -// BINPERSID), and RestrictedUnpickler with -// encoding='bytes'. -// Pickler chain: dump, clear_memo, dump, getvalue. -// Round-trip: dumps then loads. -// _ssl — ssl.DER_cert_to_PEM_cert(), then optionally -// SSLContext(PROTOCOL_TLS_CLIENT).load_verify_locations() -// _multibytecodec, -// _codecs_jp, _codecs_cn, _codecs_kr, -// _codecs_hk, _codecs_tw, _codecs_iso2022 -// — codecs.decode() with 17 codecs including shift_jis, -// euc-jp, gb2312, big5, gb18030, iso-2022-jp, etc. -// codecs.encode() with 19 codecs. -// Incremental decoders (shift_jis, gb18030, utf-16): -// split input at midpoint, decode halves, getstate, reset. -// Incremental encoders (shift_jis, utf-8): -// split string at midpoint, encode, reset, getstate. -// StreamReader: codecs.getreader('utf-8')(BytesIO).read() -// -// The first byte of fuzz input selects one of 20 operation types. 
Each
-// operation consumes further bytes via FuzzedDataProvider to parameterize
-// the call (algorithm/codec selection, compression level, protocol number,
-// container type, wbits value, boolean flags, data splits).
-//
-// All module functions, constructors, and format constants are imported once
-// during init and cached in static variables: PyObject* pointers for the
-// callables and plain long values for the integer constants. Two pickle
-// Unpickler subclasses (RestrictedUnpickler, PersistentUnpickler) are defined
-// via PyRun_String at init time and cached as class objects.
-//
-// PyRef (RAII) prevents reference leaks. PyGC_Collect() runs every 200
-// iterations. Max input size: 1 MB.
-
-#include "fuzz_helpers.h"
-
-// ---------------------------------------------------------------------------
-// Cached module objects, initialized once.
-// ---------------------------------------------------------------------------
-
-// zlib
-static PyObject *zlib_compress, *zlib_decompress;
-static PyObject *zlib_decompressobj, *zlib_compressobj;
-static PyObject *zlib_crc32, *zlib_adler32;
-
-// bz2
-static PyObject *bz2_compress, *bz2_BZ2Decompressor;
-
-// lzma (the *_val variables hold the integer value of each FORMAT_* constant)
-static PyObject *lzma_LZMADecompressor, *lzma_compress;
-static long lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val;
-
-// binascii
-static PyObject *ba_a2b_base64, *ba_a2b_hex, *ba_a2b_uu, *ba_a2b_qp;
-static PyObject *ba_a2b_ascii85, *ba_a2b_base85;
-static PyObject *ba_b2a_base64, *ba_b2a_hex, *ba_b2a_uu, *ba_b2a_qp;
-static PyObject *ba_b2a_ascii85, *ba_b2a_base85;
-static PyObject *ba_crc32, *ba_crc_hqx, *ba_hexlify, *ba_unhexlify;
-
-// pickle
-static PyObject *pickle_dumps, *pickle_loads;
-
-// codecs
-static PyObject *codecs_decode, *codecs_encode;
-static PyObject *codecs_getincrementaldecoder, *codecs_getincrementalencoder;
-static PyObject *codecs_getreader;
-
-// ssl (the *_val variable holds the integer value of PROTOCOL_TLS_CLIENT)
-static PyObject *ssl_DER_cert_to_PEM_cert, *ssl_SSLContext;
-static long ssl_PROTOCOL_TLS_CLIENT_val;
-
-// io
-static PyObject *bytesio_ctor;
-
-// pickle helper classes
-static
PyObject *RestrictedUnpickler_cls, *PersistentUnpickler_cls;
-
-// Number of fuzz iterations executed so far.
-static unsigned long gc_counter = 0;
-
-// Set to 1 once init_decode() has populated the cached objects above.
-static int initialized = 0;
-
-// One-time setup: import every module attribute the operations use, read the
-// integer constants into plain longs, silence Python warnings, and define the
-// two Unpickler subclasses. Later calls return immediately. Aborts the
-// process if the helper classes cannot be created.
-static void init_decode(void) {
-  if (initialized) return;
-
-  // zlib
-  zlib_compress = import_attr("zlib", "compress");
-  zlib_decompress = import_attr("zlib", "decompress");
-  zlib_decompressobj = import_attr("zlib", "decompressobj");
-  zlib_compressobj = import_attr("zlib", "compressobj");
-  zlib_crc32 = import_attr("zlib", "crc32");
-  zlib_adler32 = import_attr("zlib", "adler32");
-
-  // bz2
-  bz2_compress = import_attr("bz2", "compress");
-  bz2_BZ2Decompressor = import_attr("bz2", "BZ2Decompressor");
-
-  // lzma
-  lzma_LZMADecompressor = import_attr("lzma", "LZMADecompressor");
-  lzma_compress = import_attr("lzma", "compress");
-  {
-    // Only the integer value is kept; the Python object is released.
-    PyObject *v;
-    v = import_attr("lzma", "FORMAT_AUTO");
-    lzma_FORMAT_AUTO_val = PyLong_AsLong(v);
-    Py_DECREF(v);
-    v = import_attr("lzma", "FORMAT_XZ");
-    lzma_FORMAT_XZ_val = PyLong_AsLong(v);
-    Py_DECREF(v);
-    v = import_attr("lzma", "FORMAT_ALONE");
-    lzma_FORMAT_ALONE_val = PyLong_AsLong(v);
-    Py_DECREF(v);
-  }
-
-  // binascii
-  ba_a2b_base64 = import_attr("binascii", "a2b_base64");
-  ba_a2b_hex = import_attr("binascii", "a2b_hex");
-  ba_a2b_uu = import_attr("binascii", "a2b_uu");
-  ba_a2b_qp = import_attr("binascii", "a2b_qp");
-  ba_a2b_ascii85 = import_attr("binascii", "a2b_ascii85");
-  ba_a2b_base85 = import_attr("binascii", "a2b_base85");
-  ba_b2a_base64 = import_attr("binascii", "b2a_base64");
-  ba_b2a_hex = import_attr("binascii", "b2a_hex");
-  ba_b2a_uu = import_attr("binascii", "b2a_uu");
-  ba_b2a_qp = import_attr("binascii", "b2a_qp");
-  ba_b2a_ascii85 = import_attr("binascii", "b2a_ascii85");
-  ba_b2a_base85 = import_attr("binascii", "b2a_base85");
-  ba_crc32 = import_attr("binascii", "crc32");
-  ba_crc_hqx = import_attr("binascii", "crc_hqx");
-  ba_hexlify = import_attr("binascii", "hexlify");
-  ba_unhexlify = import_attr("binascii", "unhexlify");
-
-  // pickle
-  pickle_dumps = import_attr("pickle", "dumps");
-  pickle_loads = import_attr("pickle", "loads");
-
-  // codecs
-  codecs_decode = import_attr("codecs", "decode");
-  codecs_encode = import_attr("codecs", "encode");
-  codecs_getincrementaldecoder = import_attr("codecs",
-                                             "getincrementaldecoder");
-  codecs_getincrementalencoder = import_attr("codecs",
-                                             "getincrementalencoder");
-  codecs_getreader = import_attr("codecs", "getreader");
-
-  // ssl
-  ssl_DER_cert_to_PEM_cert = import_attr("ssl", "DER_cert_to_PEM_cert");
-  ssl_SSLContext = import_attr("ssl", "SSLContext");
-  {
-    PyObject *v = import_attr("ssl", "PROTOCOL_TLS_CLIENT");
-    ssl_PROTOCOL_TLS_CLIENT_val = PyLong_AsLong(v);
-    Py_DECREF(v);
-  }
-
-  // io
-  bytesio_ctor = import_attr("io", "BytesIO");
-
-  // Suppress warnings.
-  PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')");
-
-  // Pickle helper classes via PyRun_String.
-  {
-    PyObject *globals = PyDict_New();
-    if (!globals) {
-      PyErr_Print();
-      abort();
-    }
-    PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins());
-    // The method bodies must be indented deeper than their `def` lines,
-    // otherwise the source does not parse and the abort() below is hit.
-    PyObject *r = PyRun_String(
-        "import pickle, io\n"
-        "class RestrictedUnpickler(pickle.Unpickler):\n"
-        "    def find_class(self, module, name):\n"
-        "        raise pickle.UnpicklingError('restricted')\n"
-        "class PersistentUnpickler(pickle.Unpickler):\n"
-        "    def persistent_load(self, pid): return pid\n"
-        "    def find_class(self, module, name):\n"
-        "        raise pickle.UnpicklingError('restricted')\n",
-        Py_file_input, globals, globals);
-    if (!r) {
-      PyErr_Print();
-      abort();
-    }
-    Py_DECREF(r);
-    // PyDict_GetItemString() returns borrowed references (or NULL); take a
-    // strong reference to each class before the namespace dict is released.
-    RestrictedUnpickler_cls =
-        PyDict_GetItemString(globals, "RestrictedUnpickler");
-    PersistentUnpickler_cls =
-        PyDict_GetItemString(globals, "PersistentUnpickler");
-    if (!RestrictedUnpickler_cls || !PersistentUnpickler_cls) abort();
-    Py_INCREF(RestrictedUnpickler_cls);
-    Py_INCREF(PersistentUnpickler_cls);
-    Py_DECREF(globals);
-  }
-
-  assert(!PyErr_Occurred());
-  initialized = 1;
-}
-
-// ---------------------------------------------------------------------------
-// Operations — Compression (6 ops)
-//
--------------------------------------------------------------------------- - -// OP_ZLIB_DECOMPRESS: Create a zlib.decompressobj with fuzz-chosen wbits -// from {-15 (raw), 0 (auto), 15 (zlib), 31 (gzip), 47 (auto-detect)} and -// an optional zdict (first 32 bytes of data). Call .decompress(data, 1MB), -// optionally .flush(), and optionally .copy() + decompress on the copy. -// Exercises Decomp_Type, zlib_Decompress_decompress, copy, flush paths. -static void op_zlib_decompress(FuzzedDataProvider &fdp) { - static const int kWbitsChoices[] = {-15, 0, 15, 31, 47}; - int wbits = kWbitsChoices[fdp.ConsumeIntegralInRange(0, 4)]; - bool use_zdict = fdp.ConsumeBool(); - std::string data = fdp.ConsumeRemainingBytesAsString(); - - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef wbits_obj = PyLong_FromLong(wbits); - CHECK(wbits_obj); - PyRef args_dobj = PyTuple_Pack(1, (PyObject *)wbits_obj); - CHECK(args_dobj); - - if (use_zdict && data.size() > 32) { - PyRef zdict = PyBytes_FromStringAndSize(data.data(), 32); - CHECK(zdict); - PyDict_SetItemString(kwargs, "zdict", zdict); - data = data.substr(32); - } - - PyRef dobj = PyObject_Call(zlib_decompressobj, args_dobj, kwargs); - CHECK(dobj); - - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", - (PyObject *)pydata, 1048576); - if (!r) { - PyErr_Clear(); - return; - } - - if (fdp.remaining_bytes() > 0 || data.size() % 2 == 0) { - PyRef flush_r = PyObject_CallMethod(dobj, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - - if (data.size() % 3 == 0) { - PyRef copy_obj = PyObject_CallMethod(dobj, "copy", NULL); - if (copy_obj) { - PyRef r2 = PyObject_CallMethod(copy_obj, "decompress", "Oi", - (PyObject *)pydata, 1048576); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } -} - -// OP_ZLIB_COMPRESS: Either one-shot zlib.compress(data, level) or streaming -// via compressobj(level).compress(data).flush(), with 
optional .copy().flush(). -// Level is fuzz-chosen 0-9. Exercises Compress_Type and zlib_compress_impl. -static void op_zlib_compress(FuzzedDataProvider &fdp) { - int level = fdp.ConsumeIntegralInRange(0, 9); - bool use_obj = fdp.ConsumeBool(); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - if (use_obj) { - PyRef cobj = PyObject_CallFunction(zlib_compressobj, "i", level); - CHECK(cobj); - PyRef r1 = PyObject_CallMethod(cobj, "compress", "O", - (PyObject *)pydata); - CHECK(r1); - if (data.size() % 2 == 0) { - PyRef copy_obj = PyObject_CallMethod(cobj, "copy", NULL); - if (copy_obj) { - PyRef r2 = PyObject_CallMethod(copy_obj, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - PyRef r3 = PyObject_CallMethod(cobj, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyRef r = PyObject_CallFunction(zlib_compress, "Oi", - (PyObject *)pydata, level); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// OP_ZLIB_CHECKSUM: Call either zlib.crc32(data) or zlib.adler32(data), -// fuzz-chosen. Exercises the checksum C implementations in zlibmodule.c. -static void op_zlib_checksum(FuzzedDataProvider &fdp) { - bool use_crc = fdp.ConsumeBool(); - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallFunction( - use_crc ? zlib_crc32 : zlib_adler32, "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_BZ2: Either bz2.compress(data) or BZ2Decompressor().decompress(data, 1MB), -// fuzz-chosen. Exercises the _bz2 C extension (BZ2Compressor/BZ2Decompressor). 
-static void op_bz2(FuzzedDataProvider &fdp) { - bool do_compress = fdp.ConsumeBool(); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - if (do_compress) { - PyRef r = PyObject_CallFunction(bz2_compress, "O", - (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyRef dobj = PyObject_CallFunction(bz2_BZ2Decompressor, NULL); - CHECK(dobj); - PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", - (PyObject *)pydata, 1048576); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// OP_LZMA_DECOMPRESS: Create LZMADecompressor with fuzz-chosen format from -// {FORMAT_AUTO, FORMAT_XZ, FORMAT_ALONE} and 16 MB memlimit, then call -// .decompress(data, 1MB). Exercises the _lzma C extension decompressor. -static void op_lzma_decompress(FuzzedDataProvider &fdp) { - long fmt_vals[] = { - lzma_FORMAT_AUTO_val, lzma_FORMAT_XZ_val, lzma_FORMAT_ALONE_val, - }; - long fmt = fmt_vals[fdp.ConsumeIntegralInRange(0, 2)]; - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef fmt_obj = PyLong_FromLong(fmt); - CHECK(fmt_obj); - PyDict_SetItemString(kwargs, "format", fmt_obj); - PyRef memlimit = PyLong_FromLong(16 * 1024 * 1024); - CHECK(memlimit); - PyDict_SetItemString(kwargs, "memlimit", memlimit); - - PyRef empty_args = PyTuple_New(0); - CHECK(empty_args); - PyRef dobj = PyObject_Call(lzma_LZMADecompressor, empty_args, kwargs); - CHECK(dobj); - - PyRef r = PyObject_CallMethod(dobj, "decompress", "Oi", - (PyObject *)pydata, 1048576); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_LZMA_COMPRESS: One-shot lzma.compress(data). Exercises the _lzma -// C extension compressor with default settings. 
-static void op_lzma_compress(FuzzedDataProvider &fdp) { - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallFunction(lzma_compress, "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Operations — Binascii (4 ops) -// --------------------------------------------------------------------------- - -// OP_BINASCII_DECODE: Call one of 6 binary-to-binary decoders from the -// binascii C module: a2b_base64 (with optional strict_mode=True), a2b_hex, -// a2b_uu, a2b_qp, a2b_ascii85, a2b_base85. Fuzz selects which decoder. -static void op_binascii_decode(FuzzedDataProvider &fdp) { - int which = fdp.ConsumeIntegralInRange(0, 5); - bool strict = fdp.ConsumeBool(); - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - PyObject *funcs[] = { - ba_a2b_base64, ba_a2b_hex, ba_a2b_uu, - ba_a2b_qp, ba_a2b_ascii85, ba_a2b_base85, - }; - - if (which == 0 && strict) { - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "strict_mode", Py_True); - PyRef args = PyTuple_Pack(1, (PyObject *)pydata); - CHECK(args); - PyRef r = PyObject_Call(ba_a2b_base64, args, kwargs); - } else { - PyRef r = PyObject_CallFunction(funcs[which], "O", - (PyObject *)pydata); - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_BINASCII_ENCODE: Call one of 6 binary-to-text encoders from the -// binascii C module: b2a_base64 (with optional newline kwarg), b2a_hex, -// b2a_uu (input clamped to 45 bytes), b2a_qp, b2a_ascii85 (with optional -// foldspaces and wrapcol=72), b2a_base85. Fuzz selects which encoder. 
-static void op_binascii_encode(FuzzedDataProvider &fdp) { - int which = fdp.ConsumeIntegralInRange(0, 5); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - // b2a_uu requires <= 45 bytes. - if (which == 2 && data.size() > 45) data.resize(45); - - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - PyObject *funcs[] = { - ba_b2a_base64, ba_b2a_hex, ba_b2a_uu, - ba_b2a_qp, ba_b2a_ascii85, ba_b2a_base85, - }; - - if (which == 0) { - // b2a_base64 with optional newline kwarg. - bool newline = fdp.ConsumeBool(); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "newline", newline ? Py_True : Py_False); - PyRef args = PyTuple_Pack(1, (PyObject *)pydata); - CHECK(args); - PyRef r = PyObject_Call(ba_b2a_base64, args, kwargs); - } else if (which == 4) { - // b2a_ascii85 with optional foldspaces/wrapcol. - bool foldspaces = fdp.ConsumeBool(); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - if (foldspaces) - PyDict_SetItemString(kwargs, "foldspaces", Py_True); - PyRef wrapcol = PyLong_FromLong(72); - CHECK(wrapcol); - PyDict_SetItemString(kwargs, "wrapcol", wrapcol); - PyRef args = PyTuple_Pack(1, (PyObject *)pydata); - CHECK(args); - PyRef r = PyObject_Call(ba_b2a_ascii85, args, kwargs); - } else { - PyRef r = PyObject_CallFunction(funcs[which], "O", - (PyObject *)pydata); - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_BINASCII_CHECKSUM: Call either binascii.crc32(data) or -// binascii.crc_hqx(data, 0), fuzz-chosen. 
-static void op_binascii_checksum(FuzzedDataProvider &fdp) { - bool use_crc32 = fdp.ConsumeBool(); - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - if (use_crc32) { - PyRef r = PyObject_CallFunction(ba_crc32, "O", (PyObject *)pydata); - } else { - PyRef r = PyObject_CallFunction(ba_crc_hqx, "Oi", - (PyObject *)pydata, 0); - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_BINASCII_ROUNDTRIP: binascii.hexlify(data) then binascii.unhexlify() -// on the result. Exercises both directions of hex encoding. -static void op_binascii_roundtrip(FuzzedDataProvider &fdp) { - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef hexed = PyObject_CallFunction(ba_hexlify, "O", - (PyObject *)pydata); - CHECK(hexed); - PyRef r = PyObject_CallFunction(ba_unhexlify, "O", (PyObject *)hexed); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Operations — Pickle (4 ops) -// --------------------------------------------------------------------------- - -// Build a Python container from fuzz bytes for pickle.dumps operations. -// type selects: 0=bytes, 1=str, 2=list of ints, 3=tuple of ints, -// 4=set of ints, 5=frozenset of ints, 6=bytearray, 7=dict(int->None). -// Capped at 256 elements to keep serialization fast. -// str_enc selects the byte-to-str decoding (see fuzz_bytes_to_str). 
-static PyObject *build_pickle_container(int type, const uint8_t *buf, - size_t len, int str_enc) { - if (len > 256) len = 256; - switch (type) { - case 0: // raw bytes - return PyBytes_FromStringAndSize((const char *)buf, len); - case 1: { // str - std::string s((const char *)buf, len); - return fuzz_bytes_to_str(s, str_enc); - } - case 2: { // list of ints - PyObject *lst = PyList_New((Py_ssize_t)len); - if (!lst) return NULL; - for (size_t i = 0; i < len; i++) - PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); - return lst; - } - case 3: { // tuple of ints - PyObject *tup = PyTuple_New((Py_ssize_t)len); - if (!tup) return NULL; - for (size_t i = 0; i < len; i++) - PyTuple_SET_ITEM(tup, i, PyLong_FromLong(buf[i])); - return tup; - } - case 4: { // set - PyObject *lst = PyList_New((Py_ssize_t)len); - if (!lst) return NULL; - for (size_t i = 0; i < len; i++) - PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); - PyObject *s = PySet_New(lst); - Py_DECREF(lst); - return s; - } - case 5: { // frozenset - PyObject *lst = PyList_New((Py_ssize_t)len); - if (!lst) return NULL; - for (size_t i = 0; i < len; i++) - PyList_SET_ITEM(lst, i, PyLong_FromLong(buf[i])); - PyObject *s = PyFrozenSet_New(lst); - Py_DECREF(lst); - return s; - } - case 6: // bytearray - return PyByteArray_FromStringAndSize((const char *)buf, len); - case 7: { // dict.fromkeys - PyObject *d = PyDict_New(); - if (!d) return NULL; - for (size_t i = 0; i < len; i++) { - PyRef key = PyLong_FromLong(buf[i]); - if (key) PyDict_SetItem(d, key, Py_None); - } - return d; - } - default: - return PyBytes_FromStringAndSize((const char *)buf, len); - } -} - -// OP_PICKLE_DUMPS: Build a fuzz-chosen container type (see -// build_pickle_container; str containers use fuzz_bytes_to_str for -// fuzz-chosen byte-to-str decoding), then call pickle.dumps(obj, protocol=N, -// fix_imports=bool). 
Protocol is fuzz-chosen 0-5, exercising all pickle -// opcodes: MARK, SHORT_BINBYTES, BINUNICODE, EMPTY_SET, ADDITEMS, -// FROZENSET, BYTEARRAY8, SETITEMS, etc. -static void op_pickle_dumps(FuzzedDataProvider &fdp) { - int container_type = fdp.ConsumeIntegralInRange(0, 7); - int protocol = fdp.ConsumeIntegralInRange(0, 5); - bool fix_imports = fdp.ConsumeBool(); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - PyRef obj(build_pickle_container( - container_type, (const uint8_t *)data.data(), data.size(), str_enc)); - CHECK(obj); - - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef proto = PyLong_FromLong(protocol); - CHECK(proto); - PyDict_SetItemString(kwargs, "protocol", proto); - PyDict_SetItemString(kwargs, "fix_imports", - fix_imports ? Py_True : Py_False); - PyRef args = PyTuple_Pack(1, (PyObject *)obj); - CHECK(args); - PyRef r = PyObject_Call(pickle_dumps, args, kwargs); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_PICKLE_LOADS: Wrap fuzz data in BytesIO, then unpickle via one of 3 -// Unpickler subclass variants (fuzz-chosen): -// 0 — RestrictedUnpickler: blocks find_class (safe against arbitrary code) -// 1 — PersistentUnpickler: handles PERSID/BINPERSID opcodes, blocks find_class -// 2 — RestrictedUnpickler with fix_imports=True, encoding='bytes' (Py2 compat) -// Exercises the _pickle C extension's Unpickler_Type code paths. 
-static void op_pickle_loads(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 2); - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", - (PyObject *)pydata); - CHECK(bio); - - PyObject *cls = nullptr; - PyRef kwargs_ref; - switch (variant) { - case 0: // RestrictedUnpickler - cls = RestrictedUnpickler_cls; - break; - case 1: // PersistentUnpickler - cls = PersistentUnpickler_cls; - break; - case 2: { // RestrictedUnpickler with fix_imports + encoding='bytes' - cls = RestrictedUnpickler_cls; - kwargs_ref = PyRef(PyDict_New()); - CHECK(kwargs_ref); - PyDict_SetItemString(kwargs_ref, "fix_imports", Py_True); - PyRef enc = PyUnicode_FromString("bytes"); - CHECK(enc); - PyDict_SetItemString(kwargs_ref, "encoding", enc); - break; - } - } - - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef unpickler = PyObject_Call( - cls, args, kwargs_ref.p ? (PyObject *)kwargs_ref : NULL); - CHECK(unpickler); - PyRef r = PyObject_CallMethod(unpickler, "load", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_PICKLE_PICKLER: Create pickle.Pickler(BytesIO, protocol=N), then chain: -// .dump(list_of_ints), .clear_memo(), .dump(str), .getvalue(). -// The str object for the second dump is built via fuzz_bytes_to_str with a -// fuzz-chosen decoding. Exercises the Pickler_Type, memo proxy clear, and -// multi-dump sequences in the _pickle C extension. Protocol is fuzz-chosen 0-5. -static void op_pickle_pickler(FuzzedDataProvider &fdp) { - int protocol = fdp.ConsumeIntegralInRange(0, 5); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - PyRef bio = PyObject_CallFunction(bytesio_ctor, NULL); - CHECK(bio); - - // Import pickle.Pickler (cached after first call). 
- static PyObject *pickle_Pickler = nullptr; - if (!pickle_Pickler) { - pickle_Pickler = import_attr("pickle", "Pickler"); - } - - PyRef pickler = PyObject_CallFunction(pickle_Pickler, "Oi", - (PyObject *)bio, protocol); - CHECK(pickler); - - // Build first object: list of ints. - PyRef obj1(build_pickle_container( - 2, (const uint8_t *)data.data(), data.size(), str_enc)); - CHECK(obj1); - - PyRef r1 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj1); - if (!r1) { - PyErr_Clear(); - return; - } - - PyRef cm = PyObject_CallMethod(pickler, "clear_memo", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - - // Build second object: string. - PyRef obj2(fuzz_bytes_to_str(data, str_enc)); - CHECK(obj2); - PyRef r2 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj2); - if (PyErr_Occurred()) PyErr_Clear(); - - PyRef val = PyObject_CallMethod(bio, "getvalue", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_PICKLE_ROUNDTRIP: Build a fuzz-chosen container (str containers use -// fuzz_bytes_to_str for fuzz-chosen byte-to-str decoding), pickle.dumps() it, -// then pickle.loads() the result. Exercises both Pickler and Unpickler in -// a single iteration, ensuring round-trip consistency. 
-static void op_pickle_roundtrip(FuzzedDataProvider &fdp) { - int container_type = fdp.ConsumeIntegralInRange(0, 7); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - PyRef obj(build_pickle_container( - container_type, (const uint8_t *)data.data(), data.size(), str_enc)); - CHECK(obj); - - PyRef dumped = PyObject_CallFunction(pickle_dumps, "O", (PyObject *)obj); - if (!dumped) { - PyErr_Clear(); - return; - } - PyRef loaded = PyObject_CallFunction(pickle_loads, "O", - (PyObject *)dumped); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Operations — Codecs (5 ops) -// -// These exercise the _multibytecodec C engine and per-language codec -// C modules (_codecs_jp, _codecs_cn, _codecs_kr, _codecs_hk, _codecs_tw, -// _codecs_iso2022) as well as built-in codecs (utf-7/8/16/32, ascii, -// latin-1, charmap, unicode_escape, raw_unicode_escape, cp1252). -// --------------------------------------------------------------------------- - -// Codec names for OP_CODECS_DECODE: 17 decoders covering multibyte CJK -// codecs plus single-byte and Unicode escape codecs. -static const char *kCodecDecoders[] = { - "utf-7", "shift_jis", "euc-jp", "gb2312", "big5", "iso-2022-jp", - "euc-kr", "gb18030", "big5hkscs", "charmap", "ascii", "latin-1", - "cp1252", "unicode_escape", "raw_unicode_escape", "utf-16", "utf-32", -}; -static constexpr int kNumCodecDecoders = - sizeof(kCodecDecoders) / sizeof(kCodecDecoders[0]); - -// Codec names for OP_CODECS_ENCODE: 19 encoders covering multibyte CJK -// codecs plus Unicode, UTF, and single-byte encoders. 
-static const char *kCodecEncoders[] = { - "shift_jis", "euc-jp", "gb2312", "big5", "iso-2022-jp", "euc-kr", - "gb18030", "big5hkscs", "unicode_escape", "raw_unicode_escape", - "utf-7", "utf-8", "utf-16", "utf-16-le", "utf-16-be", "utf-32", - "latin-1", "ascii", "charmap", -}; -static constexpr int kNumCodecEncoders = - sizeof(kCodecEncoders) / sizeof(kCodecEncoders[0]); - -// OP_CODECS_DECODE: Call codecs.decode(bytes, codec, 'replace') with a -// fuzz-chosen codec from 17 decoders. The 'replace' error handler ensures -// no UnicodeDecodeError is raised. Exercises the multibytecodec_decode and -// built-in codec decode paths. -static void op_codecs_decode(FuzzedDataProvider &fdp) { - int ci = fdp.ConsumeIntegralInRange(0, kNumCodecDecoders - 1); - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallFunction(codecs_decode, "Oss", - (PyObject *)pydata, - kCodecDecoders[ci], "replace"); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_CODECS_ENCODE: Convert fuzz bytes to a Python str using a fuzz-chosen -// decoding (Latin-1, UTF-8, UTF-16-LE, or UTF-32-LE — see fuzz_bytes_to_str), -// then call codecs.encode(str, codec, 'replace') with a fuzz-chosen codec -// from 19 encoders. Exercises the multibytecodec_encode and built-in codec -// encode paths. 
-static void op_codecs_encode(FuzzedDataProvider &fdp) { - int ci = fdp.ConsumeIntegralInRange(0, kNumCodecEncoders - 1); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - PyRef r = PyObject_CallFunction(codecs_encode, "Oss", - (PyObject *)pystr, - kCodecEncoders[ci], "replace"); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_CODECS_INCREMENTAL_DECODE: Get an IncrementalDecoder for a fuzz-chosen -// codec from {shift_jis, gb18030, utf-16}, split the fuzz data at the -// midpoint, then: .decode(first_half), .decode(second_half, final=True), -// .getstate(), .reset(). Exercises the stateful incremental decoding path -// in _multibytecodec (MultibyteIncrementalDecoder_Type). -static void op_codecs_incremental_decode(FuzzedDataProvider &fdp) { - static const char *kIncCodecs[] = {"shift_jis", "gb18030", "utf-16"}; - int ci = fdp.ConsumeIntegralInRange(0, 2); - std::string data = fdp.ConsumeRemainingBytesAsString(); - size_t mid = data.size() / 2; - - PyRef codec_name = PyUnicode_FromString(kIncCodecs[ci]); - CHECK(codec_name); - PyRef decoder_factory = PyObject_CallFunction( - codecs_getincrementaldecoder, "O", (PyObject *)codec_name); - CHECK(decoder_factory); - - PyRef decoder = PyObject_CallFunction(decoder_factory, "s", "replace"); - CHECK(decoder); - - PyRef half1 = PyBytes_FromStringAndSize(data.data(), mid); - CHECK(half1); - PyRef r1 = PyObject_CallMethod(decoder, "decode", "O", - (PyObject *)half1); - if (!r1) { - PyErr_Clear(); - return; - } - - PyRef half2 = PyBytes_FromStringAndSize(data.data() + mid, - data.size() - mid); - CHECK(half2); - PyRef r2 = PyObject_CallMethod(decoder, "decode", "Oi", - (PyObject *)half2, 1); - if (PyErr_Occurred()) PyErr_Clear(); - - PyRef state = PyObject_CallMethod(decoder, "getstate", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef reset = 
PyObject_CallMethod(decoder, "reset", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_CODECS_INCREMENTAL_ENCODE: Get an IncrementalEncoder for a fuzz-chosen -// codec from {shift_jis, utf-8}. Convert fuzz bytes to str via fuzz-chosen -// decoding (see fuzz_bytes_to_str), split the resulting string at the -// midpoint, then: .encode(first_half), .reset(), .encode(second_half), -// .getstate(). Exercises the stateful incremental encoding path in -// _multibytecodec (MultibyteIncrementalEncoder_Type). -static void op_codecs_incremental_encode(FuzzedDataProvider &fdp) { - static const char *kIncCodecs[] = {"shift_jis", "utf-8"}; - int ci = fdp.ConsumeIntegralInRange(0, 1); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - Py_ssize_t mid = slen / 2; - - PyRef codec_name = PyUnicode_FromString(kIncCodecs[ci]); - CHECK(codec_name); - PyRef encoder_factory = PyObject_CallFunction( - codecs_getincrementalencoder, "O", (PyObject *)codec_name); - CHECK(encoder_factory); - - PyRef encoder = PyObject_CallFunction(encoder_factory, "s", "replace"); - CHECK(encoder); - - PyRef half1 = PyUnicode_Substring(pystr, 0, mid); - CHECK(half1); - PyRef r1 = PyObject_CallMethod(encoder, "encode", "O", - (PyObject *)half1); - if (!r1) { - PyErr_Clear(); - return; - } - - PyRef reset_r = PyObject_CallMethod(encoder, "reset", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - - PyRef half2 = PyUnicode_Substring(pystr, mid, slen); - CHECK(half2); - PyRef r2 = PyObject_CallMethod(encoder, "encode", "O", - (PyObject *)half2); - if (PyErr_Occurred()) PyErr_Clear(); - - PyRef state = PyObject_CallMethod(encoder, "getstate", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_CODECS_STREAM: Wrap fuzz data in BytesIO, create a UTF-8 StreamReader -// via 
codecs.getreader('utf-8')(bio, errors='replace'), then .read(). -// Exercises the StreamReader code path (MultibyteStreamReader_Type for -// multibyte codecs, or built-in StreamReader for UTF-8). -static void op_codecs_stream(FuzzedDataProvider &fdp) { - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", - (PyObject *)pydata); - CHECK(bio); - - PyRef reader_factory = PyObject_CallFunction( - codecs_getreader, "s", "utf-8"); - CHECK(reader_factory); - - PyRef reader = PyObject_CallFunction(reader_factory, "Os", - (PyObject *)bio, "replace"); - CHECK(reader); - - PyRef r = PyObject_CallMethod(reader, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Operations — SSL (1 op) -// --------------------------------------------------------------------------- - -// OP_SSL_CERT: Call ssl.DER_cert_to_PEM_cert(data) to attempt DER-to-PEM -// certificate conversion. If successful, create an SSLContext with -// PROTOCOL_TLS_CLIENT and call .load_verify_locations(cadata=pem_string) -// to exercise the OpenSSL certificate parsing path in the _ssl C module. -static void op_ssl_cert(FuzzedDataProvider &fdp) { - std::string data = fdp.ConsumeRemainingBytesAsString(); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef pem = PyObject_CallFunction(ssl_DER_cert_to_PEM_cert, "O", - (PyObject *)pydata); - if (!pem) { - PyErr_Clear(); - return; - } - - // Optionally try to load into SSLContext. 
- PyRef ctx = PyObject_CallFunction(ssl_SSLContext, "l", - ssl_PROTOCOL_TLS_CLIENT_val); - if (!ctx) { - PyErr_Clear(); - return; - } - - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "cadata", pem); - PyRef empty_args = PyTuple_New(0); - CHECK(empty_args); - PyRef method = PyObject_GetAttrString(ctx, "load_verify_locations"); - if (!method) { - PyErr_Clear(); - return; - } - PyRef r = PyObject_Call(method, empty_args, kwargs); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Dispatch. -// --------------------------------------------------------------------------- - -enum Op { - OP_ZLIB_DECOMPRESS, - OP_ZLIB_COMPRESS, - OP_ZLIB_CHECKSUM, - OP_BZ2, - OP_LZMA_DECOMPRESS, - OP_LZMA_COMPRESS, - OP_BINASCII_DECODE, - OP_BINASCII_ENCODE, - OP_BINASCII_CHECKSUM, - OP_BINASCII_ROUNDTRIP, - OP_PICKLE_DUMPS, - OP_PICKLE_LOADS, - OP_PICKLE_PICKLER, - OP_PICKLE_ROUNDTRIP, - OP_CODECS_DECODE, - OP_CODECS_ENCODE, - OP_CODECS_INCREMENTAL_DECODE, - OP_CODECS_INCREMENTAL_ENCODE, - OP_CODECS_STREAM, - OP_SSL_CERT, - NUM_OPS -}; - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - assert(Py_IsInitialized()); - init_decode(); - if (size < 1 || size > kMaxInputSize) return 0; - if (PyErr_Occurred()) PyErr_Clear(); - - FuzzedDataProvider fdp(data, size); - switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { - case OP_ZLIB_DECOMPRESS: - op_zlib_decompress(fdp); - break; - case OP_ZLIB_COMPRESS: - op_zlib_compress(fdp); - break; - case OP_ZLIB_CHECKSUM: - op_zlib_checksum(fdp); - break; - case OP_BZ2: - op_bz2(fdp); - break; - case OP_LZMA_DECOMPRESS: - op_lzma_decompress(fdp); - break; - case OP_LZMA_COMPRESS: - op_lzma_compress(fdp); - break; - case OP_BINASCII_DECODE: - op_binascii_decode(fdp); - break; - case OP_BINASCII_ENCODE: - op_binascii_encode(fdp); - break; - case OP_BINASCII_CHECKSUM: - op_binascii_checksum(fdp); - break; - case 
OP_BINASCII_ROUNDTRIP: - op_binascii_roundtrip(fdp); - break; - case OP_PICKLE_DUMPS: - op_pickle_dumps(fdp); - break; - case OP_PICKLE_LOADS: - op_pickle_loads(fdp); - break; - case OP_PICKLE_PICKLER: - op_pickle_pickler(fdp); - break; - case OP_PICKLE_ROUNDTRIP: - op_pickle_roundtrip(fdp); - break; - case OP_CODECS_DECODE: - op_codecs_decode(fdp); - break; - case OP_CODECS_ENCODE: - op_codecs_encode(fdp); - break; - case OP_CODECS_INCREMENTAL_DECODE: - op_codecs_incremental_decode(fdp); - break; - case OP_CODECS_INCREMENTAL_ENCODE: - op_codecs_incremental_encode(fdp); - break; - case OP_CODECS_STREAM: - op_codecs_stream(fdp); - break; - case OP_SSL_CERT: - op_ssl_cert(fdp); - break; - } - - if (++gc_counter % kGcInterval == 0) PyGC_Collect(); - return 0; -} diff --git a/module-fuzzers/fuzz_helpers.h b/module-fuzzers/fuzz_helpers.h index c9c270a..7d089a4 100644 --- a/module-fuzzers/fuzz_helpers.h +++ b/module-fuzzers/fuzz_helpers.h @@ -36,6 +36,10 @@ extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { status = Py_InitializeFromConfig(&config); if (PyStatus_Exception(status)) goto fail; PyConfig_Clear(&config); + + // Suppress Python warnings globally — all fuzzers want this. + PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); + return 0; fail: PyConfig_Clear(&config); @@ -117,22 +121,35 @@ static PyObject *fuzz_bytes_to_str(const std::string &data, int method) { return PyUnicode_DecodeUTF16( data.data(), data.size(), "replace", &order); } - case 3: { + default: { int order = -1; // little-endian return PyUnicode_DecodeUTF32( data.data(), data.size(), "replace", &order); } } - return PyUnicode_DecodeLatin1(Y(data), NULL); // unreachable +} + +// Run a Python code string and extract a named attribute from the resulting +// globals dict. Returns a new reference. Aborts on failure — called only +// during one-time init. 
+static PyObject *run_python_and_get(const char *code, const char *name) { + PyObject *globals = PyDict_New(); + if (!globals) { PyErr_Print(); abort(); } + PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); + PyObject *r = PyRun_String(code, Py_file_input, globals, globals); + if (!r) { PyErr_Print(); Py_DECREF(globals); abort(); } + Py_DECREF(r); + PyObject *attr = PyDict_GetItemString(globals, name); // borrowed + if (!attr) { PyErr_Print(); Py_DECREF(globals); abort(); } + Py_INCREF(attr); + Py_DECREF(globals); + return attr; } // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- -// How often (in iterations) to run PyGC_Collect(). -static constexpr int kGcInterval = 200; - // Maximum fuzz input size (1 MB). static constexpr size_t kMaxInputSize = 0x100000; diff --git a/module-fuzzers/fuzz_ioops.cpp b/module-fuzzers/fuzz_ioops.cpp deleted file mode 100644 index a8dbd49..0000000 --- a/module-fuzzers/fuzz_ioops.cpp +++ /dev/null @@ -1,1015 +0,0 @@ -// fuzz_ioops.cpp — Fuzzer for CPython's I/O C extension modules. 
-// -// This fuzzer exercises the following CPython C extension modules via -// their Python API, called through the Python C API from C++: -// -// _io/bytesio.c — BytesIO: write, seek, read, readline, readlines, -// readinto, read1, readinto1, getvalue, getbuffer, -// truncate, tell, iteration, peek (via BufferedReader) -// _io/textio.c — TextIOWrapper: write, read, readline, readlines, -// flush, seek, reconfigure, detach, properties -// (readable/writable/seekable/encoding/buffer), -// IncrementalNewlineDecoder -// _io/bufferedio.c — BufferedReader, BufferedWriter, BufferedRandom, -// BufferedRWPair: read, write, peek, read1, readline, -// seek, tell, truncate, flush, detach, raw -// _io/fileio.c — FileIO: read, readall, readinto, write, flush, -// tell, seek, truncate, fileno, isatty, name, mode, -// closefd, readable, writable, seekable -// _io/_iomodule.c — io.open() with various modes (r, rb, w, wb) -// _io/stringio.c — StringIO: write, seek, readline, readlines, -// truncate, tell, close -// -// The first byte of fuzz input selects one of 7 operation types. Each -// operation consumes further bytes via FuzzedDataProvider to parameterize -// the call (encoding, error handler, newline mode, I/O variant). -// -// All module functions and class constructors are imported once during init -// and cached as static PyObject* pointers. Temporary directory and test file -// are created once at init. PyRef (RAII) prevents reference leaks. -// PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. - -#include "fuzz_helpers.h" - -// --------------------------------------------------------------------------- -// Cached module objects, initialized once. 
-// --------------------------------------------------------------------------- - -// io classes -static PyObject *io_BytesIO, *io_TextIOWrapper; -static PyObject *io_BufferedReader, *io_BufferedWriter; -static PyObject *io_BufferedRandom, *io_BufferedRWPair; -static PyObject *io_FileIO, *io_open, *io_StringIO; -static PyObject *io_IncrementalNewlineDecoder; - -// os -static PyObject *os_path_join, *os_open_fn, *os_unlink; -static PyObject *os_O_RDONLY; - -// Temp paths (as C strings). -static char tmpdir[256]; -static char tmpfile_path[256]; - -static unsigned long gc_counter = 0; - -static int initialized = 0; - -static void init_ioops(void) { - if (initialized) return; - - // io - io_BytesIO = import_attr("io", "BytesIO"); - io_TextIOWrapper = import_attr("io", "TextIOWrapper"); - io_BufferedReader = import_attr("io", "BufferedReader"); - io_BufferedWriter = import_attr("io", "BufferedWriter"); - io_BufferedRandom = import_attr("io", "BufferedRandom"); - io_BufferedRWPair = import_attr("io", "BufferedRWPair"); - io_FileIO = import_attr("io", "FileIO"); - io_open = import_attr("io", "open"); - io_StringIO = import_attr("io", "StringIO"); - io_IncrementalNewlineDecoder = import_attr("io", - "IncrementalNewlineDecoder"); - - // os - os_path_join = import_attr("os.path", "join"); - os_open_fn = import_attr("os", "open"); - os_unlink = import_attr("os", "unlink"); - os_O_RDONLY = import_attr("os", "O_RDONLY"); - - // Create temp directory and test file. 
- { - PyObject *globals = PyDict_New(); - PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); - PyObject *r = PyRun_String( - "import tempfile, os\n" - "_tmpdir = tempfile.mkdtemp(prefix='fuzz_io_')\n" - "_tmpfile = os.path.join(_tmpdir, 'test')\n" - "with open(_tmpfile, 'wb') as f:\n" - " f.write(b'A' * 4096)\n", - Py_file_input, globals, globals); - if (!r) { PyErr_Print(); abort(); } - Py_DECREF(r); - PyObject *td = PyDict_GetItemString(globals, "_tmpdir"); - PyObject *tf = PyDict_GetItemString(globals, "_tmpfile"); - const char *td_str = PyUnicode_AsUTF8(td); - const char *tf_str = PyUnicode_AsUTF8(tf); - snprintf(tmpdir, sizeof(tmpdir), "%s", td_str); - snprintf(tmpfile_path, sizeof(tmpfile_path), "%s", tf_str); - Py_DECREF(globals); - } - - // Suppress warnings. - PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); - - assert(!PyErr_Occurred()); - initialized = 1; -} - -// Helper: Build a temp file path. -static PyObject *make_tmppath(const char *name) { - return PyObject_CallFunction(os_path_join, "ss", tmpdir, name); -} - -// Helper: Unlink a file (ignore errors). -static void unlink_path(PyObject *path) { - PyRef r = PyObject_CallFunction(os_unlink, "O", path); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Operations (7 ops). -// --------------------------------------------------------------------------- - -// OP_BYTESIO: BytesIO with fuzz data, then FDP selects actions. -// Exercises _io/bytesio.c paths. -static void op_bytesio(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 6); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - switch (variant) { - case 0: { - // Basic: write/seek/read/getvalue/tell. 
- PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); - CHECK(bio); - PyRef wr = PyObject_CallMethod(bio, "write", "O", (PyObject *)pydata); - if (!wr) { PyErr_Clear(); break; } - PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef rd = PyObject_CallMethod(bio, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef tl = PyObject_CallMethod(bio, "tell", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 1: { - // readline, readlines, readinto. - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - { - PyRef r = PyObject_CallMethod(bio, "readline", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r = PyObject_CallMethod(bio, "readlines", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef buf = PyByteArray_FromStringAndSize(NULL, 32); - CHECK(buf); - PyRef r = PyObject_CallMethod(bio, "readinto", "O", (PyObject *)buf); - if (PyErr_Occurred()) PyErr_Clear(); - } - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 2: { - // truncate + write + getvalue. - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - long trunc_at = data.size() < 64 ? 
data.size() : 64; - PyRef tr = PyObject_CallMethod(bio, "truncate", "l", trunc_at); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef wr = PyObject_CallMethod(bio, "write", "y#", "XX", 2); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef gv = PyObject_CallMethod(bio, "getvalue", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 3: { - // getbuffer (memoryview). - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef mv = PyObject_CallMethod(bio, "getbuffer", NULL); - if (mv) { - PyRef bytes_val = PyObject_CallFunction( - (PyObject *)&PyBytes_Type, "O", (PyObject *)mv); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef rel = PyObject_CallMethod(mv, "release", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 4: { - // read1, readinto1. - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - { - PyRef r = PyObject_CallMethod(bio, "read1", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(bio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef buf = PyByteArray_FromStringAndSize(NULL, 32); - CHECK(buf); - PyRef r = PyObject_CallMethod(bio, "readinto1", "O", (PyObject *)buf); - if (PyErr_Occurred()) PyErr_Clear(); - } - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 5: { - // Iteration. 
- PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef it = PyObject_GetIter(bio); - if (it) { - PyObject *line; - while ((line = PyIter_Next(it)) != NULL) - Py_DECREF(line); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - PyRef cl = PyObject_CallMethod(bio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 6: { - // Peek via BufferedReader wrapping. - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef br = PyObject_CallFunction(io_BufferedReader, "O", - (PyObject *)bio); - CHECK(br); - { - PyRef r = PyObject_CallMethod(br, "peek", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "read", "i", 8); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "read1", "i", 8); - if (PyErr_Occurred()) PyErr_Clear(); - } - PyRef cl = PyObject_CallMethod(br, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - } -} - -// OP_TEXTIOWRAPPER: FDP selects encoding, errors, newline. Create BytesIO + -// TextIOWrapper. Exercises _io/textio.c paths. -static void op_textiowrapper(FuzzedDataProvider &fdp) { - static const char *kEncodings[] = {"utf-8", "latin-1", "ascii", "utf-16"}; - static const char *kErrors[] = { - "strict", "replace", "xmlcharrefreplace", "backslashreplace", - }; - // NULL = universal newline mode. 
- static const char *kNewlines[] = {NULL, "\n", "\r\n", ""}; - - int enc_idx = fdp.ConsumeIntegralInRange(0, 3); - int err_idx = fdp.ConsumeIntegralInRange(0, 3); - int nl_idx = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 4); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - const char *encoding = kEncodings[enc_idx]; - const char *errors = kErrors[err_idx]; - const char *newline = kNewlines[nl_idx]; - - switch (variant) { - case 0: { - // Write mode: write string, flush, seek, read. - PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); - CHECK(bio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef enc_str = PyUnicode_FromString(encoding); - CHECK(enc_str); - PyDict_SetItemString(kwargs, "encoding", enc_str); - PyRef err_str = PyUnicode_FromString(errors); - CHECK(err_str); - PyDict_SetItemString(kwargs, "errors", err_str); - if (newline) { - PyRef nl_str = PyUnicode_FromString(newline); - CHECK(nl_str); - PyDict_SetItemString(kwargs, "newline", nl_str); - } - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); - CHECK(tw); - - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - { - PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 1: { - // Read mode: BytesIO(data) + TextIOWrapper, read/readline/readlines. 
- PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef enc_str = PyUnicode_FromString(encoding); - CHECK(enc_str); - PyDict_SetItemString(kwargs, "encoding", enc_str); - PyRef err_str = PyUnicode_FromString("replace"); - CHECK(err_str); - PyDict_SetItemString(kwargs, "errors", err_str); - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); - CHECK(tw); - - // readline x3. - for (int i = 0; i < 3; i++) { - PyRef r = PyObject_CallMethod(tw, "readline", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r = PyObject_CallMethod(tw, "readlines", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(tw, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r = PyObject_CallMethod(tw, "read", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r2 = PyObject_CallMethod(tw, "read", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r3 = PyObject_CallMethod(tw, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 2: { - // Reconfigure: write, reconfigure, write more, read. 
- PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); - CHECK(bio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef enc_str = PyUnicode_FromString("utf-8"); - CHECK(enc_str); - PyDict_SetItemString(kwargs, "encoding", enc_str); - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); - CHECK(tw); - - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - { - PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef kw = PyDict_New(); - CHECK(kw); - PyRef nl = PyUnicode_FromString("\n"); - CHECK(nl); - PyDict_SetItemString(kw, "newline", nl); - PyDict_SetItemString(kw, "line_buffering", Py_True); - PyRef empty = PyTuple_New(0); - CHECK(empty); - PyRef r = PyObject_Call( - PyObject_GetAttrString(tw, "reconfigure"), empty, kw); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sub = PyUnicode_Substring(pystr, 0, 32); - if (sub) { - PyRef r = PyObject_CallMethod(tw, "write", "O", (PyObject *)sub); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - { - PyRef r = PyObject_CallMethod(tw, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef rd = PyObject_CallMethod(tw, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 3: { - // Detach. 
- PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef enc_str = PyUnicode_FromString("utf-8"); - CHECK(enc_str); - PyDict_SetItemString(kwargs, "encoding", enc_str); - PyRef err_str = PyUnicode_FromString("replace"); - CHECK(err_str); - PyDict_SetItemString(kwargs, "errors", err_str); - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); - CHECK(tw); - { - PyRef r = PyObject_CallMethod(tw, "read", "i", 4); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef raw = PyObject_CallMethod(tw, "detach", NULL); - if (raw) { - PyRef rd = PyObject_CallMethod(raw, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(raw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - break; - } - case 4: { - // Properties: writable/readable/seekable/encoding/buffer. 
- PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef enc_str = PyUnicode_FromString("utf-8"); - CHECK(enc_str); - PyDict_SetItemString(kwargs, "encoding", enc_str); - PyRef err_str = PyUnicode_FromString("replace"); - CHECK(err_str); - PyDict_SetItemString(kwargs, "errors", err_str); - PyRef args = PyTuple_Pack(1, (PyObject *)bio); - CHECK(args); - PyRef tw = PyObject_Call(io_TextIOWrapper, args, kwargs); - CHECK(tw); - { - PyRef r = PyObject_CallMethod(tw, "writable", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "readable", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "seekable", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_GetAttrString(tw, "encoding"); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_GetAttrString(tw, "buffer"); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(tw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - } -} - -// OP_BUFFERED_IO: FDP selects variant — BufferedReader, BufferedWriter, -// BufferedRandom, BufferedRWPair. Exercises _io/bufferedio.c paths. -static void op_buffered_io(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - switch (variant) { - case 0: { - // BufferedReader wrapping BytesIO. 
- PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef br = PyObject_CallFunction(io_BufferedReader, "O", - (PyObject *)bio); - CHECK(br); - { - PyRef r = PyObject_CallMethod(br, "read", "i", 64); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "peek", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "read1", "i", 16); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(br, "readline", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef raw = PyObject_GetAttrString(br, "raw"); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef det = PyObject_CallMethod(br, "detach", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 1: { - // BufferedWriter wrapping BytesIO. - PyRef bio = PyObject_CallFunction(io_BytesIO, NULL); - CHECK(bio); - PyRef bw = PyObject_CallFunction(io_BufferedWriter, "O", - (PyObject *)bio); - CHECK(bw); - { - PyRef r = PyObject_CallMethod(bw, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(bw, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(bw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 2: { - // BufferedRandom wrapping BytesIO. 
- PyRef bio = PyObject_CallFunction(io_BytesIO, "O", (PyObject *)pydata); - CHECK(bio); - PyRef brnd = PyObject_CallFunction(io_BufferedRandom, "O", - (PyObject *)bio); - CHECK(brnd); - { - PyRef r = PyObject_CallMethod(brnd, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(brnd, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(brnd, "read", "i", 64); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(brnd, "tell", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - long trunc_at = data.size() < 64 ? data.size() : 64; - PyRef r = PyObject_CallMethod(brnd, "truncate", "l", trunc_at); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(brnd, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 3: { - // BufferedRWPair. - PyRef r_bio = PyObject_CallFunction(io_BytesIO, "O", - (PyObject *)pydata); - CHECK(r_bio); - PyRef w_bio = PyObject_CallFunction(io_BytesIO, NULL); - CHECK(w_bio); - PyRef rw = PyObject_CallFunction(io_BufferedRWPair, "OO", - (PyObject *)r_bio, (PyObject *)w_bio); - CHECK(rw); - { - PyRef r = PyObject_CallMethod(rw, "read", "i", 32); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(rw, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(rw, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - } -} - -// OP_FILEIO: FDP selects mode — read, write, read+write. -// Exercises _io/fileio.c paths. -static void op_fileio(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 2); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - switch (variant) { - case 0: { - // Read from tmpfile. 
- PyRef fio = PyObject_CallFunction(io_FileIO, "ss", - tmpfile_path, "r"); - CHECK(fio); - { - PyRef r = PyObject_CallMethod(fio, "read", "i", 64); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "readall", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(fio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef buf = PyByteArray_FromStringAndSize(NULL, 64); - CHECK(buf); - PyRef r = PyObject_CallMethod(fio, "readinto", "O", (PyObject *)buf); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "fileno", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "isatty", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_GetAttrString(fio, "name"); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_GetAttrString(fio, "mode"); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "readable", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "seekable", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 1: { - // Write to temp file. 
- PyRef path(make_tmppath("fio_w")); - CHECK(path); - PyRef fio = PyObject_CallFunction(io_FileIO, "Os", - (PyObject *)path, "w"); - CHECK(fio); - { - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "flush", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "tell", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - unlink_path(path); - break; - } - case 2: { - // Read+write mode. - PyRef path(make_tmppath("fio_rw")); - CHECK(path); - PyRef fio = PyObject_CallFunction(io_FileIO, "Os", - (PyObject *)path, "w+b"); - CHECK(fio); - { - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - PyRef r = PyObject_CallMethod(fio, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "read", "i", 32); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "truncate", "i", 128); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(fio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - unlink_path(path); - break; - } - } -} - -// OP_IO_OPEN: FDP selects mode — read text, read binary, write text, write binary. -// Exercises _io/_iomodule.c open() paths. -static void op_io_open(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 3); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - - switch (variant) { - case 0: { - // Read text from tmpfile. 
- PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "r"); - CHECK(f); - PyRef r = PyObject_CallMethod(f, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(f, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 1: { - // Read binary from tmpfile. - PyRef f = PyObject_CallFunction(io_open, "ss", tmpfile_path, "rb"); - CHECK(f); - PyRef r = PyObject_CallMethod(f, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(f, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 2: { - // Write text. - PyRef path(make_tmppath("ioopen_w")); - CHECK(path); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - PyRef f = PyObject_CallFunction(io_open, "Os", (PyObject *)path, "w"); - CHECK(f); - PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(f, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - unlink_path(path); - break; - } - case 3: { - // Write binary then read back. - PyRef path(make_tmppath("ioopen_wb")); - CHECK(path); - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - { - PyRef f = PyObject_CallFunction(io_open, "Os", - (PyObject *)path, "wb"); - CHECK(f); - PyRef r = PyObject_CallMethod(f, "write", "O", (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(f, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - // Read back. 
- { - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef err = PyUnicode_FromString("replace"); - CHECK(err); - PyDict_SetItemString(kwargs, "errors", err); - PyRef args = PyTuple_Pack(1, (PyObject *)path); - CHECK(args); - PyRef f = PyObject_Call(io_open, args, kwargs); - if (f) { - PyRef r = PyObject_CallMethod(f, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef cl = PyObject_CallMethod(f, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - unlink_path(path); - break; - } - } -} - -// OP_NEWLINE_DECODER: FDP selects translate mode. Create -// IncrementalNewlineDecoder, split str at midpoint, decode halves. -// Exercises _io/textio.c's newline decoder paths. -static void op_newline_decoder(FuzzedDataProvider &fdp) { - bool translate = fdp.ConsumeBool(); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef dec = PyObject_CallFunction(io_IncrementalNewlineDecoder, "OO", - Py_None, - translate ? 
Py_True : Py_False); - CHECK(dec); - - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - Py_ssize_t mid = slen / 2; - - PyRef half1 = PyUnicode_Substring(pystr, 0, mid); - CHECK(half1); - PyRef half2 = PyUnicode_Substring(pystr, mid, slen); - CHECK(half2); - - { - PyRef r = PyObject_CallMethod(dec, "decode", "O", (PyObject *)half1); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(dec, "decode", "Oi", - (PyObject *)half2, 1); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef state = PyObject_CallMethod(dec, "getstate", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - - PyRef reset = PyObject_CallMethod(dec, "reset", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - - if (state && state.p != Py_None) { - PyRef ss = PyObject_CallMethod(dec, "setstate", "O", - (PyObject *)state); - if (PyErr_Occurred()) PyErr_Clear(); - } - } -} - -// OP_STRINGIO: StringIO write/readline/readlines/truncate/close. -// Exercises _io/stringio.c paths. -static void op_stringio(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 1); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef sio = PyObject_CallFunction(io_StringIO, NULL); - CHECK(sio); - - { - PyRef r = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr); - if (!r) { PyErr_Clear(); return; } - } - { - PyRef r = PyObject_CallMethod(sio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - } - - if (variant == 0) { - // readline x3 + readlines. 
- for (int i = 0; i < 3; i++) { - PyRef r = PyObject_CallMethod(sio, "readline", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r = PyObject_CallMethod(sio, "readlines", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - } else { - // readlines on initial content. - PyRef r = PyObject_CallMethod(sio, "readlines", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - - { - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - long trunc_at = slen < 64 ? slen : 64; - PyRef r = PyObject_CallMethod(sio, "truncate", "l", trunc_at); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(sio, "tell", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(sio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// --------------------------------------------------------------------------- -// Dispatch. -// --------------------------------------------------------------------------- - -enum Op { - OP_BYTESIO, - OP_TEXTIOWRAPPER, - OP_BUFFERED_IO, - OP_FILEIO, - OP_IO_OPEN, - OP_NEWLINE_DECODER, - OP_STRINGIO, - NUM_OPS -}; - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - assert(Py_IsInitialized()); - init_ioops(); - if (size < 1 || size > 0x10000) return 0; - if (PyErr_Occurred()) PyErr_Clear(); - - FuzzedDataProvider fdp(data, size); - switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { - case OP_BYTESIO: - op_bytesio(fdp); - break; - case OP_TEXTIOWRAPPER: - op_textiowrapper(fdp); - break; - case OP_BUFFERED_IO: - op_buffered_io(fdp); - break; - case OP_FILEIO: - op_fileio(fdp); - break; - case OP_IO_OPEN: - op_io_open(fdp); - break; - case OP_NEWLINE_DECODER: - op_newline_decoder(fdp); - break; - case OP_STRINGIO: - op_stringio(fdp); - break; - } - - if (++gc_counter % kGcInterval == 0) PyGC_Collect(); - return 0; -} diff --git a/module-fuzzers/fuzz_locale.cpp 
b/module-fuzzers/fuzz_locale.cpp new file mode 100644 index 0000000..62819d7 --- /dev/null +++ b/module-fuzzers/fuzz_locale.cpp @@ -0,0 +1,73 @@ +// fuzz_locale.cpp — Fuzzer for CPython's _locale C extension module. +// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// _locale — strxfrm, strcoll +// +// All module functions are imported once during init and cached as static +// PyObject* pointers. PyRef (RAII) prevents reference leaks. +// Max input size: 64 KB. + +#include "fuzz_helpers.h" + +static PyObject *locale_strxfrm, *locale_strcoll; + +static int initialized = 0; + +static void init_locale(void) { + if (initialized) return; + + locale_strxfrm = import_attr("locale", "strxfrm"); + locale_strcoll = import_attr("locale", "strcoll"); + assert(!PyErr_Occurred()); + initialized = 1; +} + +// op_locale: fuzz data selects target — strxfrm or strcoll. +// Exercises the _locale C module. +static void op_locale(FuzzedDataProvider &fdp) { + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + + enum { STRXFRM, STRCOLL, NUM_TARGETS }; + int target_fn = fdp.ConsumeIntegralInRange(0, NUM_TARGETS - 1); + + switch (target_fn) { + case STRXFRM: { + // strxfrm: transform a string for locale-aware comparison. + std::string data = fdp.ConsumeRemainingBytesAsString(); + PyRef pystr(fuzz_bytes_to_str(data, str_enc)); + CHECK(pystr); + PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr); + break; + } + case STRCOLL: { + // strcoll: compare two substrings using locale collation rules. + // Both operands are independently produced from fuzz data. 
+ int str_enc2 = fdp.ConsumeIntegralInRange(0, 3); + size_t split = fdp.ConsumeIntegralInRange(0, fdp.remaining_bytes()); + std::string data1 = fdp.ConsumeBytesAsString(split); + std::string data2 = fdp.ConsumeRemainingBytesAsString(); + PyRef pystr1(fuzz_bytes_to_str(data1, str_enc)); + CHECK(pystr1); + PyRef pystr2(fuzz_bytes_to_str(data2, str_enc2)); + CHECK(pystr2); + PyRef r = PyObject_CallFunction(locale_strcoll, "OO", + (PyObject *)pystr1, (PyObject *)pystr2); + break; + } + } + if (PyErr_Occurred()) PyErr_Clear(); +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_locale(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + op_locale(fdp); + + return 0; +} diff --git a/module-fuzzers/fuzz_mmap.cpp b/module-fuzzers/fuzz_mmap.cpp new file mode 100644 index 0000000..e813170 --- /dev/null +++ b/module-fuzzers/fuzz_mmap.cpp @@ -0,0 +1,194 @@ +// fuzz_mmap.cpp — Fuzzer for CPython's mmap C extension module. +// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// mmap — anonymous mmap: write, find, rfind, read, readline, +// seek, resize, move, getitem, setitem, flush, size, +// tell, close +// +// All module functions are imported once during init and cached as static +// PyObject* pointers. PyRef (RAII) prevents reference leaks. +// Max input size: 64 KB. + +#include "fuzz_helpers.h" + +static PyObject *mmap_mmap; + +static int initialized = 0; + +static void init_mmap(void) { + if (initialized) return; + + mmap_mmap = import_attr("mmap", "mmap"); + assert(!PyErr_Occurred()); + initialized = 1; +} + +// op_mmap: Create anonymous mmap, write data. +// The fuzzer then selects the target. +// Exercises the mmap C module's core operations. 
+static void op_mmap(FuzzedDataProvider &fdp) { + enum { FIND_RFIND, READ_READLINE, RESIZE_MOVE, GETITEM_SETITEM, FLUSH_SIZE_TELL, READ_ALL, NUM_TARGETS }; + int target_fn = fdp.ConsumeIntegralInRange(0, NUM_TARGETS - 1); + if (fdp.remaining_bytes() == 0) return; + size_t data_len = fdp.ConsumeIntegralInRange( + 1, std::min(fdp.remaining_bytes(), (size_t)10000)); + std::string data = fdp.ConsumeBytesAsString(data_len); + if (data.empty()) data.push_back('\0'); + + // mmap(-1, size) + Py_ssize_t map_size = data.size(); + PyRef mm = PyObject_CallFunction(mmap_mmap, "in", -1, map_size); + CHECK(mm); + + // First write data and seek to 0. + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + { + PyRef r = PyObject_CallMethod(mm, "write", "O", (PyObject *)pydata); + if (!r) { PyErr_Clear(); + PyRef cl = PyObject_CallMethod(mm, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + return; + } + } + { + PyRef r = PyObject_CallMethod(mm, "seek", "i", 0); + if (!r) { PyErr_Clear(); + PyRef cl = PyObject_CallMethod(mm, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + return; + } + } + + // Now call the target method. 
+ switch (target_fn) { + case FIND_RFIND: { + // find + rfind with fuzz-driven pattern and offsets + long start = fdp.ConsumeIntegralInRange(0, map_size); + long end = fdp.ConsumeIntegralInRange(start, map_size); + std::string pat_str = fdp.ConsumeRemainingBytesAsString(); + PyRef pat = PyBytes_FromStringAndSize(pat_str.data(), pat_str.size()); + CHECK(pat); + { + PyRef r = PyObject_CallMethod(mm, "find", "Oll", + (PyObject *)pat, start, end); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(mm, "rfind", "Oll", + (PyObject *)pat, start, end); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case READ_READLINE: { + // read + readline with fuzz-driven count and seek position + { + long n = fdp.ConsumeIntegralInRange(0, map_size); + PyRef r = PyObject_CallMethod(mm, "read", "l", n); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + long pos = fdp.ConsumeIntegralInRange(0, map_size); + PyRef sk = PyObject_CallMethod(mm, "seek", "l", pos); + if (PyErr_Occurred()) PyErr_Clear(); + PyRef r = PyObject_CallMethod(mm, "readline", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case RESIZE_MOVE: { + // resize + move with fuzz-driven sizes and offsets + long new_size = fdp.ConsumeIntegralInRange(1, map_size * 4 + 1); + { + PyRef r = PyObject_CallMethod(mm, "resize", "l", new_size); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + // After resize, effective size is new_size (if resize succeeded) + // or map_size (if it failed). Use new_size as upper bound; + // mmap.move() will raise on out-of-bounds anyway. + long dest = fdp.ConsumeIntegralInRange(0, new_size - 1); + long src = fdp.ConsumeIntegralInRange(0, new_size - 1); + long max_count = new_size - std::max(dest, src); + long count = max_count > 0 + ? 
fdp.ConsumeIntegralInRange(0, max_count) + : 0; + PyRef r = PyObject_CallMethod(mm, "move", "lll", + dest, src, count); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case GETITEM_SETITEM: { + // getitem + setitem + { + PyRef idx = PyLong_FromLong(0); + CHECK(idx); + PyRef r = PyObject_GetItem(mm, idx); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + Py_ssize_t n = map_size < 4 ? map_size : 4; + PyRef start = PyLong_FromLong(0); + PyRef stop = PyLong_FromLong(n); + PyRef sl = PySlice_New(start, stop, NULL); + CHECK(sl); + PyRef r = PyObject_GetItem(mm, sl); + if (PyErr_Occurred()) PyErr_Clear(); + } + if (data.size() > 0) { + PyRef idx = PyLong_FromLong(0); + CHECK(idx); + PyRef val = PyLong_FromLong((unsigned char)data[0]); + CHECK(val); + PyObject_SetItem(mm, idx, val); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case FLUSH_SIZE_TELL: { + // flush + size + tell + { + PyRef r = PyObject_CallMethod(mm, "flush", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(mm, "size", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallMethod(mm, "tell", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case READ_ALL: { + // read all + { + PyRef r = PyObject_CallMethod(mm, "read", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + } + + { + PyRef r = PyObject_CallMethod(mm, "close", NULL); + if (PyErr_Occurred()) PyErr_Clear(); + } +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_mmap(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + op_mmap(fdp); + + return 0; +} diff --git a/module-fuzzers/fuzz_operator.cpp b/module-fuzzers/fuzz_operator.cpp new file mode 100644 index 0000000..e3a003a --- /dev/null +++ b/module-fuzzers/fuzz_operator.cpp @@ -0,0 +1,190 @@ +// fuzz_operator.cpp — Fuzzer for CPython's _operator C 
extension module. +// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// _operator — lt, gt, eq, ne, contains, countOf, indexOf, +// length_hint, concat, getitem, methodcaller +// +// All module functions are imported once during init and cached as static +// PyObject* pointers. PyRef (RAII) prevents reference leaks. +// Max input size: 64 KB. + +#include "fuzz_helpers.h" + +static PyObject *op_lt, *op_gt, *op_eq, *op_ne; +static PyObject *op_contains, *op_countOf, *op_indexOf, *op_length_hint; +static PyObject *op_concat, *op_getitem, *op_methodcaller; + +static int initialized = 0; + +static void init_operator(void) { + if (initialized) return; + + op_lt = import_attr("operator", "lt"); + op_gt = import_attr("operator", "gt"); + op_eq = import_attr("operator", "eq"); + op_ne = import_attr("operator", "ne"); + op_contains = import_attr("operator", "contains"); + op_countOf = import_attr("operator", "countOf"); + op_indexOf = import_attr("operator", "indexOf"); + op_length_hint = import_attr("operator", "length_hint"); + op_concat = import_attr("operator", "concat"); + op_getitem = import_attr("operator", "getitem"); + op_methodcaller = import_attr("operator", "methodcaller"); + assert(!PyErr_Occurred()); + initialized = 1; +} + +// op_operator: fuzzer selects operator variant — comparisons, sequence ops, +// concat, getitem, methodcaller. Exercises the _operator C module. 
+static void op_operator(FuzzedDataProvider &fdp) { + enum { COMPARISONS, SEQUENCE_OPS, CONCAT, GETITEM, METHODCALLER, CONTAINS, NUM_TARGETS }; + int target_fn = fdp.ConsumeIntegralInRange(0, NUM_TARGETS - 1); + if (fdp.remaining_bytes() == 0) return; + size_t data_len = fdp.ConsumeIntegralInRange( + 1, std::min(fdp.remaining_bytes(), (size_t)10000)); + std::string data = fdp.ConsumeBytesAsString(data_len); + + PyRef pydata = PyBytes_FromStringAndSize(Y(data)); + CHECK(pydata); + + switch (target_fn) { + case COMPARISONS: { + // Comparisons: lt/gt/eq/ne(data, other) + std::string other = fdp.ConsumeRemainingBytesAsString(); + PyRef pyother = PyBytes_FromStringAndSize(Y(other)); + CHECK(pyother); + { + PyRef r = PyObject_CallFunction(op_lt, "OO", + (PyObject *)pydata, (PyObject *)pyother); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_gt, "OO", + (PyObject *)pydata, (PyObject *)pyother); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_eq, "OO", + (PyObject *)pydata, (PyObject *)pyother); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_ne, "OO", + (PyObject *)pydata, (PyObject *)pyother); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case SEQUENCE_OPS: { + // Sequence ops: contains, countOf, indexOf, length_hint + if (data.empty()) break; + int byte = fdp.ConsumeIntegralInRange(0, 255); + PyRef byte_val = PyLong_FromLong(byte); + CHECK(byte_val); + { + PyRef r = PyObject_CallFunction(op_contains, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_countOf, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_indexOf, "OO", + (PyObject *)pydata, + (PyObject *)byte_val); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef r = PyObject_CallFunction(op_length_hint, "O", + (PyObject 
*)pydata); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case CONCAT: { + // concat(data, other) + std::string other = fdp.ConsumeRemainingBytesAsString(); + PyRef pyother = PyBytes_FromStringAndSize(Y(other)); + CHECK(pyother); + PyRef r = PyObject_CallFunction(op_concat, "OO", + (PyObject *)pydata, (PyObject *)pyother); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + case GETITEM: { + // getitem(data, idx) + getitem(data, slice) + if (data.empty()) break; + long idx = fdp.ConsumeIntegralInRange(0, data.size() - 1); + PyRef pyidx = PyLong_FromLong(idx); + CHECK(pyidx); + { + PyRef r = PyObject_CallFunction(op_getitem, "OO", + (PyObject *)pydata, (PyObject *)pyidx); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + long stop = fdp.ConsumeIntegralInRange(0, data.size()); + PyRef pystop = PyLong_FromLong(stop); + CHECK(pystop); + PyRef zero = PyLong_FromLong(0); + CHECK(zero); + PyRef sl = PySlice_New(zero, pystop, NULL); + CHECK(sl); + PyRef r = PyObject_CallFunction(op_getitem, "OO", + (PyObject *)pydata, (PyObject *)sl); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case METHODCALLER: { + // methodcaller('upper')(str) + methodcaller('encode', 'utf-8')(str) + int str_enc = fdp.ConsumeIntegralInRange(0, 3); + std::string str_data = fdp.ConsumeRemainingBytesAsString(); + PyRef pystr(fuzz_bytes_to_str(str_data, str_enc)); + CHECK(pystr); + { + PyRef mc = PyObject_CallFunction(op_methodcaller, "s", "upper"); + CHECK(mc); + PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + { + PyRef mc = PyObject_CallFunction(op_methodcaller, "ss", + "encode", "utf-8"); + CHECK(mc); + PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); + if (PyErr_Occurred()) PyErr_Clear(); + } + break; + } + case CONTAINS: { + // contains on bytes with fuzz-driven needle + std::string needle = fdp.ConsumeRemainingBytesAsString(); + PyRef pyneedle = PyBytes_FromStringAndSize(Y(needle)); + CHECK(pyneedle); + 
PyRef r = PyObject_CallFunction(op_contains, "OO", + (PyObject *)pydata, (PyObject *)pyneedle); + if (PyErr_Occurred()) PyErr_Clear(); + break; + } + } +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + assert(Py_IsInitialized()); + init_operator(); + if (size < 1 || size > 0x10000) return 0; + if (PyErr_Occurred()) PyErr_Clear(); + + FuzzedDataProvider fdp(data, size); + op_operator(fdp); + + return 0; +} diff --git a/module-fuzzers/fuzz_parsers.cpp b/module-fuzzers/fuzz_parsers.cpp deleted file mode 100644 index 2e83878..0000000 --- a/module-fuzzers/fuzz_parsers.cpp +++ /dev/null @@ -1,744 +0,0 @@ -// fuzz_parsers.cpp — Fuzzer for CPython's parser and text processing C -// extension modules. -// -// This fuzzer exercises the following CPython C extension modules via -// their Python API, called through the Python C API from C++: -// -// _json — json.dumps(), JSONEncoder with various options -// _csv — csv.Sniffer.sniff/has_header, csv.writer, -// csv.DictWriter with quoting modes -// pyexpat — ParserCreate with encodings/namespace_separator, -// Parse, ParseFile, handlers, GetInputContext -// time — strftime with fuzz format, strptime with fuzz input -// _operator — lt, gt, eq, ne, contains, countOf, indexOf, -// length_hint, concat, getitem, methodcaller -// _locale — strxfrm, strcoll, getlocale -// _opcode (via dis) — dis.dis() on compiled code -// -// The first byte of fuzz input selects one of 7 operation types. Each -// operation consumes further bytes via FuzzedDataProvider to parameterize -// the call (encoder options, parser encoding, operator selection). -// -// All module functions and class constructors are imported once during init -// and cached as static PyObject* pointers. PyRef (RAII) prevents reference -// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. 
- -#include "fuzz_helpers.h" - -// --------------------------------------------------------------------------- -// Cached module objects, initialized once. -// --------------------------------------------------------------------------- - -// json -static PyObject *json_dumps, *json_JSONEncoder; - -// csv -static PyObject *csv_Sniffer, *csv_writer, *csv_DictWriter; -static PyObject *csv_QUOTE_ALL, *csv_QUOTE_NONNUMERIC; - -// expat -static PyObject *expat_ParserCreate; - -// io -static PyObject *bytesio_ctor, *stringio_ctor; - -// time -static PyObject *time_strftime, *time_strptime, *time_localtime; - -// operator -static PyObject *op_lt, *op_gt, *op_eq, *op_ne; -static PyObject *op_contains, *op_countOf, *op_indexOf, *op_length_hint; -static PyObject *op_concat, *op_getitem, *op_methodcaller; - -// dis -static PyObject *dis_dis; - -// locale -static PyObject *locale_strxfrm, *locale_strcoll, *locale_getlocale; - -// Handler lambdas (for expat). -static PyObject *noop_handler, *noop_handler_noargs; - -static unsigned long gc_counter = 0; - -static int initialized = 0; - -static void init_parsers(void) { - if (initialized) return; - - // json - json_dumps = import_attr("json", "dumps"); - json_JSONEncoder = import_attr("json", "JSONEncoder"); - - // csv - csv_Sniffer = import_attr("csv", "Sniffer"); - csv_writer = import_attr("csv", "writer"); - csv_DictWriter = import_attr("csv", "DictWriter"); - csv_QUOTE_ALL = import_attr("csv", "QUOTE_ALL"); - csv_QUOTE_NONNUMERIC = import_attr("csv", "QUOTE_NONNUMERIC"); - - // expat - expat_ParserCreate = import_attr("xml.parsers.expat", "ParserCreate"); - - // io - bytesio_ctor = import_attr("io", "BytesIO"); - stringio_ctor = import_attr("io", "StringIO"); - - // time - time_strftime = import_attr("time", "strftime"); - time_strptime = import_attr("time", "strptime"); - time_localtime = import_attr("time", "localtime"); - - // operator - op_lt = import_attr("operator", "lt"); - op_gt = import_attr("operator", "gt"); - op_eq 
= import_attr("operator", "eq"); - op_ne = import_attr("operator", "ne"); - op_contains = import_attr("operator", "contains"); - op_countOf = import_attr("operator", "countOf"); - op_indexOf = import_attr("operator", "indexOf"); - op_length_hint = import_attr("operator", "length_hint"); - op_concat = import_attr("operator", "concat"); - op_getitem = import_attr("operator", "getitem"); - op_methodcaller = import_attr("operator", "methodcaller"); - - // dis - dis_dis = import_attr("dis", "dis"); - - // locale - locale_strxfrm = import_attr("locale", "strxfrm"); - locale_strcoll = import_attr("locale", "strcoll"); - locale_getlocale = import_attr("locale", "getlocale"); - - // No-op handler lambdas for expat. - { - PyObject *globals = PyDict_New(); - PyDict_SetItemString(globals, "__builtins__", PyEval_GetBuiltins()); - PyObject *r = PyRun_String( - "_noop = lambda *a: None\n" - "_noop_noargs = lambda: None\n", - Py_file_input, globals, globals); - if (!r) { PyErr_Print(); abort(); } - Py_DECREF(r); - noop_handler = PyDict_GetItemString(globals, "_noop"); - Py_INCREF(noop_handler); - noop_handler_noargs = PyDict_GetItemString(globals, "_noop_noargs"); - Py_INCREF(noop_handler_noargs); - Py_DECREF(globals); - } - - // Suppress warnings. - PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); - - assert(!PyErr_Occurred()); - initialized = 1; -} - -// --------------------------------------------------------------------------- -// Operations (7 ops). -// --------------------------------------------------------------------------- - -// OP_JSON_ENCODE: FDP selects variant — json.dumps(str), json.dumps({str:str}), -// json.dumps([str,str]), or JSONEncoder with options. Exercises the _json -// C acceleration module's encoding paths. 
-static void op_json_encode(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 5); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - switch (variant) { - case 0: { - // json.dumps(str) - PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)pystr); - break; - } - case 1: { - // json.dumps({str: str}) - PyRef d = PyDict_New(); - CHECK(d); - PyDict_SetItem(d, pystr, pystr); - PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)d); - break; - } - case 2: { - // json.dumps([str, str]) - PyRef lst = PyList_New(2); - CHECK(lst); - Py_INCREF((PyObject *)pystr); - Py_INCREF((PyObject *)pystr); - PyList_SET_ITEM((PyObject *)lst, 0, (PyObject *)pystr); - PyList_SET_ITEM((PyObject *)lst, 1, (PyObject *)pystr); - PyRef r = PyObject_CallFunction(json_dumps, "O", (PyObject *)lst); - break; - } - case 3: { - // JSONEncoder(ensure_ascii=False).encode(str) - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "ensure_ascii", Py_False); - PyRef empty = PyTuple_New(0); - CHECK(empty); - PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); - CHECK(enc); - PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)pystr); - break; - } - case 4: { - // JSONEncoder(ensure_ascii=True).encode(str) - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "ensure_ascii", Py_True); - PyRef empty = PyTuple_New(0); - CHECK(empty); - PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); - CHECK(enc); - PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)pystr); - break; - } - case 5: { - // JSONEncoder(sort_keys=True, indent=2, ensure_ascii=False).encode({s:s}) - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "sort_keys", Py_True); - PyRef indent = PyLong_FromLong(2); - CHECK(indent); - 
PyDict_SetItemString(kwargs, "indent", indent); - PyDict_SetItemString(kwargs, "ensure_ascii", Py_False); - PyRef empty = PyTuple_New(0); - CHECK(empty); - PyRef enc = PyObject_Call(json_JSONEncoder, empty, kwargs); - CHECK(enc); - PyRef d = PyDict_New(); - CHECK(d); - PyDict_SetItem(d, pystr, pystr); - PyRef r = PyObject_CallMethod(enc, "encode", "O", (PyObject *)d); - break; - } - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_CSV_SNIFFER: Call csv.Sniffer().sniff() and .has_header() on fuzz str. -// Exercises the _csv C module's dialect detection paths. -static void op_csv_sniffer(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)1024)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef sniffer = PyObject_CallFunction(csv_Sniffer, NULL); - CHECK(sniffer); - - { - PyRef r = PyObject_CallMethod(sniffer, "sniff", "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallMethod(sniffer, "has_header", "O", - (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// OP_CSV_WRITER: FDP selects variant — basic writerow, writerows, tab-delimited, -// DictWriter, QUOTE_ALL, QUOTE_NONNUMERIC. All write to StringIO. -// Exercises the _csv C module's writer paths. -static void op_csv_writer(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 5); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); - CHECK(sio); - - // Split string into words for row data. - PyRef words = PyObject_CallMethod(pystr, "split", NULL); - if (!words) { PyErr_Clear(); return; } - - // Ensure non-empty. 
- if (PyList_Size(words) == 0) { - PyRef empty = PyUnicode_FromString(""); - PyList_Append(words, empty); - } - - switch (variant) { - case 0: { - // Basic writerow. - PyRef w = PyObject_CallFunction(csv_writer, "O", (PyObject *)sio); - CHECK(w); - PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); - break; - } - case 1: { - // writerows with lines. - PyRef lines = PyObject_CallMethod(pystr, "splitlines", NULL); - if (!lines) { PyErr_Clear(); break; } - PyRef rows = PyList_New(0); - CHECK(rows); - Py_ssize_t nlines = PyList_Size(lines); - for (Py_ssize_t i = 0; i < nlines && i < 20; i++) { - PyObject *line = PyList_GetItem(lines, i); - PyRef lwords = PyObject_CallMethod(line, "split", NULL); - if (!lwords) { PyErr_Clear(); continue; } - if (PyList_Size(lwords) == 0) { - PyRef e = PyUnicode_FromString(""); - PyList_Append(lwords, e); - } - PyList_Append(rows, lwords); - } - PyRef w = PyObject_CallFunction(csv_writer, "O", (PyObject *)sio); - CHECK(w); - PyRef r = PyObject_CallMethod(w, "writerows", "O", (PyObject *)rows); - break; - } - case 2: { - // Tab-delimited. - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef delim = PyUnicode_FromString("\t"); - CHECK(delim); - PyDict_SetItemString(kwargs, "delimiter", delim); - PyRef args = PyTuple_Pack(1, (PyObject *)sio); - CHECK(args); - PyRef w = PyObject_Call(csv_writer, args, kwargs); - CHECK(w); - PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); - break; - } - case 3: { - // DictWriter. - Py_ssize_t nwords = PyList_Size(words); - Py_ssize_t nfields = nwords < 8 ? 
nwords : 8; - if (nfields == 0) nfields = 1; - PyRef fieldnames = PyList_GetSlice(words, 0, nfields); - CHECK(fieldnames); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "fieldnames", fieldnames); - PyRef args = PyTuple_Pack(1, (PyObject *)sio); - CHECK(args); - PyRef dw = PyObject_Call(csv_DictWriter, args, kwargs); - CHECK(dw); - PyRef wh = PyObject_CallMethod(dw, "writeheader", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - // Build row dict. - PyRef row = PyDict_New(); - CHECK(row); - for (Py_ssize_t i = 0; i < nfields; i++) { - PyObject *fn = PyList_GetItem(fieldnames, i); - PyDict_SetItem(row, fn, pystr); - } - PyRef wr = PyObject_CallMethod(dw, "writerow", "O", (PyObject *)row); - break; - } - case 4: { - // QUOTE_ALL. - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "quoting", csv_QUOTE_ALL); - PyRef args = PyTuple_Pack(1, (PyObject *)sio); - CHECK(args); - PyRef w = PyObject_Call(csv_writer, args, kwargs); - CHECK(w); - PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); - break; - } - case 5: { - // QUOTE_NONNUMERIC. - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "quoting", csv_QUOTE_NONNUMERIC); - PyRef args = PyTuple_Pack(1, (PyObject *)sio); - CHECK(args); - PyRef w = PyObject_Call(csv_writer, args, kwargs); - CHECK(w); - PyRef r = PyObject_CallMethod(w, "writerow", "O", (PyObject *)words); - break; - } - } - if (PyErr_Occurred()) PyErr_Clear(); - - // Read result. - PyRef val = PyObject_CallMethod(sio, "getvalue", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_EXPAT: FDP selects encoding and handler setup, then Parse or ParseFile. -// Exercises the pyexpat C module's XML parsing paths. 
-static void op_expat(FuzzedDataProvider &fdp) { - static const char *kEncodings[] = {"utf-8", "iso-8859-1", NULL}; - int enc_idx = fdp.ConsumeIntegralInRange(0, 2); - bool use_ns = fdp.ConsumeBool(); - bool set_handlers = fdp.ConsumeBool(); - bool use_parsefile = fdp.ConsumeBool(); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)4096)); - - // Create parser. - PyRef parser; - if (use_ns) { - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyRef ns_sep = PyUnicode_FromString(" "); - CHECK(ns_sep); - PyDict_SetItemString(kwargs, "namespace_separator", ns_sep); - PyRef empty = PyTuple_New(0); - CHECK(empty); - parser = PyRef(PyObject_Call(expat_ParserCreate, empty, kwargs)); - } else if (kEncodings[enc_idx]) { - parser = PyRef(PyObject_CallFunction(expat_ParserCreate, "s", - kEncodings[enc_idx])); - } else { - parser = PyRef(PyObject_CallFunction(expat_ParserCreate, NULL)); - } - CHECK(parser); - - // Set handlers. - if (set_handlers) { - PyObject_SetAttrString(parser, "StartElementHandler", noop_handler); - PyObject_SetAttrString(parser, "EndElementHandler", noop_handler); - PyObject_SetAttrString(parser, "CharacterDataHandler", noop_handler); - PyObject_SetAttrString(parser, "ProcessingInstructionHandler", - noop_handler); - PyObject_SetAttrString(parser, "CommentHandler", noop_handler); - PyObject_SetAttrString(parser, "StartCdataSectionHandler", - noop_handler_noargs); - PyObject_SetAttrString(parser, "EndCdataSectionHandler", - noop_handler_noargs); - } - - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - if (use_parsefile) { - // ParseFile(BytesIO(data)). - PyRef bio = PyObject_CallFunction(bytesio_ctor, "O", (PyObject *)pydata); - CHECK(bio); - PyRef r = PyObject_CallMethod(parser, "ParseFile", "O", (PyObject *)bio); - } else { - // Parse(data, True). 
- PyRef r = PyObject_CallMethod(parser, "Parse", "Oi", - (PyObject *)pydata, 1); - } - if (PyErr_Occurred()) PyErr_Clear(); - - // Optionally GetInputContext. - if (data.size() % 2 == 0) { - PyRef ctx = PyObject_CallMethod(parser, "GetInputContext", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// OP_TIME: FDP selects variant — strftime with fuzz format, strptime with -// fuzz input, or strptime with fuzz format. Exercises the time C module. -static void op_time(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 2); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - switch (variant) { - case 0: { - // time.strftime(str, time.localtime()) - PyRef lt = PyObject_CallFunction(time_localtime, NULL); - CHECK(lt); - // Use non-empty format. - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - PyObject *fmt = slen > 0 ? (PyObject *)pystr : NULL; - if (!fmt) { - PyRef def_fmt = PyUnicode_FromString("%Y"); - CHECK(def_fmt); - PyRef r = PyObject_CallFunction(time_strftime, "OO", - (PyObject *)def_fmt, (PyObject *)lt); - } else { - PyRef r = PyObject_CallFunction(time_strftime, "OO", - fmt, (PyObject *)lt); - } - break; - } - case 1: { - // time.strptime(str, '%Y-%m-%d %H:%M:%S') - PyRef r = PyObject_CallFunction(time_strptime, "Os", - (PyObject *)pystr, - "%Y-%m-%d %H:%M:%S"); - break; - } - case 2: { - // time.strptime('2024-01-15 12:30:00', str) - // Use non-empty format. - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - PyObject *fmt = slen > 0 ? 
(PyObject *)pystr : NULL; - if (!fmt) { - PyRef def_fmt = PyUnicode_FromString("%Y-%m-%d %H:%M:%S"); - CHECK(def_fmt); - PyRef r = PyObject_CallFunction(time_strptime, "sO", - "2024-01-15 12:30:00", - (PyObject *)def_fmt); - } else { - PyRef r = PyObject_CallFunction(time_strptime, "sO", - "2024-01-15 12:30:00", fmt); - } - break; - } - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_OPERATOR: FDP selects operator variant — comparisons, sequence ops, -// concat, getitem, methodcaller. Exercises the _operator C module. -static void op_operator(FuzzedDataProvider &fdp) { - int variant = fdp.ConsumeIntegralInRange(0, 5); - std::string data = fdp.ConsumeRemainingBytesAsString(); - - PyRef pydata = PyBytes_FromStringAndSize(Y(data)); - CHECK(pydata); - - switch (variant) { - case 0: { - // Comparisons: lt/gt/eq/ne(data, data[::-1]) - PyRef rev = PyObject_CallMethod(pydata, "__class__", NULL); - // Build reversed bytes. - std::string rdata(data.rbegin(), data.rend()); - PyRef pyrev = PyBytes_FromStringAndSize(Y(rdata)); - CHECK(pyrev); - { - PyRef r = PyObject_CallFunction(op_lt, "OO", - (PyObject *)pydata, (PyObject *)pyrev); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallFunction(op_gt, "OO", - (PyObject *)pydata, (PyObject *)pyrev); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallFunction(op_eq, "OO", - (PyObject *)pydata, (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef empty = PyBytes_FromStringAndSize("", 0); - CHECK(empty); - PyRef r = PyObject_CallFunction(op_ne, "OO", - (PyObject *)pydata, (PyObject *)empty); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 1: { - // Sequence ops: contains, countOf, indexOf, length_hint - if (data.empty()) break; - PyRef byte_val = PyLong_FromLong((unsigned char)data[0]); - CHECK(byte_val); - { - PyRef r = PyObject_CallFunction(op_contains, "OO", - (PyObject *)pydata, - (PyObject *)byte_val); - if (PyErr_Occurred()) PyErr_Clear(); - } 
- { - PyRef r = PyObject_CallFunction(op_countOf, "OO", - (PyObject *)pydata, - (PyObject *)byte_val); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallFunction(op_indexOf, "OO", - (PyObject *)pydata, - (PyObject *)byte_val); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef r = PyObject_CallFunction(op_length_hint, "O", - (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 2: { - // concat(data, data) - PyRef r = PyObject_CallFunction(op_concat, "OO", - (PyObject *)pydata, (PyObject *)pydata); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 3: { - // getitem(data, 0) + getitem(data, slice) - if (data.empty()) break; - PyRef zero = PyLong_FromLong(0); - CHECK(zero); - { - PyRef r = PyObject_CallFunction(op_getitem, "OO", - (PyObject *)pydata, (PyObject *)zero); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef half = PyLong_FromLong(data.size() / 2); - CHECK(half); - PyRef sl = PySlice_New(zero, half, NULL); - CHECK(sl); - PyRef r = PyObject_CallFunction(op_getitem, "OO", - (PyObject *)pydata, (PyObject *)sl); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 4: { - // methodcaller('upper')(str) + methodcaller('encode', 'utf-8')(str) - int str_enc = data.size() > 0 ? 
data[0] & 3 : 0; - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - { - PyRef mc = PyObject_CallFunction(op_methodcaller, "s", "upper"); - CHECK(mc); - PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - { - PyRef mc = PyObject_CallFunction(op_methodcaller, "ss", - "encode", "utf-8"); - CHECK(mc); - PyRef r = PyObject_CallFunction(mc, "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - break; - } - case 5: { - // contains on bytes with slice - if (data.empty()) break; - PyRef first = PyBytes_FromStringAndSize(data.data(), 1); - CHECK(first); - PyRef r = PyObject_CallFunction(op_contains, "OO", - (PyObject *)pydata, (PyObject *)first); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - } -} - -// OP_DIS_LOCALE: FDP selects — dis.dis(compile(str)), locale.strxfrm(str), -// locale.strcoll(str), or locale.getlocale(). Exercises _opcode via dis -// and _locale C module. -static void op_dis_locale(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - switch (variant) { - case 0: { - // dis.dis(compile(str, '', 'exec')) - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - PyObject *src = slen > 0 ? (PyObject *)pystr : NULL; - if (!src) { - PyRef def_src = PyUnicode_FromString("pass"); - CHECK(def_src); - src = def_src; - Py_INCREF(src); - } else { - Py_INCREF(src); - } - PyRef code = PyRef(Py_CompileString( - PyUnicode_AsUTF8(src), "", Py_file_input)); - Py_DECREF(src); - if (!code) { PyErr_Clear(); break; } - // Capture dis output to StringIO. 
- PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); - CHECK(sio); - PyRef kwargs = PyDict_New(); - CHECK(kwargs); - PyDict_SetItemString(kwargs, "file", sio); - PyRef args = PyTuple_Pack(1, (PyObject *)code); - CHECK(args); - PyRef r = PyObject_Call(dis_dis, args, kwargs); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 1: { - // locale.strxfrm(str) - PyRef r = PyObject_CallFunction(locale_strxfrm, "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 2: { - // locale.strcoll(str[:mid], str[mid:]) - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - Py_ssize_t mid = slen / 2; - PyRef half1 = PyUnicode_Substring(pystr, 0, mid); - CHECK(half1); - PyRef half2 = PyUnicode_Substring(pystr, mid, slen); - CHECK(half2); - PyRef r = PyObject_CallFunction(locale_strcoll, "OO", - (PyObject *)half1, (PyObject *)half2); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 3: { - // locale.getlocale() - PyRef r = PyObject_CallFunction(locale_getlocale, NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - } -} - -// --------------------------------------------------------------------------- -// Dispatch. 
-// --------------------------------------------------------------------------- - -enum Op { - OP_JSON_ENCODE, - OP_CSV_SNIFFER, - OP_CSV_WRITER, - OP_EXPAT, - OP_TIME, - OP_OPERATOR, - OP_DIS_LOCALE, - NUM_OPS -}; - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - assert(Py_IsInitialized()); - init_parsers(); - if (size < 1 || size > 0x10000) return 0; - if (PyErr_Occurred()) PyErr_Clear(); - - FuzzedDataProvider fdp(data, size); - switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { - case OP_JSON_ENCODE: - op_json_encode(fdp); - break; - case OP_CSV_SNIFFER: - op_csv_sniffer(fdp); - break; - case OP_CSV_WRITER: - op_csv_writer(fdp); - break; - case OP_EXPAT: - op_expat(fdp); - break; - case OP_TIME: - op_time(fdp); - break; - case OP_OPERATOR: - op_operator(fdp); - break; - case OP_DIS_LOCALE: - op_dis_locale(fdp); - break; - } - - if (++gc_counter % kGcInterval == 0) PyGC_Collect(); - return 0; -} diff --git a/module-fuzzers/fuzz_pickle.cpp b/module-fuzzers/fuzz_pickle.cpp new file mode 100644 index 0000000..ebd218b --- /dev/null +++ b/module-fuzzers/fuzz_pickle.cpp @@ -0,0 +1,491 @@ +// fuzz_pickle.cpp — Fuzzer for CPython's pickle C extension module. +// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// _pickle — pickle.dumps() with 8 container types (bytes, str, +// list, tuple, set, frozenset, bytearray, dict) across +// protocols 0-5 and fix_imports flag. +// pickle.loads() via RestrictedUnpickler (blocks +// find_class), PersistentUnpickler (handles PERSID/ +// BINPERSID), and RestrictedUnpickler with +// encoding='bytes'. +// Pickler chain: dump, clear_memo, dump, getvalue. +// Round-trip: dumps then loads. +// +// FDP selects one of 4 operation types. Each operation consumes further +// bytes via FuzzedDataProvider to parameterize the call (protocol number, +// container type, boolean flags). 
+// +// All module functions are imported once during init and cached as static +// PyObject* pointers. Two pickle Unpickler subclasses (RestrictedUnpickler, +// PersistentUnpickler) are defined via PyRun_String at init time. +// PyRef (RAII) prevents reference leaks. Max input size: 1 MB. + +#include "fuzz_helpers.h" + +static PyObject *pickle_dumps, *pickle_loads; +static PyObject *pickle_Pickler; +static PyObject *bytesio_ctor; +static PyObject *RestrictedUnpickler_cls, *PersistentUnpickler_cls; + +static int initialized = 0; + +static void init_pickle(void) { + if (initialized) return; + + pickle_dumps = import_attr("pickle", "dumps"); + pickle_loads = import_attr("pickle", "loads"); + pickle_Pickler = import_attr("pickle", "Pickler"); + bytesio_ctor = import_attr("io", "BytesIO"); + static const char *kPickleHelpers = + "import pickle, io\n" + "class RestrictedUnpickler(pickle.Unpickler):\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n" + "class PersistentUnpickler(pickle.Unpickler):\n" + " def persistent_load(self, pid): return pid\n" + " def find_class(self, module, name):\n" + " raise pickle.UnpicklingError('restricted')\n"; + RestrictedUnpickler_cls = run_python_and_get(kPickleHelpers, + "RestrictedUnpickler"); + PersistentUnpickler_cls = run_python_and_get(kPickleHelpers, + "PersistentUnpickler"); + + assert(!PyErr_Occurred()); + initialized = 1; +} + +// Container types for build_pickle_container. +enum ContainerType { + CT_RAW_BYTES, CT_STR, CT_LIST, CT_TUPLE, + CT_SET, CT_FROZENSET, CT_BYTEARRAY, CT_DICT, + NUM_CONTAINER_TYPES +}; + +// Hashable types that can be used as dict keys. +// list, set, bytearray, and dict are unhashable. +enum HashableType { + HT_RAW_BYTES, HT_STR, HT_INT, HT_FLOAT, HT_TUPLE, HT_FROZENSET, + NUM_HASHABLE_TYPES +}; + +// Build a single hashable Python value from fdp, suitable for dict keys. 
+// A selector drawn from fdp picks one of the HashableType kinds; the value
+// is then built from further fdp bytes. Returns the object produced by the
+// selected constructor, or NULL if an allocation fails part-way (any
+// partially built container is released first).
+// Explicit template arguments are given on every Consume* call:
+// ConsumeIntegral/ConsumeFloatingPoint take no argument to deduce T from,
+// and the (int literal, size_t) range bounds cannot be deduced to one T.
+static PyObject *build_hashable_value(FuzzedDataProvider &fdp) {
+    int t = fdp.ConsumeIntegralInRange<int>(0, NUM_HASHABLE_TYPES - 1);
+    switch (t) {
+    case HT_RAW_BYTES: {
+        size_t len = fdp.ConsumeIntegralInRange<size_t>(
+            0, std::min(fdp.remaining_bytes(), (size_t)10000));
+        std::string s = fdp.ConsumeBytesAsString(len);
+        return PyBytes_FromStringAndSize(Y(s));
+    }
+    case HT_STR: {
+        int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+        size_t len = fdp.ConsumeIntegralInRange<size_t>(
+            0, std::min(fdp.remaining_bytes(), (size_t)10000));
+        std::string s = fdp.ConsumeBytesAsString(len);
+        return fuzz_bytes_to_str(s, str_enc);
+    }
+    case HT_INT: {
+        long v = fdp.ConsumeIntegral<long>();
+        return PyLong_FromLong(v);
+    }
+    case HT_FLOAT: {
+        double v = fdp.ConsumeFloatingPoint<double>();
+        return PyFloat_FromDouble(v);
+    }
+    case HT_TUPLE: {
+        // Tuple of up to 200 ints.
+        size_t n = fdp.ConsumeIntegralInRange<size_t>(0, 200);
+        PyObject *tup = PyTuple_New((Py_ssize_t)n);
+        if (!tup) return NULL;
+        for (size_t i = 0; i < n; i++) {
+            long v = fdp.ConsumeIntegral<long>();
+            PyObject *item = PyLong_FromLong(v);
+            if (!item) { Py_DECREF(tup); return NULL; }
+            // SET_ITEM takes over the reference to item.
+            PyTuple_SET_ITEM(tup, (Py_ssize_t)i, item);
+        }
+        return tup;
+    }
+    case HT_FROZENSET: {
+        // Collect up to 200 ints in a temporary list, then freeze it.
+        size_t n = fdp.ConsumeIntegralInRange<size_t>(0, 200);
+        PyObject *lst = PyList_New((Py_ssize_t)n);
+        if (!lst) return NULL;
+        for (size_t i = 0; i < n; i++) {
+            long v = fdp.ConsumeIntegral<long>();
+            PyObject *item = PyLong_FromLong(v);
+            if (!item) { Py_DECREF(lst); return NULL; }
+            PyList_SET_ITEM(lst, (Py_ssize_t)i, item);
+        }
+        PyObject *fs = PyFrozenSet_New(lst);
+        Py_DECREF(lst);
+        return fs;
+    }
+    default:
+        return PyLong_FromLong(0);
+    }
+}
+
+// Value types for dict values (any picklable type, no recursion into dict).
+enum ValueType {
+    VT_RAW_BYTES, VT_STR, VT_INT, VT_FLOAT, VT_LIST, VT_TUPLE,
+    VT_SET, VT_FROZENSET, VT_BYTEARRAY, VT_NONE,
+    NUM_VALUE_TYPES
+};
+
+// Build a single Python value from fdp, suitable for dict values.
+static PyObject *build_any_value(FuzzedDataProvider &fdp) {
+    // Consume an element count (0-200) followed by that many longs and
+    // return them as a new list, or NULL with a Python error set.
+    auto consume_long_list = [&fdp]() -> PyObject * {
+        size_t n = fdp.ConsumeIntegralInRange<size_t>(0, 200);
+        PyObject *lst = PyList_New((Py_ssize_t)n);
+        if (!lst) return NULL;
+        for (size_t i = 0; i < n; i++) {
+            long v = fdp.ConsumeIntegral<long>();
+            PyObject *item = PyLong_FromLong(v);
+            if (!item) { Py_DECREF(lst); return NULL; }
+            // SET_ITEM steals the reference to item.
+            PyList_SET_ITEM(lst, i, item);
+        }
+        return lst;
+    };
+
+    int t = fdp.ConsumeIntegralInRange<int>(0, NUM_VALUE_TYPES - 1);
+    switch (t) {
+        case VT_RAW_BYTES: {
+            size_t len = fdp.ConsumeIntegralInRange<size_t>(
+                0, std::min(fdp.remaining_bytes(), (size_t)10000));
+            std::string s = fdp.ConsumeBytesAsString(len);
+            return PyBytes_FromStringAndSize(Y(s));
+        }
+        case VT_STR: {
+            int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+            size_t len = fdp.ConsumeIntegralInRange<size_t>(
+                0, std::min(fdp.remaining_bytes(), (size_t)10000));
+            std::string s = fdp.ConsumeBytesAsString(len);
+            return fuzz_bytes_to_str(s, str_enc);
+        }
+        case VT_INT: {
+            long v = fdp.ConsumeIntegral<long>();
+            return PyLong_FromLong(v);
+        }
+        case VT_FLOAT: {
+            double v = fdp.ConsumeFloatingPoint<double>();
+            return PyFloat_FromDouble(v);
+        }
+        case VT_LIST:
+            return consume_long_list();
+        case VT_TUPLE: {
+            size_t n = fdp.ConsumeIntegralInRange<size_t>(0, 200);
+            PyObject *tup = PyTuple_New((Py_ssize_t)n);
+            if (!tup) return NULL;
+            for (size_t i = 0; i < n; i++) {
+                long v = fdp.ConsumeIntegral<long>();
+                PyObject *item = PyLong_FromLong(v);
+                if (!item) { Py_DECREF(tup); return NULL; }
+                PyTuple_SET_ITEM(tup, i, item);
+            }
+            return tup;
+        }
+        case VT_SET: {
+            PyObject *lst = consume_long_list();
+            if (!lst) return NULL;
+            PyObject *s = PySet_New(lst);
+            Py_DECREF(lst);
+            return s;
+        }
+        case VT_FROZENSET: {
+            PyObject *lst = consume_long_list();
+            if (!lst) return NULL;
+            PyObject *fs = PyFrozenSet_New(lst);
+            Py_DECREF(lst);
+            return fs;
+        }
+        case VT_BYTEARRAY: {
+            size_t len = fdp.ConsumeIntegralInRange<size_t>(
+                0, std::min(fdp.remaining_bytes(), (size_t)10000));
+            std::string s = fdp.ConsumeBytesAsString(len);
+            return PyByteArray_FromStringAndSize(Y(s));
+        }
+        case VT_NONE:
+        default:
+            Py_INCREF(Py_None);
+            return Py_None;
+    }
+}
+
+// Build a Python container from fuzz bytes for pickle.dumps operations.
+// Capped at 10000 elements to keep serialization fast.
+// str_enc selects the byte-to-str decoding (see fuzz_bytes_to_str).
+// For CT_DICT, keys and values are consumed directly from fdp.
+// Returns a new reference, or NULL with a Python error set.
+static PyObject *build_pickle_container(FuzzedDataProvider &fdp,
+                                        int type, const uint8_t *buf,
+                                        size_t len, int str_enc) {
+    if (len > 10000) len = 10000;
+
+    // Turn buf[0..len) into a new list of ints (one per byte), or NULL with
+    // a Python error set.
+    auto bytes_as_int_list = [buf, len]() -> PyObject * {
+        PyObject *lst = PyList_New((Py_ssize_t)len);
+        if (!lst) return NULL;
+        for (size_t i = 0; i < len; i++) {
+            PyObject *v = PyLong_FromLong(buf[i]);
+            if (!v) { Py_DECREF(lst); return NULL; }
+            PyList_SET_ITEM(lst, i, v);
+        }
+        return lst;
+    };
+
+    switch (type) {
+        case CT_RAW_BYTES:
+            return PyBytes_FromStringAndSize((const char *)buf, len);
+        case CT_STR: {
+            std::string s((const char *)buf, len);
+            return fuzz_bytes_to_str(s, str_enc);
+        }
+        case CT_LIST:
+            return bytes_as_int_list();
+        case CT_TUPLE: {
+            PyObject *tup = PyTuple_New((Py_ssize_t)len);
+            if (!tup) return NULL;
+            for (size_t i = 0; i < len; i++) {
+                PyObject *v = PyLong_FromLong(buf[i]);
+                if (!v) { Py_DECREF(tup); return NULL; }
+                PyTuple_SET_ITEM(tup, i, v);
+            }
+            return tup;
+        }
+        case CT_SET: {
+            PyObject *lst = bytes_as_int_list();
+            if (!lst) return NULL;
+            PyObject *s = PySet_New(lst);
+            Py_DECREF(lst);
+            return s;
+        }
+        case CT_FROZENSET: {
+            PyObject *lst = bytes_as_int_list();
+            if (!lst) return NULL;
+            PyObject *s = PyFrozenSet_New(lst);
+            Py_DECREF(lst);
+            return s;
+        }
+        case CT_BYTEARRAY:
+            return PyByteArray_FromStringAndSize((const char *)buf, len);
+        case CT_DICT: {
+            // Build a dict with fuzzer-chosen types for each key and value.
+            // Keys use hashable types only; values can be any picklable type.
+            size_t n_entries = fdp.ConsumeIntegralInRange<size_t>(0, 64);
+            PyObject *d = PyDict_New();
+            if (!d) return NULL;
+            for (size_t i = 0; i < n_entries && fdp.remaining_bytes() > 0; i++) {
+                PyRef key(build_hashable_value(fdp));
+                if (!key) { Py_DECREF(d); return NULL; }
+                PyRef val(build_any_value(fdp));
+                if (!val) { Py_DECREF(d); return NULL; }
+                // Do not hand back a dict while an exception is pending.
+                if (PyDict_SetItem(d, key, val) < 0) {
+                    Py_DECREF(d);
+                    return NULL;
+                }
+            }
+            return d;
+        }
+        default:
+            return PyBytes_FromStringAndSize((const char *)buf, len);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Operations (4 ops)
+// ---------------------------------------------------------------------------
+
+// OP_PICKLE_DUMPS: Build a fuzz-chosen container type, then call
+// pickle.dumps(obj, protocol=N, fix_imports=bool). Protocol is fuzz-chosen
+// 0-5, exercising all pickle opcodes.
+static void op_pickle_dumps(FuzzedDataProvider &fdp) {
+    // Consume all call parameters up front, then the payload bytes.
+    int container_type =
+        fdp.ConsumeIntegralInRange<int>(0, NUM_CONTAINER_TYPES - 1);
+    int protocol = fdp.ConsumeIntegralInRange<int>(0, 5);
+    bool fix_imports = fdp.ConsumeBool();
+    int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+    if (fdp.remaining_bytes() == 0) return;
+    size_t data_len = fdp.ConsumeIntegralInRange<size_t>(
+        1, std::min(fdp.remaining_bytes(), (size_t)10000));
+    std::string data = fdp.ConsumeBytesAsString(data_len);
+
+    PyRef obj(build_pickle_container(
+        fdp, container_type, (const uint8_t *)data.data(), data.size(), str_enc));
+    CHECK(obj);
+
+    // kwargs = {"protocol": protocol, "fix_imports": fix_imports}
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    PyRef proto = PyLong_FromLong(protocol);
+    CHECK(proto);
+    if (PyDict_SetItemString(kwargs, "protocol", proto) < 0 ||
+        PyDict_SetItemString(kwargs, "fix_imports",
+                             fix_imports ? Py_True : Py_False) < 0) {
+        // Never call into pickle with an exception already pending.
+        PyErr_Clear();
+        return;
+    }
+    PyRef args = PyTuple_Pack(1, (PyObject *)obj);
+    CHECK(args);
+    PyRef r = PyObject_Call(pickle_dumps, args, kwargs);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_LOADS: Wrap fuzz data in BytesIO, then unpickle via one of 3
+// Unpickler subclass variants (fuzz-chosen).
+static void op_pickle_loads(FuzzedDataProvider &fdp) {
+    int variant = fdp.ConsumeIntegralInRange<int>(0, 2);
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+    CHECK(pydata);
+    PyRef bio = PyObject_CallFunction(bytesio_ctor, "O",
+                                      (PyObject *)pydata);
+    CHECK(bio);
+
+    // Variant 0: RestrictedUnpickler(bio)
+    // Variant 1: PersistentUnpickler(bio)
+    // Variant 2: RestrictedUnpickler(bio, fix_imports=True, encoding="bytes")
+    PyObject *cls = nullptr;
+    PyRef kwargs_ref;
+    switch (variant) {
+        case 1:
+            cls = PersistentUnpickler_cls;
+            break;
+        case 2: {
+            cls = RestrictedUnpickler_cls;
+            kwargs_ref = PyRef(PyDict_New());
+            CHECK(kwargs_ref);
+            PyRef enc = PyUnicode_FromString("bytes");
+            CHECK(enc);
+            if (PyDict_SetItemString(kwargs_ref, "fix_imports", Py_True) < 0 ||
+                PyDict_SetItemString(kwargs_ref, "encoding", enc) < 0) {
+                PyErr_Clear();
+                return;
+            }
+            break;
+        }
+        case 0:
+        default:
+            // The default keeps cls non-NULL for any selector value.
+            cls = RestrictedUnpickler_cls;
+            break;
+    }
+
+    PyRef args = PyTuple_Pack(1, (PyObject *)bio);
+    CHECK(args);
+    PyRef unpickler = PyObject_Call(
+        cls, args, kwargs_ref.p ? (PyObject *)kwargs_ref : NULL);
+    CHECK(unpickler);
+    PyRef r = PyObject_CallMethod(unpickler, "load", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_PICKLER: Create pickle.Pickler(BytesIO, protocol=N), then chain:
+// .dump(list_of_ints), .clear_memo(), .dump(str), .getvalue().
+static void op_pickle_pickler(FuzzedDataProvider &fdp) {
+    int protocol = fdp.ConsumeIntegralInRange<int>(0, 5);
+    int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+    if (fdp.remaining_bytes() == 0) return;
+    size_t data_len = fdp.ConsumeIntegralInRange<size_t>(
+        1, std::min(fdp.remaining_bytes(), (size_t)10000));
+    std::string data = fdp.ConsumeBytesAsString(data_len);
+    std::string data2 = fdp.ConsumeRemainingBytesAsString();
+
+    // Create an in-memory BytesIO buffer for the Pickler to write into.
+    PyRef bio = PyObject_CallFunction(bytesio_ctor, NULL);
+    CHECK(bio);
+
+    // Construct a Pickler targeting the buffer with a fuzz-chosen protocol
+    // (0-5). Different protocols use different opcodes internally.
+    PyRef pickler = PyObject_CallFunction(pickle_Pickler, "Oi",
+                                          (PyObject *)bio, protocol);
+    CHECK(pickler);
+
+    // Build a list-of-ints container from the first fuzz string and dump it.
+    // Exercises the Pickler's serialization of sequences.
+    PyRef obj1(build_pickle_container(
+        fdp, CT_LIST, (const uint8_t *)data.data(), data.size(), str_enc));
+    CHECK(obj1);
+
+    PyRef r1 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj1);
+    if (!r1) {
+        PyErr_Clear();
+        return;
+    }
+
+    // Clear the memo table between dumps. Exercises the memo-reset path
+    // so the second dump re-encodes objects from scratch.
+    PyRef cm = PyObject_CallMethod(pickler, "clear_memo", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    // Build a str from a second independent fuzz string and dump it.
+    // Exercises string serialization after a memo clear, using a different
+    // object type than the first dump.
+    PyRef obj2(fuzz_bytes_to_str(data2, str_enc));
+    CHECK(obj2);
+    PyRef r2 = PyObject_CallMethod(pickler, "dump", "O", (PyObject *)obj2);
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    // Retrieve the full serialized output from the BytesIO buffer.
+    // Exercises the buffer-readback path after multiple dumps.
+    PyRef val = PyObject_CallMethod(bio, "getvalue", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// OP_PICKLE_ROUNDTRIP: Build a fuzz-chosen container, pickle.dumps() it,
+// then pickle.loads() the result.
+static void op_pickle_roundtrip(FuzzedDataProvider &fdp) {
+    int container_type =
+        fdp.ConsumeIntegralInRange<int>(0, NUM_CONTAINER_TYPES - 1);
+    int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+    if (fdp.remaining_bytes() == 0) return;
+    size_t data_len = fdp.ConsumeIntegralInRange<size_t>(
+        1, std::min(fdp.remaining_bytes(), (size_t)10000));
+    std::string data = fdp.ConsumeBytesAsString(data_len);
+
+    PyRef obj(build_pickle_container(
+        fdp, container_type, (const uint8_t *)data.data(), data.size(), str_enc));
+    CHECK(obj);
+
+    // dumps() with default arguments; loads() is only attempted when the
+    // serialization succeeded.
+    PyRef dumped = PyObject_CallFunction(pickle_dumps, "O", (PyObject *)obj);
+    if (!dumped) {
+        PyErr_Clear();
+        return;
+    }
+    PyRef loaded = PyObject_CallFunction(pickle_loads, "O",
+                                         (PyObject *)dumped);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// Operation selector; the first value consumed from the fuzz input.
+enum Op {
+    OP_PICKLE_DUMPS,
+    OP_PICKLE_LOADS,
+    OP_PICKLE_PICKLER,
+    OP_PICKLE_ROUNDTRIP,
+    NUM_OPS
+};
+
+// libFuzzer entry point: runs one-time init, rejects empty or oversized
+// inputs, then dispatches to the operation chosen by the input.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    assert(Py_IsInitialized());
+    init_pickle();
+    if (size < 1 || size > kMaxInputSize) return 0;
+    // Start every input from a clean error state.
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    switch (fdp.ConsumeIntegralInRange<int>(0, NUM_OPS - 1)) {
+        case OP_PICKLE_DUMPS:
+            op_pickle_dumps(fdp);
+            break;
+        case OP_PICKLE_LOADS:
+            op_pickle_loads(fdp);
+            break;
+        case OP_PICKLE_PICKLER:
+            op_pickle_pickler(fdp);
+            break;
+        case OP_PICKLE_ROUNDTRIP:
+            op_pickle_roundtrip(fdp);
+            break;
+        default:
+            break;
+    }
+
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_sqlite3.cpp b/module-fuzzers/fuzz_sqlite3.cpp
new file mode 100644
index 0000000..aef74da
--- /dev/null
+++ b/module-fuzzers/fuzz_sqlite3.cpp
@@ -0,0 +1,530 @@
+// fuzz_sqlite3.cpp — Fuzzer for CPython's _sqlite3 C extension module.
+// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// _sqlite3 — connect(':memory:'), execute, executemany, +// executescript, complete_statement, create_function, +// create_aggregate, set_authorizer, create_collation, +// Row factory, blobopen, register_adapter +// +// Exercises the _sqlite3 C extension module wrapping the third-party +// SQLite library. +// +// The fuzzer loops 1-6 times (fdp-driven) on a single ':memory:' +// connection. Each iteration picks one of 25 single-call targets +// covering table creation, insert, select, executemany, executescript, +// complete_statement, create_function, create_aggregate, set_authorizer, +// create_collation, Row factory, blobopen, and register_adapter. +// Because iterations share one connection, the fuzzer naturally chains +// operations (e.g. CREATE_TABLE → INSERT → SET_AUTHORIZER → SELECT). +// +// All module functions and class constructors are imported once during init +// and cached as static PyObject* pointers. An Aggregate helper class is +// defined via PyRun_String at init time. +// PyRef (RAII) prevents reference leaks. Max input size: 64 KB. + +#include "fuzz_helpers.h" + +static PyObject *sqlite3_connect, *sqlite3_complete_statement; +static PyObject *sqlite3_register_adapter, *sqlite3_Row; +static long sqlite3_SQLITE_OK_val; +static PyObject *sqlite3_Aggregate_cls; + +// Cached Python lambdas and helper classes used by op_sqlite3. +// These are created once in init_sqlite3() via run_python_and_get() and +// reused across every fuzzer iteration. Caching avoids the overhead of +// calling PyRun_String to rebuild them on each invocation, and ensures +// the fuzzer spends its time in _sqlite3 code rather than the compiler. 
+static PyObject *sqlite3_identity_fn;   // lambda x: x
+static PyObject *sqlite3_auth_fn;       // lambda *a: SQLITE_OK
+static PyObject *sqlite3_collation_fn;  // lambda a, b: (a > b) - (a < b)
+static PyObject *sqlite3_adapt_cls;     // class _AdaptMe
+static PyObject *sqlite3_adapter_fn;    // lambda a: str(a.v)
+
+static int initialized = 0;
+
+// init_sqlite3 is a one-time initialization for the sqlite3 fuzzer.
+//
+// Imports the Python-level sqlite3 module attributes (connect,
+// complete_statement, register_adapter, Row, SQLITE_OK) and caches
+// them as static PyObject* pointers so they don't need to be looked
+// up on every fuzzer iteration.
+//
+// Also creates the helper Python objects used by op_sqlite3:
+// - _Agg class (aggregate step/finalize)
+// - identity lambda (scalar function callback)
+// - auth lambda (authorizer callback, returns SQLITE_OK)
+// - collation lambda (three-way string comparison)
+// - _AdaptMe class (type adaptation target)
+// - adapter lambda (converts _AdaptMe -> str)
+//
+// These are built once via run_python_and_get() and held for the
+// lifetime of the process. Called from LLVMFuzzerTestOneInput on
+// the first invocation; the `initialized` guard makes it a no-op
+// on subsequent calls.
+static void init_sqlite3(void) {
+    if (initialized) return;
+
+    sqlite3_connect = import_attr("sqlite3", "connect");
+    sqlite3_complete_statement = import_attr("sqlite3", "complete_statement");
+    sqlite3_register_adapter = import_attr("sqlite3", "register_adapter");
+    sqlite3_Row = import_attr("sqlite3", "Row");
+    // Fetch the integer value of sqlite3.SQLITE_OK (typically 0).
+    // Used to build the authorizer callback that always permits operations.
+    {
+        PyObject *v = import_attr("sqlite3", "SQLITE_OK");
+        // If the lookup returned NULL, skip the conversion so the assert
+        // below reports the failure instead of crashing in Py_DECREF(NULL).
+        if (v) {
+            sqlite3_SQLITE_OK_val = PyLong_AsLong(v);
+            Py_DECREF(v);
+        }
+    }
+
+    // Aggregate class for conn.create_aggregate("fuzzagg", 1, _Agg).
+    // step() collects values; finalize() returns the count.
+    // Exercises the _sqlite3 aggregate callback dispatch path.
+    sqlite3_Aggregate_cls = run_python_and_get(
+        "class _Agg:\n"
+        " def __init__(self): self.vals = []\n"
+        " def step(self, v): self.vals.append(v)\n"
+        " def finalize(self): return len(self.vals)\n",
+        "_Agg");
+
+    // Identity function for conn.create_function("fuzzfn", 1, f).
+    // Simply returns its argument unchanged.
+    // Exercises the _sqlite3 scalar function callback dispatch path.
+    sqlite3_identity_fn = run_python_and_get("_f = lambda x: x\n", "_f");
+
+    // Authorizer callback for conn.set_authorizer(f).
+    // Accepts any number of args and always returns SQLITE_OK (permit).
+    // Exercises the _sqlite3 authorizer callback dispatch path without
+    // blocking any SQL operations.
+    {
+        char buf[80];
+        snprintf(buf, sizeof(buf), "_f = lambda *a: %ld\n", sqlite3_SQLITE_OK_val);
+        sqlite3_auth_fn = run_python_and_get(buf, "_f");
+    }
+
+    // Collation function for conn.create_collation("fuzz", f).
+    // Standard three-way comparison: returns -1, 0, or 1.
+    // Exercises the _sqlite3 collation callback dispatch path used by
+    // ORDER BY ... COLLATE fuzz.
+    sqlite3_collation_fn = run_python_and_get(
+        "_f = lambda a, b: (a > b) - (a < b)\n", "_f");
+
+    // Adapter class for sqlite3.register_adapter(_AdaptMe, f).
+    // A simple wrapper holding a single value .v.
+    // Exercises the _sqlite3 type adaptation path: when an _AdaptMe instance
+    // is passed as a query parameter, SQLite calls the registered adapter.
+    sqlite3_adapt_cls = run_python_and_get(
+        "class _AdaptMe:\n"
+        " def __init__(self, v): self.v = v\n",
+        "_AdaptMe");
+
+    // Adapter function for sqlite3.register_adapter(_AdaptMe, f).
+    // Converts an _AdaptMe instance to a string via str(a.v).
+    // Exercises the _sqlite3 adapter lookup and conversion path.
+    sqlite3_adapter_fn = run_python_and_get("_f = lambda a: str(a.v)\n", "_f");
+
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// Helper: Create a memory connection with PRAGMA max_page_count=100.
+// Returns a new reference, or NULL (with any Python error from the PRAGMA
+// cleared) when the connection could not be set up.
+static PyObject *make_sqlite_conn() {
+    PyObject *conn = PyObject_CallFunction(sqlite3_connect, "s", ":memory:");
+    if (!conn) return NULL;
+    PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                  "PRAGMA max_page_count=100");
+    if (!r) {
+        PyErr_Clear();
+        Py_DECREF(conn);
+        return NULL;
+    }
+    return conn;
+}
+
+// op_sqlite3: connect(':memory:'), then loop 1-6 times (fdp-driven).
+// Each iteration FDP selects one of many single-call targets. Since
+// iterations share the same connection, the fuzzer naturally chains
+// operations (e.g. CREATE_TABLE → INSERT_PARAM → SELECT_LIKE across
+// three iterations). Each case makes exactly one SQL/API call.
+static void op_sqlite3(FuzzedDataProvider &fdp) {
+    enum {
+        // Basic SQL execution
+        EXECUTE,                // conn.execute(fuzz_sql)
+        EXECUTESCRIPT,          // conn.executescript(fuzz_sql)
+        COMPLETE_STMT,          // sqlite3.complete_statement(fuzz_sql)
+        // Table setup
+        CREATE_TABLE_TEXT,      // CREATE TABLE t(a TEXT)
+        CREATE_TABLE_BLOB,      // CREATE TABLE t(a BLOB)
+        CREATE_TABLE_INT,       // CREATE TABLE t(v INTEGER)
+        CREATE_TABLE_MULTI,     // CREATE TABLE t(a TEXT, b BLOB)
+        CREATE_TABLE_TEXT_INT,  // CREATE TABLE t(a TEXT, b INTEGER)
+        // Insert operations
+        INSERT_TEXT,            // INSERT INTO t VALUES(?) with fuzz text
+        INSERT_PARAM,           // INSERT INTO t VALUES(?,?) with text + blob
+        INSERT_ADAPTED,         // INSERT INTO t VALUES(?) with _AdaptMe obj
+        // Bulk insert
+        EXECUTEMANY_INT,        // conn.executemany("INSERT INTO t VALUES(?)", rows)
+        // Select operations
+        SELECT_ALL,             // SELECT * FROM t
+        SELECT_LIKE,            // SELECT * FROM t WHERE a LIKE ?
+        SELECT_AGGREGATES,      // SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t
+        SELECT_ORDERED,         // SELECT * FROM t ORDER BY a COLLATE <collation_name>
+        SELECT_VIA_FUNC,        // SELECT <func_name>(a) FROM t
+        SELECT_VIA_AGG,         // SELECT <agg_name>(v) FROM t
+        // Feature registration
+        CREATE_FUNCTION,        // conn.create_function(fuzz_name, fuzz_narg, ...)
+        CREATE_AGGREGATE,       // conn.create_aggregate(fuzz_name, fuzz_narg, ...)
+        SET_AUTHORIZER,         // conn.set_authorizer(auth_fn)
+        CREATE_COLLATION,       // conn.create_collation(fuzz_name, ...)
+        SET_ROW_FACTORY,        // conn.row_factory = sqlite3.Row
+        REGISTER_ADAPTER,       // sqlite3.register_adapter(_AdaptMe, ...)
+        // Blob I/O
+        BLOBOPEN,               // conn.blobopen("main","t","a", rowid)
+        NUM_TARGETS
+    };
+
+    PyRef conn(make_sqlite_conn());
+    CHECK(conn);
+
+    // Track the last-registered fuzz-derived names so that SELECT_VIA_FUNC,
+    // SELECT_VIA_AGG, and SELECT_ORDERED can reference whatever was registered
+    // by an earlier iteration's CREATE_FUNCTION / CREATE_AGGREGATE /
+    // CREATE_COLLATION case.
+    std::string func_name, agg_name, collation_name;
+
+    // Helper: consume a fuzz-derived string from fdp. Only called by cases
+    // that actually need string data, so cases like CREATE_TABLE_* and
+    // SET_AUTHORIZER don't waste fuzz bytes. Returns the raw bytes together
+    // with their str conversion; the PyRef is empty when no bytes were left
+    // or when the conversion returned NULL.
+    auto consume_pystr = [&fdp]() -> std::pair<std::string, PyRef> {
+        int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+        if (fdp.remaining_bytes() == 0)
+            return {std::string(), PyRef(nullptr)};
+        size_t data_len = fdp.ConsumeIntegralInRange<size_t>(
+            1, std::min(fdp.remaining_bytes(), (size_t)10000));
+        std::string data = fdp.ConsumeBytesAsString(data_len);
+        PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+        return {std::move(data), std::move(pystr)};
+    };
+
+    // Loop so the fuzzer can chain single-call operations on the same
+    // connection across iterations (e.g. CREATE_TABLE_TEXT → INSERT_TEXT
+    // → SET_AUTHORIZER → SELECT_ALL). FDP picks the iteration count (1-6).
+    int num_iters = fdp.ConsumeIntegralInRange<int>(1, 6);
+    for (int iter = 0; iter < num_iters && fdp.remaining_bytes() > 0; iter++) {
+        int target_fn = fdp.ConsumeIntegralInRange<int>(0, NUM_TARGETS - 1);
+
+        switch (target_fn) {
+            case EXECUTE: {
+                // conn.execute(fuzz_sql) — arbitrary SQL on the live connection.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                PyRef r = PyObject_CallMethod(conn, "execute", "O", (PyObject *)pystr);
+                break;
+            }
+            case EXECUTESCRIPT: {
+                // conn.executescript(fuzz_sql) — multi-statement SQL in autocommit.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+                if (slen > 0) {
+                    PyRef r = PyObject_CallMethod(conn, "executescript", "O",
+                                                  (PyObject *)pystr);
+                } else {
+                    PyRef def = PyUnicode_FromString("SELECT 1;");
+                    PyRef r = PyObject_CallMethod(conn, "executescript", "O",
+                                                  (PyObject *)def);
+                }
+                break;
+            }
+            case COMPLETE_STMT: {
+                // sqlite3.complete_statement(fuzz_sql) — checks for trailing ";".
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr);
+                if (slen > 0) {
+                    PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O",
+                                                    (PyObject *)pystr);
+                } else {
+                    PyRef def = PyUnicode_FromString("SELECT 1;");
+                    PyRef r = PyObject_CallFunction(sqlite3_complete_statement, "O",
+                                                    (PyObject *)def);
+                }
+                break;
+            }
+            case CREATE_TABLE_TEXT: {
+                // CREATE TABLE t(a TEXT) — single text column.
+                PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                              "CREATE TABLE t(a TEXT)");
+                break;
+            }
+            case CREATE_TABLE_BLOB: {
+                // CREATE TABLE t(a BLOB) — single blob column.
+                PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                              "CREATE TABLE t(a BLOB)");
+                break;
+            }
+            case CREATE_TABLE_INT: {
+                // CREATE TABLE t(v INTEGER) — single integer column.
+                PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                              "CREATE TABLE t(v INTEGER)");
+                break;
+            }
+            case CREATE_TABLE_MULTI: {
+                // CREATE TABLE t(a TEXT, b BLOB) — text + blob columns.
+                PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                              "CREATE TABLE t(a TEXT, b BLOB)");
+                break;
+            }
+            case CREATE_TABLE_TEXT_INT: {
+                // CREATE TABLE t(a TEXT, b INTEGER) — text + integer columns.
+                PyRef r = PyObject_CallMethod(conn, "execute", "s",
+                                              "CREATE TABLE t(a TEXT, b INTEGER)");
+                break;
+            }
+            case INSERT_TEXT: {
+                // INSERT INTO t VALUES(?) — bind fuzz text as a parameter.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                PyRef params = PyTuple_Pack(1, (PyObject *)pystr);
+                CHECK(params);
+                PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                              "INSERT INTO t VALUES(?)",
+                                              (PyObject *)params);
+                break;
+            }
+            case INSERT_PARAM: {
+                // INSERT INTO t VALUES(?,?) — bind fuzz text + fuzz bytes.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                PyRef pydata = PyBytes_FromStringAndSize(data.data(), data.size());
+                CHECK(pydata);
+                PyRef params = PyTuple_Pack(2, (PyObject *)pystr, (PyObject *)pydata);
+                CHECK(params);
+                PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                              "INSERT INTO t VALUES(?, ?)",
+                                              (PyObject *)params);
+                break;
+            }
+            case INSERT_ADAPTED: {
+                // INSERT INTO t VALUES(?) — bind an _AdaptMe(fuzz_substr) object.
+                // Requires REGISTER_ADAPTER to have run earlier on this connection
+                // for the adapter to fire; otherwise sqlite3 raises an error.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                Py_ssize_t sub_len = fdp.ConsumeIntegralInRange<Py_ssize_t>(
+                    0, PyUnicode_GET_LENGTH(pystr));
+                PyRef sub = PyUnicode_Substring(pystr, 0, sub_len);
+                if (!sub) { PyErr_Clear(); break; }
+                PyRef obj = PyObject_CallFunction(sqlite3_adapt_cls, "O",
+                                                  (PyObject *)sub);
+                if (!obj) { PyErr_Clear(); break; }
+                PyRef params = PyTuple_Pack(1, (PyObject *)obj);
+                CHECK(params);
+                PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                              "INSERT INTO t VALUES(?)",
+                                              (PyObject *)params);
+                break;
+            }
+            case EXECUTEMANY_INT: {
+                // conn.executemany("INSERT INTO t VALUES(?)", rows) — bulk insert
+                // of integer tuples built from fuzz bytes.
+                auto [data, pystr] = consume_pystr();
+                // Only the raw bytes are used here. Clear any error a failed
+                // str conversion may have left so the calls below do not run
+                // with an exception pending.
+                if (!pystr) PyErr_Clear();
+                PyRef rows = PyList_New(0);
+                CHECK(rows);
+                size_t limit = fdp.ConsumeIntegralInRange<size_t>(
+                    0, std::min(data.size(), (size_t)10000));
+                for (size_t i = 0; i < limit; i++) {
+                    PyRef val = PyLong_FromLong((unsigned char)data[i]);
+                    if (!val) { PyErr_Clear(); continue; }
+                    PyRef tup = PyTuple_Pack(1, (PyObject *)val);
+                    if (tup) PyList_Append(rows, tup);
+                }
+                PyRef r = PyObject_CallMethod(conn, "executemany", "sO",
+                                              "INSERT INTO t VALUES(?)",
+                                              (PyObject *)rows);
+                break;
+            }
+            case SELECT_ALL: {
+                // SELECT * FROM t — fetch all rows.
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                                "SELECT * FROM t");
+                if (cur) {
+                    PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                }
+                break;
+            }
+            case SELECT_LIKE: {
+                // SELECT * FROM t WHERE a LIKE ? — parameterized LIKE query.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                Py_ssize_t sub_len = fdp.ConsumeIntegralInRange<Py_ssize_t>(
+                    0, PyUnicode_GET_LENGTH(pystr));
+                PyRef sub = PyUnicode_Substring(pystr, 0, sub_len);
+                if (!sub) { PyErr_Clear(); break; }
+                PyRef params = PyTuple_Pack(1, (PyObject *)sub);
+                CHECK(params);
+                PyRef r = PyObject_CallMethod(conn, "execute", "sO",
+                                              "SELECT * FROM t WHERE a LIKE ?",
+                                              (PyObject *)params);
+                break;
+            }
+            case SELECT_AGGREGATES: {
+                // SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t.
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                    "SELECT count(*), sum(v), avg(v), min(v), max(v) FROM t");
+                if (cur) {
+                    PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                }
+                break;
+            }
+            case SELECT_ORDERED: {
+                // SELECT * FROM t ORDER BY a COLLATE <collation_name>.
+                // Uses the name from the last CREATE_COLLATION iteration.
+                if (collation_name.empty()) break;
+                std::string sql = "SELECT * FROM t ORDER BY a COLLATE \"" +
+                                  collation_name + "\"";
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s", sql.c_str());
+                if (cur) {
+                    PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                }
+                break;
+            }
+            case SELECT_VIA_FUNC: {
+                // SELECT <func_name>(a) FROM t — triggers scalar function callback.
+                // Uses the name from the last CREATE_FUNCTION iteration.
+                if (func_name.empty()) break;
+                std::string sql = "SELECT \"" + func_name + "\"(a) FROM t";
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s", sql.c_str());
+                if (cur) {
+                    PyRef rows = PyObject_CallMethod(cur, "fetchall", NULL);
+                }
+                break;
+            }
+            case SELECT_VIA_AGG: {
+                // SELECT <agg_name>(v) FROM t — triggers step()/finalize() callbacks.
+                // Uses the name from the last CREATE_AGGREGATE iteration.
+                if (agg_name.empty()) break;
+                std::string sql = "SELECT \"" + agg_name + "\"(v) FROM t";
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s", sql.c_str());
+                if (cur) {
+                    PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                }
+                break;
+            }
+            case CREATE_FUNCTION: {
+                // conn.create_function(name, narg, identity_fn) — register scalar fn
+                // with a fuzz-derived name and argument count. Exercises _sqlite3
+                // create_function name handling and narg validation.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                const char *name = PyUnicode_AsUTF8(pystr);
+                if (!name) { PyErr_Clear(); break; }
+                int narg = fdp.ConsumeIntegralInRange<int>(-1, 8);
+                PyRef r = PyObject_CallMethod(conn, "create_function", "siO",
+                                              name, narg, sqlite3_identity_fn);
+                if (!r) { PyErr_Clear(); break; }
+                func_name.assign(name);
+                break;
+            }
+            case CREATE_AGGREGATE: {
+                // conn.create_aggregate(name, narg, _Agg) — register aggregate fn
+                // with a fuzz-derived name and argument count. Exercises _sqlite3
+                // create_aggregate name handling and narg validation.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                const char *name = PyUnicode_AsUTF8(pystr);
+                if (!name) { PyErr_Clear(); break; }
+                int narg = fdp.ConsumeIntegralInRange<int>(-1, 8);
+                PyRef r = PyObject_CallMethod(conn, "create_aggregate", "siO",
+                                              name, narg, sqlite3_Aggregate_cls);
+                if (!r) { PyErr_Clear(); break; }
+                agg_name.assign(name);
+                break;
+            }
+            case SET_AUTHORIZER: {
+                // conn.set_authorizer(auth_fn) — install authorizer callback.
+                // All subsequent SQL on this connection goes through the authorizer.
+                PyRef r = PyObject_CallMethod(conn, "set_authorizer", "O",
+                                              sqlite3_auth_fn);
+                break;
+            }
+            case CREATE_COLLATION: {
+                // conn.create_collation(name, collation_fn) — register collation
+                // with a fuzz-derived name. Exercises _sqlite3 create_collation
+                // name handling.
+                auto [data, pystr] = consume_pystr();
+                if (!pystr) { PyErr_Clear(); break; }
+                const char *name = PyUnicode_AsUTF8(pystr);
+                if (!name) { PyErr_Clear(); break; }
+                PyRef r = PyObject_CallMethod(conn, "create_collation", "sO",
+                                              name, sqlite3_collation_fn);
+                if (!r) { PyErr_Clear(); break; }
+                collation_name.assign(name);
+                break;
+            }
+            case SET_ROW_FACTORY: {
+                // conn.row_factory = sqlite3.Row — switch to Row-based fetch.
+                // Subsequent SELECT results on this connection return sqlite3.Row
+                // objects instead of plain tuples.
+                PyObject_SetAttrString(conn, "row_factory", sqlite3_Row);
+                break;
+            }
+            case REGISTER_ADAPTER: {
+                // sqlite3.register_adapter(_AdaptMe, adapter_fn) — global registration.
+                PyRef reg = PyObject_CallFunction(sqlite3_register_adapter, "OO",
+                                                  sqlite3_adapt_cls,
+                                                  sqlite3_adapter_fn);
+                break;
+            }
+            case BLOBOPEN: {
+                // conn.blobopen("main","t","a", rowid) — open incremental I/O handle.
+                // Requires a table "t" with a BLOB column "a" and at least one row.
+                // Reads the first rowid, opens the blob, then does one read or write
+                // (fdp-chosen) before closing.
+                PyRef cur = PyObject_CallMethod(conn, "execute", "s",
+                                                "SELECT rowid FROM t LIMIT 1");
+                if (!cur) { PyErr_Clear(); break; }
+                PyRef row = PyObject_CallMethod(cur, "fetchone", NULL);
+                if (!row || row.p == Py_None) { PyErr_Clear(); break; }
+                PyRef rid = PySequence_GetItem(row, 0);
+                CHECK(rid);
+                PyRef blob = PyObject_CallMethod(conn, "blobopen", "sssO",
+                                                 "main", "t", "a", (PyObject *)rid);
+                if (!blob) { PyErr_Clear(); break; }
+                if (fdp.ConsumeBool()) {
+                    // Read the blob content.
+                    PyRef rd = PyObject_CallMethod(blob, "read", NULL);
+                } else {
+                    // Write fuzz-derived bytes into the blob.
+                    auto [data, pystr] = consume_pystr();
+                    // Only the raw bytes are written. Clear any error a failed
+                    // str conversion may have left before calling write().
+                    if (!pystr) PyErr_Clear();
+                    size_t wr_len = fdp.ConsumeIntegralInRange<size_t>(
+                        0, std::min(data.size(), (size_t)10000));
+                    PyRef wr_data = PyBytes_FromStringAndSize(data.data(), wr_len);
+                    if (wr_data) {
+                        PyRef wr = PyObject_CallMethod(blob, "write", "O",
+                                                       (PyObject *)wr_data);
+                    }
+                }
+                {
+                    PyRef cl = PyObject_CallMethod(blob, "close", NULL);
+                }
+                break;
+            }
+            default:
+                break;
+        }
+        // Errors raised by the target are expected; drop them so the next
+        // iteration starts from a clean state.
+        if (PyErr_Occurred()) PyErr_Clear();
+    }  // end loop
+
+    PyRef cl = PyObject_CallMethod(conn, "close", NULL);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// libFuzzer entry point: runs one-time init, rejects empty inputs and inputs
+// larger than 64 KB (0x10000 bytes), then runs op_sqlite3 on the input.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    assert(Py_IsInitialized());
+    init_sqlite3();
+    if (size < 1 || size > 0x10000) return 0;
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    op_sqlite3(fdp);
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_ssl.cpp b/module-fuzzers/fuzz_ssl.cpp
new file mode 100644
index 0000000..a6e4efb
--- /dev/null
+++ b/module-fuzzers/fuzz_ssl.cpp
@@ -0,0 +1,84 @@
+// fuzz_ssl.cpp — Fuzzer for CPython's _ssl C extension module.
+//
+// This fuzzer exercises the following CPython C extension module via
+// its Python API, called through the Python C API from C++:
+//
+// _ssl — ssl.DER_cert_to_PEM_cert(), then optionally
+// SSLContext(PROTOCOL_TLS_CLIENT).load_verify_locations()
+//
+// Exercises the OpenSSL certificate parsing path in the _ssl C module.
+//
+// All module functions are imported once during init and cached as static
+// PyObject* pointers. PyRef (RAII) prevents reference leaks.
+// Max input size: 1 MB.
+
+#include "fuzz_helpers.h"
+
+// Callables looked up once by init_ssl().
+static PyObject *ssl_DER_cert_to_PEM_cert, *ssl_SSLContext;
+// Integer value of ssl.PROTOCOL_TLS_CLIENT, read once by init_ssl().
+static long ssl_PROTOCOL_TLS_CLIENT_val;
+
+static int initialized = 0;
+
+// init_ssl: One-time setup. Looks up the ssl attributes this fuzzer needs
+// and stores them in the statics above; later calls return immediately.
+static void init_ssl(void) {
+    if (initialized) return;
+
+    ssl_DER_cert_to_PEM_cert = import_attr("ssl", "DER_cert_to_PEM_cert");
+    ssl_SSLContext = import_attr("ssl", "SSLContext");
+    {
+        // Keep only the integer value; the attribute object is released below.
+        PyObject *v = import_attr("ssl", "PROTOCOL_TLS_CLIENT");
+        ssl_PROTOCOL_TLS_CLIENT_val = PyLong_AsLong(v);
+        Py_DECREF(v);
+    }
+
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// op_ssl_cert: Call ssl.DER_cert_to_PEM_cert(data) to attempt DER-to-PEM
+// certificate conversion. If successful, create an SSLContext with
+// PROTOCOL_TLS_CLIENT and call .load_verify_locations(cadata=pem_string)
+// to exercise the OpenSSL certificate parsing path in the _ssl C module.
+// The whole remaining fuzz input is used as the DER blob.
+static void op_ssl_cert(FuzzedDataProvider &fdp) {
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    PyRef pydata = PyBytes_FromStringAndSize(Y(data));
+    CHECK(pydata);
+    PyRef pem = PyObject_CallFunction(ssl_DER_cert_to_PEM_cert, "O",
+                                      (PyObject *)pydata);
+    if (!pem) { PyErr_Clear(); return; }
+
+    // Conversion succeeded, so feed the PEM text to a fresh SSLContext.
+    PyRef ctx = PyObject_CallFunction(ssl_SSLContext, "l",
+                                      ssl_PROTOCOL_TLS_CLIENT_val);
+    if (!ctx) { PyErr_Clear(); return; }
+
+    PyRef kwargs = PyDict_New();
+    CHECK(kwargs);
+    // Bail out rather than call Python with an error pending.
+    if (PyDict_SetItemString(kwargs, "cadata", pem) < 0) { PyErr_Clear(); return; }
+    PyRef empty_args = PyTuple_New(0);
+    CHECK(empty_args);
+    PyRef method = PyObject_GetAttrString(ctx, "load_verify_locations");
+    if (!method) { PyErr_Clear(); return; }
+    PyRef r = PyObject_Call(method, empty_args, kwargs);
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// libFuzzer entry point: initialise once, skip empty inputs and inputs
+// larger than kMaxInputSize, then run op_ssl_cert on the data.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    assert(Py_IsInitialized());
+    init_ssl();
+    if (size < 1 || size > kMaxInputSize) return 0;
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    op_ssl_cert(fdp);
+
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_textops.cpp b/module-fuzzers/fuzz_textops.cpp
deleted file mode 100644
index 03551e2..0000000
--- a/module-fuzzers/fuzz_textops.cpp
+++ /dev/null
@@ -1,467 +0,0 @@
-// fuzz_textops.cpp — Fuzzer for CPython's text-processing C extension modules.
-//
-// This fuzzer exercises the following CPython C extension modules via
-// their Python API, called through the Python C API from C++:
-//
-// datetime — date/time/datetime.fromisoformat(), strptime(),
-// strftime(), format()
-// collections — _count_elements (Counter internals)
-// unicodedata — category, bidirectional, numeric, decimal,
-// combining, east_asian_width, mirrored, name,
-// decomposition, normalize, is_normalized, lookup,
-// ucd_3_2_0.normalize
-// _io (StringIO) — write, seek, read, getvalue, readline, readlines,
-// truncate, iteration
-//
-// The first byte of fuzz input selects one of 6 operation types. Each
-// operation consumes further bytes via FuzzedDataProvider to parameterize
-// the call (format selection, character range, normalization form).
-//
-// All module functions and class constructors are imported once during init
-// and cached as static PyObject* pointers.
PyRef (RAII) prevents reference -// leaks. PyGC_Collect() runs every 200 iterations. Max input size: 64 KB. - -#include "fuzz_helpers.h" - -// --------------------------------------------------------------------------- -// Cached module objects, initialized once. -// --------------------------------------------------------------------------- - -// datetime -static PyObject *dt_date, *dt_time, *dt_datetime; - -// collections -static PyObject *collections_count_elements; - -// unicodedata -static PyObject *ud_category, *ud_bidirectional, *ud_normalize, *ud_numeric; -static PyObject *ud_lookup, *ud_name, *ud_decomposition, *ud_is_normalized; -static PyObject *ud_east_asian_width, *ud_mirrored, *ud_decimal, *ud_combining; -static PyObject *ud_ucd_3_2_0; - -// io -static PyObject *stringio_ctor; - -// struct -static PyObject *struct_unpack; - -static unsigned long gc_counter = 0; - -static int initialized = 0; - -static void init_textops(void) { - if (initialized) return; - - // datetime - dt_date = import_attr("datetime", "date"); - dt_time = import_attr("datetime", "time"); - dt_datetime = import_attr("datetime", "datetime"); - - // collections - collections_count_elements = import_attr("collections", "_count_elements"); - - // unicodedata - ud_category = import_attr("unicodedata", "category"); - ud_bidirectional = import_attr("unicodedata", "bidirectional"); - ud_normalize = import_attr("unicodedata", "normalize"); - ud_numeric = import_attr("unicodedata", "numeric"); - ud_lookup = import_attr("unicodedata", "lookup"); - ud_name = import_attr("unicodedata", "name"); - ud_decomposition = import_attr("unicodedata", "decomposition"); - ud_is_normalized = import_attr("unicodedata", "is_normalized"); - ud_east_asian_width = import_attr("unicodedata", "east_asian_width"); - ud_mirrored = import_attr("unicodedata", "mirrored"); - ud_decimal = import_attr("unicodedata", "decimal"); - ud_combining = import_attr("unicodedata", "combining"); - ud_ucd_3_2_0 = 
import_attr("unicodedata", "ucd_3_2_0"); - - // io - stringio_ctor = import_attr("io", "StringIO"); - - // struct - struct_unpack = import_attr("struct", "unpack"); - - // Suppress warnings. - PyRun_SimpleString("import warnings; warnings.filterwarnings('ignore')"); - - assert(!PyErr_Occurred()); - initialized = 1; -} - -// --------------------------------------------------------------------------- -// Operations (6 ops). -// --------------------------------------------------------------------------- - -// OP_DATETIME_PARSE: FDP selects variant — date/time/datetime.fromisoformat() -// or datetime.strptime() with a fuzz-chosen format string. Exercises the -// datetime C module's parsing paths. -static void op_datetime_parse(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 4); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - switch (variant) { - case 0: { - PyRef r = PyObject_CallMethod(dt_date, "fromisoformat", "O", - (PyObject *)pystr); - break; - } - case 1: { - PyRef r = PyObject_CallMethod(dt_time, "fromisoformat", "O", - (PyObject *)pystr); - break; - } - case 2: { - PyRef r = PyObject_CallMethod(dt_datetime, "fromisoformat", "O", - (PyObject *)pystr); - break; - } - case 3: { - PyRef fmt = PyUnicode_FromString("%Y-%m-%d %H:%M:%S"); - CHECK(fmt); - PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", - (PyObject *)pystr, (PyObject *)fmt); - break; - } - case 4: { - PyRef fmt = PyUnicode_FromString("%Y/%m/%dT%H:%M"); - CHECK(fmt); - PyRef r = PyObject_CallMethod(dt_datetime, "strptime", "OO", - (PyObject *)pystr, (PyObject *)fmt); - break; - } - } - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_DATETIME_FORMAT: Unpack 6 shorts from first 12 bytes to build a valid -// datetime, then call strftime() with the remaining fuzz data as the format -// string. 
Exercises datetime formatting code paths. -static void op_datetime_format(FuzzedDataProvider &fdp) { - // Need at least 12 bytes for the datetime fields. - std::string header = fdp.ConsumeBytesAsString(12); - if (header.size() < 12) return; - - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string fmt_data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef fmt_str(fuzz_bytes_to_str(fmt_data, str_enc)); - CHECK(fmt_str); - - // Unpack 6 unsigned shorts via struct.unpack. - PyRef hdr_bytes = PyBytes_FromStringAndSize(header.data(), 12); - CHECK(hdr_bytes); - PyRef vals = PyObject_CallFunction(struct_unpack, "sO", "6H", - (PyObject *)hdr_bytes); - CHECK(vals); - - // Extract fields and clamp to valid ranges. - long v[6]; - for (int i = 0; i < 6; i++) { - PyObject *item = PyTuple_GetItem(vals, i); - v[i] = PyLong_AsLong(item); - } - long year = (v[0] % 9999) + 1; - long month = (v[1] % 12) + 1; - long day = (v[2] % 28) + 1; - long hour = v[3] % 24; - long minute = v[4] % 60; - long second = v[5] % 60; - - PyRef dt = PyObject_CallFunction(dt_datetime, "llllll", - year, month, day, hour, minute, second); - CHECK(dt); - - // strftime on datetime. - { - PyRef r = PyObject_CallMethod(dt, "strftime", "O", (PyObject *)fmt_str); - if (PyErr_Occurred()) PyErr_Clear(); - } - - // strftime on date. - { - PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); - if (date_obj) { - PyRef r = PyObject_CallMethod(date_obj, "strftime", "O", - (PyObject *)fmt_str); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - - // strftime on time. - { - PyRef time_obj = PyObject_CallMethod(dt, "time", NULL); - if (time_obj) { - PyRef r = PyObject_CallMethod(time_obj, "strftime", "O", - (PyObject *)fmt_str); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } - - // format(date, str[:16]). - { - PyRef date_obj = PyObject_CallMethod(dt, "date", NULL); - if (date_obj) { - // Cap format spec to 16 chars. 
- Py_ssize_t flen = PyUnicode_GET_LENGTH(fmt_str); - PyRef short_fmt = PyUnicode_Substring(fmt_str, 0, - flen < 16 ? flen : 16); - if (short_fmt) { - PyRef r = PyObject_Format(date_obj, short_fmt); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - } else { - PyErr_Clear(); - } - } -} - -// OP_COLLECTIONS_COUNT: Build a dict and call collections._count_elements() -// with a fuzz-generated string. Exercises the Counter internals C path. -static void op_collections_count(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef d = PyDict_New(); - CHECK(d); - PyRef r = PyObject_CallFunction(collections_count_elements, "OO", - (PyObject *)d, (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// OP_UNICODEDATA_CHARINFO: Convert data to str (cap 200 chars), then call -// per-character unicodedata functions. FDP selects which functions to call. -// Exercises the unicodedata C module character-info paths. 
-static void op_unicodedata_charinfo(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - uint8_t func_mask = fdp.ConsumeIntegral(); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)800)); // ~200 chars max - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - Py_ssize_t len = PyUnicode_GET_LENGTH(pystr); - if (len > 200) len = 200; - - PyRef neg_one = PyLong_FromLong(-1); - CHECK(neg_one); - PyRef empty_str = PyUnicode_FromString(""); - CHECK(empty_str); - - for (Py_ssize_t i = 0; i < len; i++) { - PyRef ch = PyUnicode_Substring(pystr, i, i + 1); - if (!ch) { PyErr_Clear(); continue; } - - if (func_mask & 0x01) { - PyRef r = PyObject_CallFunction(ud_category, "O", (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x02) { - PyRef r = PyObject_CallFunction(ud_bidirectional, "O", (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x04) { - PyRef r = PyObject_CallFunction(ud_numeric, "OO", - (PyObject *)ch, (PyObject *)neg_one); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x08) { - PyRef r = PyObject_CallFunction(ud_decimal, "OO", - (PyObject *)ch, (PyObject *)neg_one); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x10) { - PyRef r = PyObject_CallFunction(ud_combining, "O", (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x20) { - PyRef r = PyObject_CallFunction(ud_east_asian_width, "O", - (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x40) { - PyRef r = PyObject_CallFunction(ud_mirrored, "O", (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - if (func_mask & 0x80) { - PyRef r = PyObject_CallFunction(ud_name, "OO", - (PyObject *)ch, (PyObject *)empty_str); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r2 = PyObject_CallFunction(ud_decomposition, "O", - (PyObject *)ch); - if (PyErr_Occurred()) PyErr_Clear(); - } - } -} - -// 
OP_UNICODEDATA_NORMALIZE: FDP selects normalization form from -// {NFC, NFD, NFKC, NFKD}, calls normalize() and is_normalized(). -// Optionally calls ucd_3_2_0.normalize() and lookup(). -static void op_unicodedata_normalize(FuzzedDataProvider &fdp) { - static const char *kForms[] = {"NFC", "NFD", "NFKC", "NFKD"}; - int form_idx = fdp.ConsumeIntegralInRange(0, 3); - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - bool try_ucd = fdp.ConsumeBool(); - bool try_lookup = fdp.ConsumeBool(); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - const char *form = kForms[form_idx]; - - // normalize(form, str) - { - PyRef r = PyObject_CallFunction(ud_normalize, "sO", - form, (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - - // is_normalized(form, str) - { - PyRef r = PyObject_CallFunction(ud_is_normalized, "sO", - form, (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - - // ucd_3_2_0.normalize('NFC', str) - if (try_ucd) { - PyRef r = PyObject_CallMethod(ud_ucd_3_2_0, "normalize", "sO", - "NFC", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } - - // lookup(str) - if (try_lookup) { - PyRef r = PyObject_CallFunction(ud_lookup, "O", (PyObject *)pystr); - if (PyErr_Occurred()) PyErr_Clear(); - } -} - -// OP_STRINGIO: Create io.StringIO(), write fuzz str, then exercise -// read/readline/readlines/truncate/iteration. Exercises _io/stringio.c. -static void op_stringio(FuzzedDataProvider &fdp) { - int str_enc = fdp.ConsumeIntegralInRange(0, 3); - int variant = fdp.ConsumeIntegralInRange(0, 2); - std::string data = fdp.ConsumeBytesAsString( - std::min(fdp.remaining_bytes(), (size_t)10000)); - PyRef pystr(fuzz_bytes_to_str(data, str_enc)); - CHECK(pystr); - - PyRef sio = PyObject_CallFunction(stringio_ctor, NULL); - CHECK(sio); - - // Write the fuzz string. 
- PyRef wr = PyObject_CallMethod(sio, "write", "O", (PyObject *)pystr); - if (!wr) { PyErr_Clear(); return; } - - // Seek to start. - PyRef sk = PyObject_CallMethod(sio, "seek", "i", 0); - if (!sk) { PyErr_Clear(); return; } - - switch (variant) { - case 0: { - // read + getvalue - PyRef r1 = PyObject_CallMethod(sio, "read", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r2 = PyObject_CallMethod(sio, "getvalue", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 1: { - // readline x3 + readlines - for (int i = 0; i < 3; i++) { - PyRef r = PyObject_CallMethod(sio, "readline", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - } - PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef r = PyObject_CallMethod(sio, "readlines", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - break; - } - case 2: { - // truncate + tell + iteration - Py_ssize_t slen = PyUnicode_GET_LENGTH(pystr); - long trunc_at = slen < 64 ? slen : 64; - PyRef tr = PyObject_CallMethod(sio, "truncate", "l", trunc_at); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef tl = PyObject_CallMethod(sio, "tell", NULL); - if (PyErr_Occurred()) PyErr_Clear(); - PyRef sk2 = PyObject_CallMethod(sio, "seek", "i", 0); - if (PyErr_Occurred()) PyErr_Clear(); - // Iterate. - PyRef it = PyObject_GetIter(sio); - if (it) { - PyObject *line; - while ((line = PyIter_Next(it)) != NULL) - Py_DECREF(line); - if (PyErr_Occurred()) PyErr_Clear(); - } else { - PyErr_Clear(); - } - break; - } - } - - PyRef cl = PyObject_CallMethod(sio, "close", NULL); - if (PyErr_Occurred()) PyErr_Clear(); -} - -// --------------------------------------------------------------------------- -// Dispatch. 
-// --------------------------------------------------------------------------- - -enum Op { - OP_DATETIME_PARSE, - OP_DATETIME_FORMAT, - OP_COLLECTIONS_COUNT, - OP_UNICODEDATA_CHARINFO, - OP_UNICODEDATA_NORMALIZE, - OP_STRINGIO, - NUM_OPS -}; - -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - assert(Py_IsInitialized()); - init_textops(); - if (size < 1 || size > 0x10000) return 0; - if (PyErr_Occurred()) PyErr_Clear(); - - FuzzedDataProvider fdp(data, size); - switch (fdp.ConsumeIntegralInRange(0, NUM_OPS - 1)) { - case OP_DATETIME_PARSE: - op_datetime_parse(fdp); - break; - case OP_DATETIME_FORMAT: - op_datetime_format(fdp); - break; - case OP_COLLECTIONS_COUNT: - op_collections_count(fdp); - break; - case OP_UNICODEDATA_CHARINFO: - op_unicodedata_charinfo(fdp); - break; - case OP_UNICODEDATA_NORMALIZE: - op_unicodedata_normalize(fdp); - break; - case OP_STRINGIO: - op_stringio(fdp); - break; - } - - if (++gc_counter % kGcInterval == 0) PyGC_Collect(); - return 0; -} diff --git a/module-fuzzers/fuzz_time.cpp b/module-fuzzers/fuzz_time.cpp new file mode 100644 index 0000000..d42f936 --- /dev/null +++ b/module-fuzzers/fuzz_time.cpp @@ -0,0 +1,97 @@ +// fuzz_time.cpp — Fuzzer for CPython's time C extension module. +// +// This fuzzer exercises the following CPython C extension module via +// its Python API, called through the Python C API from C++: +// +// time — strftime with fuzz format, strptime with fuzz input, +// strptime with fuzz format +// +// All module functions are imported once during init and cached as static +// PyObject* pointers. PyRef (RAII) prevents reference leaks. +// Max input size: 64 KB. 
+
+#include "fuzz_helpers.h"
+
+// time-module callables, looked up once by init_time().
+static PyObject *time_strftime, *time_strptime, *time_localtime;
+
+static int initialized = 0;
+
+// init_time: One-time lookup of the callables above; a no-op afterwards.
+static void init_time(void) {
+    if (initialized) return;
+
+    time_strftime = import_attr("time", "strftime");
+    time_strptime = import_attr("time", "strptime");
+    time_localtime = import_attr("time", "localtime");
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// op_time: Runs one time-module call chosen by the fuzzer: strftime with a
+// fuzz-derived format, strptime with a fuzz-derived input string, or
+// strptime with a fuzz-derived format. When the decoded string is empty,
+// the two format-driven variants substitute a fixed format instead.
+static void op_time(FuzzedDataProvider &fdp) {
+    int str_enc = fdp.ConsumeIntegralInRange(0, 3);
+    enum { STRFTIME, STRPTIME_INPUT, STRPTIME_FORMAT, NUM_TARGETS };
+    int target_fn = fdp.ConsumeIntegralInRange(0, NUM_TARGETS - 1);
+    if (fdp.remaining_bytes() == 0) return;
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+    CHECK(pystr);
+
+    switch (target_fn) {
+        case STRFTIME: {
+            // time.strftime(fmt, time.localtime()), fmt = fuzz str or "%Y".
+            PyRef now = PyObject_CallFunction(time_localtime, NULL);
+            CHECK(now);
+            if (PyUnicode_GET_LENGTH(pystr) > 0) {
+                PyRef res = PyObject_CallFunction(
+                    time_strftime, "OO", (PyObject *)pystr, (PyObject *)now);
+            } else {
+                PyRef fallback = PyUnicode_FromString("%Y");
+                CHECK(fallback);
+                PyRef res = PyObject_CallFunction(
+                    time_strftime, "OO", (PyObject *)fallback, (PyObject *)now);
+            }
+            break;
+        }
+        case STRPTIME_INPUT: {
+            // time.strptime(fuzz str, fixed format).
+            PyRef res = PyObject_CallFunction(
+                time_strptime, "Os", (PyObject *)pystr, "%Y-%m-%d %H:%M:%S");
+            break;
+        }
+        case STRPTIME_FORMAT: {
+            // time.strptime(fixed timestamp, fmt), fmt = fuzz str or default.
+            const char *stamp = "2024-01-15 12:30:00";
+            if (PyUnicode_GET_LENGTH(pystr) > 0) {
+                PyRef res = PyObject_CallFunction(
+                    time_strptime, "sO", stamp, (PyObject *)pystr);
+            } else {
+                PyRef fallback = PyUnicode_FromString("%Y-%m-%d %H:%M:%S");
+                CHECK(fallback);
+                PyRef res = PyObject_CallFunction(
+                    time_strptime, "sO", stamp, (PyObject *)fallback);
+            }
+            break;
+        }
+    }
+    // Exceptions from the calls above are expected; discard them.
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// libFuzzer entry point: initialise once, skip empty inputs and inputs
+// larger than 64 KB (0x10000 bytes), then run op_time on the data.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    assert(Py_IsInitialized());
+    init_time();
+    if (size < 1 || size > 0x10000) return 0;
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    op_time(fdp);
+
+    return 0;
+}
diff --git a/module-fuzzers/fuzz_unicodedata.cpp b/module-fuzzers/fuzz_unicodedata.cpp
new file mode 100644
index 0000000..5c2eb84
--- /dev/null
+++ b/module-fuzzers/fuzz_unicodedata.cpp
@@ -0,0 +1,155 @@
+// fuzz_unicodedata.cpp — Fuzzer for CPython's unicodedata C extension module.
+//
+// This fuzzer exercises the following CPython C extension module via
+// its Python API, called through the Python C API from C++:
+//
+// unicodedata — category, bidirectional, numeric, decimal,
+// combining, east_asian_width, mirrored, name,
+// decomposition, normalize, is_normalized, lookup,
+// ucd_3_2_0.normalize
+//
+// The first two bytes of fuzz input select string encoding and target
+// function (one of 13). Remaining bytes become the input string.
+// Each target makes a single call.
+//
+// All module functions are imported once during init and cached as static
+// PyObject* pointers. PyRef (RAII) prevents reference leaks.
+// Max input size: 64 KB.
+
+#include "fuzz_helpers.h"
+
+// unicodedata attributes cached by init_unicodedata().
+static PyObject *ud_category, *ud_bidirectional, *ud_normalize, *ud_numeric;
+static PyObject *ud_lookup, *ud_name, *ud_decomposition, *ud_is_normalized;
+static PyObject *ud_east_asian_width, *ud_mirrored, *ud_decimal, *ud_combining;
+static PyObject *ud_ucd_3_2_0;
+
+static int initialized = 0;
+
+// init_unicodedata: One-time lookup of every attribute listed above.
+static void init_unicodedata(void) {
+    if (initialized) return;
+
+    ud_category = import_attr("unicodedata", "category");
+    ud_bidirectional = import_attr("unicodedata", "bidirectional");
+    ud_normalize = import_attr("unicodedata", "normalize");
+    ud_numeric = import_attr("unicodedata", "numeric");
+    ud_lookup = import_attr("unicodedata", "lookup");
+    ud_name = import_attr("unicodedata", "name");
+    ud_decomposition = import_attr("unicodedata", "decomposition");
+    ud_is_normalized = import_attr("unicodedata", "is_normalized");
+    ud_east_asian_width = import_attr("unicodedata", "east_asian_width");
+    ud_mirrored = import_attr("unicodedata", "mirrored");
+    ud_decimal = import_attr("unicodedata", "decimal");
+    ud_combining = import_attr("unicodedata", "combining");
+    ud_ucd_3_2_0 = import_attr("unicodedata", "ucd_3_2_0");
+    assert(!PyErr_Occurred());
+    initialized = 1;
+}
+
+// op_unicodedata: the fuzzer selects one of 13 targets — 9 single-character
+// functions (category, bidirectional, numeric, decimal, combining,
+// east_asian_width, mirrored, name, decomposition) or 4 whole-string
+// functions (normalize, is_normalized, ucd_3_2_0.normalize, lookup).
+// Each target makes a single call; numeric/decimal additionally take a
+// fuzz-chosen integer default, drawn before the string is consumed.
+static void op_unicodedata(FuzzedDataProvider &fdp) {
+    int str_enc = fdp.ConsumeIntegralInRange<int>(0, 3);
+    enum {
+        CATEGORY, BIDIRECTIONAL, NUMERIC, DECIMAL, COMBINING,
+        EAST_ASIAN_WIDTH, MIRRORED, NAME, DECOMPOSITION,
+        NORMALIZE, IS_NORMALIZED, UCD_NORMALIZE, LOOKUP,
+        NUM_TARGETS
+    };
+    int target_fn = fdp.ConsumeIntegralInRange<int>(0, NUM_TARGETS - 1);
+    // Draw the default value now: after ConsumeRemainingBytesAsString()
+    // the provider is empty and ConsumeIntegral() yields a constant.
+    long dflt_val = 0;
+    if (target_fn == NUMERIC || target_fn == DECIMAL) {
+        dflt_val = fdp.ConsumeIntegral<long>();
+    }
+    if (fdp.remaining_bytes() == 0) return;
+    std::string data = fdp.ConsumeRemainingBytesAsString();
+    PyRef pystr(fuzz_bytes_to_str(data, str_enc));
+    CHECK(pystr);
+
+    static const char *kForms[] = {"NFC", "NFD", "NFKC", "NFKD"};
+
+    switch (target_fn) {
+        case CATEGORY: {
+            PyRef r = PyObject_CallFunction(ud_category, "O", (PyObject *)pystr);
+            break;
+        }
+        case BIDIRECTIONAL: {
+            PyRef r = PyObject_CallFunction(ud_bidirectional, "O", (PyObject *)pystr);
+            break;
+        }
+        case NUMERIC:
+        case DECIMAL: {
+            // Same (chr, default) call shape; only the callee differs.
+            PyObject *fn = target_fn == NUMERIC ? ud_numeric : ud_decimal;
+            PyRef dflt = PyLong_FromLong(dflt_val);
+            CHECK(dflt);
+            PyRef r = PyObject_CallFunction(fn, "OO",
+                                            (PyObject *)pystr, (PyObject *)dflt);
+            break;
+        }
+        case COMBINING: {
+            PyRef r = PyObject_CallFunction(ud_combining, "O", (PyObject *)pystr);
+            break;
+        }
+        case EAST_ASIAN_WIDTH: {
+            PyRef r = PyObject_CallFunction(ud_east_asian_width, "O",
+                                            (PyObject *)pystr);
+            break;
+        }
+        case MIRRORED: {
+            PyRef r = PyObject_CallFunction(ud_mirrored, "O", (PyObject *)pystr);
+            break;
+        }
+        case NAME: {
+            PyRef empty_str = PyUnicode_FromString("");
+            CHECK(empty_str);
+            PyRef r = PyObject_CallFunction(ud_name, "OO",
+                                            (PyObject *)pystr, (PyObject *)empty_str);
+            break;
+        }
+        case DECOMPOSITION: {
+            PyRef r = PyObject_CallFunction(ud_decomposition, "O",
+                                            (PyObject *)pystr);
+            break;
+        }
+        case NORMALIZE:
+        case IS_NORMALIZED: {
+            // Form index reuses the encoding selector, which is already 0..3.
+            PyObject *fn = target_fn == NORMALIZE ? ud_normalize : ud_is_normalized;
+            const char *form = kForms[str_enc & 3];
+            PyRef r = PyObject_CallFunction(fn, "sO", form, (PyObject *)pystr);
+            break;
+        }
+        case UCD_NORMALIZE: {
+            PyRef r = PyObject_CallMethod(ud_ucd_3_2_0, "normalize", "sO",
+                                          "NFC", (PyObject *)pystr);
+            break;
+        }
+        case LOOKUP: {
+            PyRef r = PyObject_CallFunction(ud_lookup, "O", (PyObject *)pystr);
+            break;
+        }
+    }
+    if (PyErr_Occurred()) PyErr_Clear();
+}
+
+// libFuzzer entry point: initialise once, skip empty inputs and inputs
+// larger than 64 KB (0x10000 bytes), then run op_unicodedata on the data.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+    assert(Py_IsInitialized());
+    init_unicodedata();
+    if (size < 1 || size > 0x10000) return 0;
+    if (PyErr_Occurred()) PyErr_Clear();
+
+    FuzzedDataProvider fdp(data, size);
+    op_unicodedata(fdp);
+
+    return 0;
+}