From 7c9ea2f641c6dfd5402e570f579439d6a8fb4934 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 5 May 2026 11:18:19 +0200 Subject: [PATCH 1/2] GH-49918: [C++][Parquet] Catch std::vector allocation errors in encoding fuzzer --- cpp/src/arrow/stl_allocator.h | 14 +++++- .../parquet/arrow/fuzz_encoding_internal.cc | 49 +++++++++++++------ testing | 2 +- 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/stl_allocator.h b/cpp/src/arrow/stl_allocator.h index 82e6aaa8772b..71d86c2529c9 100644 --- a/cpp/src/arrow/stl_allocator.h +++ b/cpp/src/arrow/stl_allocator.h @@ -30,6 +30,16 @@ namespace arrow { namespace stl { +class BadAlloc : public std::bad_alloc { + public: + explicit BadAlloc(Status st) noexcept : st_(std::move(st)) {} + + const char* what() const noexcept override { return st_.message().c_str(); } + + protected: + Status st_; +}; + /// \brief A STL allocator delegating allocations to a Arrow MemoryPool template class allocator { @@ -50,7 +60,7 @@ class allocator { /// \brief Construct an allocator from the default MemoryPool allocator() noexcept : pool_(default_memory_pool()) {} /// \brief Construct an allocator from the given MemoryPool - explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {} + allocator(MemoryPool* pool) noexcept : pool_(pool) {} template allocator(const allocator& rhs) noexcept : pool_(rhs.pool()) {} @@ -64,7 +74,7 @@ class allocator { pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) { uint8_t* data; Status s = pool_->Allocate(n * sizeof(T), &data); - if (!s.ok()) throw std::bad_alloc(); + if (!s.ok()) throw BadAlloc(std::move(s)); return reinterpret_cast(data); } diff --git a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc index 0694f550b149..4270eb543788 100644 --- a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc +++ b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ #include "arrow/compare.h" #include "arrow/io/memory.h" #include "arrow/pretty_print.h" +#include "arrow/stl_allocator.h" #include "arrow/type.h" #include "arrow/util/fuzz_internal.h" #include "arrow/util/logging.h" @@ -153,6 +155,19 @@ namespace { // Just to use std::vector while avoiding std::vector using BooleanSlot = std::array; +template +using PoolAllocator = ::arrow::stl::allocator; + +template +using PoolVector = std::vector>; + +#define BEGIN_CATCH_BAD_ALLOC try { +#define END_CATCH_BAD_ALLOC \ + } \ + catch (const std::bad_alloc& e) { \ + return Status::OutOfMemory(e.what()); \ + } + template struct TypedFuzzEncoding { static constexpr Type::type kType = DType::type_num; @@ -176,9 +191,9 @@ struct TypedFuzzEncoding { // decoder's internal scratch space, which get invalidated on the // following decoder call. We circumvent the issue by executing a // functor on each decoded chunk before moving to the next one. - Status RunOnDecodedChunks(Encoding::type encoding, - std::span encoded_data, int chunk_size, - std::function)> func) { + Status RunOnDecodedChunks( + Encoding::type encoding, std::span encoded_data, int chunk_size, + std::function)> func) { BEGIN_PARQUET_CATCH_EXCEPTIONS int total_values = 0; auto decoder = MakeDecoder(encoding); @@ -191,8 +206,10 @@ struct TypedFuzzEncoding { static_cast(encoded_data.size())); while (total_values < num_values_) { const int read_size = std::min(num_values_ - total_values, chunk_size); - // ARROW_ASSIGN_OR_RAISE(auto chunk_values, DecodeChunk(read_size)); - std::vector chunk_values(read_size); + PoolVector chunk_values(pool()); + BEGIN_CATCH_BAD_ALLOC + chunk_values.resize(read_size); + END_CATCH_BAD_ALLOC int values_read; if constexpr (kType == Type::BOOLEAN) { values_read = @@ -201,8 +218,7 @@ struct TypedFuzzEncoding { values_read = decoder->Decode(chunk_values.data(), read_size); } ARROW_CHECK_LE(values_read, read_size); - chunk_values.resize(values_read); - RETURN_NOT_OK(func(total_values, std::move(chunk_values))); + RETURN_NOT_OK(func(total_values, std::span(chunk_values).first(values_read))); total_values += values_read; if (values_read < chunk_size) { break; @@ -215,14 +231,16 @@ struct TypedFuzzEncoding { return Status::OK(); } - Result> Decode(Encoding::type encoding, - std::span encoded_data, - int chunk_size) { + Result> Decode(Encoding::type encoding, + std::span encoded_data, + int chunk_size) { // Decoded chunk values shouldn't embed pointers to decoder scratch space. static_assert(decoded_values_can_be_persisted()); - std::vector values; - auto accumulate_chunk = [&](int offset, std::vector chunk_values) { + PoolVector values(pool()); + auto accumulate_chunk = [&](int offset, std::span chunk_values) { + BEGIN_CATCH_BAD_ALLOC values.insert(values.end(), chunk_values.begin(), chunk_values.end()); + END_CATCH_BAD_ALLOC return Status::OK(); }; RETURN_NOT_OK( @@ -273,7 +291,7 @@ struct TypedFuzzEncoding { // Re-encode and re-decode using roundtrip encoding { - auto compare_chunk = [&](int offset, std::vector chunk_values) { + auto compare_chunk = [&](int offset, std::span chunk_values) { return CompareChunkAgainstReference(offset, chunk_values); }; auto encoder = MakeEncoder(roundtrip_encoding_); @@ -291,7 +309,8 @@ struct TypedFuzzEncoding { chunk_size, compare_chunk)); } } else { - encoder->Put(reference_values_); + encoder->Put(reference_values_.data(), + static_cast(reference_values_.size())); auto reencoded_buffer = encoder->FlushValues(); auto reencoded_data = reencoded_buffer->template span_as(); // Vary chunk sizes @@ -462,7 +481,7 @@ struct TypedFuzzEncoding { std::shared_ptr reference_array_; // Only for INT96 as there is no strictly equivalent Arrow type - std::vector reference_values_; + PoolVector reference_values_; }; } // namespace diff --git a/testing b/testing index 4080a40f573c..4aeaf00ad3e7 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 4080a40f573c18beca92fc0db68fb322710f5ebb +Subproject commit 4aeaf00ad3e726d37852e5be0d3e1bfb7ddc18f9 From 0d501cd40952d317845387bf6bf0644bf89bd8bc Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 5 May 2026 12:27:57 +0200 Subject: [PATCH 2/2] Fix lint --- cpp/src/arrow/stl_allocator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/stl_allocator.h b/cpp/src/arrow/stl_allocator.h index 71d86c2529c9..bb52767282ec 100644 --- a/cpp/src/arrow/stl_allocator.h +++ b/cpp/src/arrow/stl_allocator.h @@ -60,7 +60,7 @@ class allocator { /// \brief Construct an allocator from the default MemoryPool allocator() noexcept : pool_(default_memory_pool()) {} /// \brief Construct an allocator from the given MemoryPool - allocator(MemoryPool* pool) noexcept : pool_(pool) {} + allocator(MemoryPool* pool) noexcept : pool_(pool) {} // NOLINT: runtime/explicit template allocator(const allocator& rhs) noexcept : pool_(rhs.pool()) {}