apache
diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile
Lines changed: 1 addition & 0 deletions
diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile
Lines changed: 1 addition & 0 deletions
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
Lines changed: 27 additions & 1 deletion
diff --git a/cpp/build-support/fuzzing/generate_corpuses.sh b/cpp/build-support/fuzzing/generate_corpuses.sh
Lines changed: 7 additions & 1 deletion
diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h
Lines changed: 11 additions & 21 deletions
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt
Lines changed: 6 additions & 0 deletions
diff --git a/cpp/src/parquet/arrow/encoding_fuzz.cc b/cpp/src/parquet/arrow/encoding_fuzz.cc
Lines changed: 27 additions & 0 deletions
@@ -120,6 +120,7 @@ RUN apt-get update -y -q && \
         rsync \
         tzdata \
         uuid-runtime \
+        unzip \
         wget \
         xz-utils && \
     apt-get clean && \
 
@@ -122,6 +122,7 @@ RUN apt-get update -y -q && \
         tzdata \
         tzdata-legacy \
         uuid-runtime \
+        unzip \
         wget && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists*
 
@@ -180,10 +180,36 @@ fi
 
 if [ "${ARROW_FUZZING}" == "ON" ]; then
     # Fuzzing regression tests
+
+    # This will display any errors generated during fuzzing. These errors are
+    # usually not bugs (most fuzz files are invalid and hence generate errors
+    # when trying to read them), which is why they are hidden by default when
+    # fuzzing.
+    export ARROW_FUZZING_VERBOSITY=1
     # Some fuzz regression files may trigger huge memory allocations,
     # let the allocator return null instead of aborting.
     export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1"
-    export ARROW_FUZZING_VERBOSITY=1
+
+    # 1. Generate seed corpuses
+    "${source_dir}/build-support/fuzzing/generate_corpuses.sh" "${binary_output_dir}"
+
+    # 2. Run fuzz targets on seed corpus entries
+    function run_fuzz_target_on_seed_corpus() {  # $1: fuzz target executable name
+      local fuzz_target_basename="$1"  # local: do not leak into the calling script
+      local corpus_dir="${binary_output_dir}/${fuzz_target_basename}_seed_corpus"
+      rm -rf "${corpus_dir}"  # start clean: also drops stale subdirectories and dotfiles
+      mkdir -p "${corpus_dir}"
+      unzip "${binary_output_dir}"/"${fuzz_target_basename}"_seed_corpus.zip -d "${corpus_dir}"
+      "${binary_output_dir}"/"${fuzz_target_basename}" -rss_limit_mb=4000 "${corpus_dir}"/*
+    }
+    run_fuzz_target_on_seed_corpus arrow-csv-fuzz
+    run_fuzz_target_on_seed_corpus arrow-ipc-file-fuzz
+    run_fuzz_target_on_seed_corpus arrow-ipc-stream-fuzz
+    run_fuzz_target_on_seed_corpus arrow-ipc-tensor-stream-fuzz
+    run_fuzz_target_on_seed_corpus parquet-arrow-fuzz
+    run_fuzz_target_on_seed_corpus parquet-encoding-fuzz
+
+    # 3. Run fuzz targets on regression files from arrow-testing
     # Run golden IPC integration files: these should ideally load without errors,
     # though some very old ones carry invalid data (such as decimal values
     # larger than their advertised precision).
 
@@ -56,7 +56,7 @@ rm -rf ${CORPUS_DIR}
 ${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip
 
-# Parquet
+# Parquet file-level fuzzer
 
 rm -rf ${CORPUS_DIR}
 ${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
@@ -65,6 +65,12 @@ cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
 cp ${ARROW_CPP}/submodules/parquet-testing/bad_data/*.parquet ${CORPUS_DIR}
 ${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-arrow-fuzz_seed_corpus.zip
 
+# Parquet encoding fuzzer
+
+rm -rf ${CORPUS_DIR}
+${OUT}/parquet-generate-encoding-fuzz-corpus ${CORPUS_DIR}
+${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/parquet-encoding-fuzz_seed_corpus.zip
+
 # CSV
 
 rm -rf ${PANDAS_DIR}
 
@@ -183,28 +183,18 @@
 #endif
 
 // ----------------------------------------------------------------------
+// Macros to enforce struct member packing
 
-// macros to disable padding
-// these macros are portable across different compilers and platforms
-//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
-#if !defined(MANUALLY_ALIGNED_STRUCT)
-#  if defined(_MSC_VER)
-#    define MANUALLY_ALIGNED_STRUCT(alignment) \
-      __pragma(pack(1));                       \
-      struct __declspec(align(alignment))
-#    define STRUCT_END(name, size) \
-      __pragma(pack());            \
-      static_assert(sizeof(name) == size, "compiler breaks packing rules")
-#  elif defined(__GNUC__) || defined(__clang__)
-#    define MANUALLY_ALIGNED_STRUCT(alignment) \
-      _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
-#    define STRUCT_END(name, size)                          \
-      _Pragma("pack()") static_assert(sizeof(name) == size, \
-                                      "compiler breaks packing rules")
-#  else
-#    error Unknown compiler, please define structure alignment macros
-#  endif
-#endif  // !defined(MANUALLY_ALIGNED_STRUCT)
+#if defined(__GNUC__)  // compilers defining __GNUC__: pack via the [[gnu::packed]] attribute
+#  define ARROW_PACKED_START(KEYWORD, ...) KEYWORD [[gnu::packed]] __VA_ARGS__
+#  define ARROW_PACKED_END
+#elif defined(_MSC_VER)  // MSVC: bracket the declaration with pack(push, 1) / pack(pop)
+#  define ARROW_PACKED_START(KEYWORD, ...) _Pragma("pack(push, 1)") KEYWORD __VA_ARGS__
+#  define ARROW_PACKED_END _Pragma("pack(pop)")
+#else  // any other compiler: plain declaration, no packing is requested
+#  define ARROW_PACKED_START(KEYWORD, ...) KEYWORD __VA_ARGS__
+#  define ARROW_PACKED_END
+#endif
 
 // ----------------------------------------------------------------------
 // Convenience macro disabling a particular UBSan check in a function
 
@@ -151,6 +151,7 @@ endif()
 # Library config
 
 set(PARQUET_SRCS
+    arrow/fuzz_encoding_internal.cc
     arrow/fuzz_internal.cc
     arrow/path_internal.cc
     arrow/reader.cc
 
@@ -19,13 +19,19 @@ arrow_install_all_headers("parquet/arrow")
 
 if(ARROW_BUILD_FUZZING_UTILITIES)
   add_executable(parquet-arrow-generate-fuzz-corpus generate_fuzz_corpus.cc)
+  add_executable(parquet-generate-encoding-fuzz-corpus generate_encoding_fuzz_corpus.cc) # seed corpus generator
   if(ARROW_BUILD_STATIC)
     target_link_libraries(parquet-arrow-generate-fuzz-corpus parquet_static
                           arrow_testing_static)
+    target_link_libraries(parquet-generate-encoding-fuzz-corpus parquet_static
+                          arrow_testing_static)
   else()
     target_link_libraries(parquet-arrow-generate-fuzz-corpus parquet_shared
                           arrow_testing_shared)
+    target_link_libraries(parquet-generate-encoding-fuzz-corpus parquet_shared
+                          arrow_testing_shared)
   endif()
 endif()
 
 add_parquet_fuzz_target(fuzz PREFIX "parquet-arrow")
+add_parquet_fuzz_target(encoding_fuzz PREFIX "parquet")
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/status.h"
+#include "arrow/util/fuzz_internal.h"
+#include "parquet/arrow/fuzz_encoding_internal.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  const auto length = static_cast<int64_t>(size);  // both calls below take the size as int64_t
+  auto status = parquet::fuzzing::internal::FuzzEncoding(data, length);
+  arrow::internal::LogFuzzStatus(status, data, length);
+  return 0;  // the status is only logged; 0 is returned unconditionally
+}