From 0da1a60e4ac2a0811275fa5e0f2054fbdab8953e Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Fri, 20 Feb 2026 15:51:47 -0300 Subject: [PATCH 01/11] refactor: use optional for null_count --- cpp/src/arrow/dataset/file_parquet_test.cc | 6 +- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/metadata.cc | 15 +++-- cpp/src/parquet/page_index.cc | 4 +- cpp/src/parquet/page_index_test.cc | 6 +- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/statistics.cc | 77 +++++++++++----------- cpp/src/parquet/statistics.h | 35 ++++------ cpp/src/parquet/statistics_test.cc | 31 ++++----- cpp/src/parquet/thrift_internal.h | 4 +- 10 files changed, 88 insertions(+), 94 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 696bda19359..adec960f28f 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -941,9 +941,8 @@ TEST(TestParquetStatistics, NoNullCount) { ::parquet::EncodedStatistics encoded_stats; encoded_stats.set_min(int32_to_parquet_stats(1)); encoded_stats.set_max(int32_to_parquet_stats(100)); - encoded_stats.has_null_count = false; encoded_stats.all_null_value = false; - encoded_stats.null_count = 0; + encoded_stats.null_count.reset(); auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10); auto stat_expression = @@ -956,7 +955,6 @@ TEST(TestParquetStatistics, NoNullCount) { // Special case: when num_value is 0, it would return // "is_null". ::parquet::EncodedStatistics encoded_stats; - encoded_stats.has_null_count = true; encoded_stats.null_count = 1; encoded_stats.all_null_value = true; auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); @@ -965,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) { ASSERT_TRUE(stat_expression.has_value()); EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); - encoded_stats.has_null_count = false; + encoded_stats.null_count.reset(); encoded_stats.all_null_value = false; stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 797d435e73e..61aca8c8e58 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size, // page_stats.null_count is not set when page_statistics_ is nullptr. It is only used // here for safety check. - DCHECK(!page_stats.has_null_count || page_stats.null_count == null_count); + DCHECK(!page_stats.null_count.has_value() || page_stats.null_count == null_count); // Write the page to OutputStream eagerly if there is no dictionary or // if dictionary encoding has fallen back to PLAIN diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 505ace275b1..bb9aba37c12 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -101,23 +101,26 @@ static std::shared_ptr MakeTypedColumnStats( metadata.statistics.__isset.is_max_value_exact ? std::optional(metadata.statistics.is_max_value_exact) : std::nullopt; + std::optional null_count = + metadata.statistics.__isset.null_count + ? std::optional(metadata.statistics.null_count) + : std::nullopt; // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { return MakeStatistics( descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.num_values - (null_count.value_or(0)), + null_count, metadata.statistics.distinct_count, metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); } // Default behavior return MakeStatistics( descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.num_values - (null_count.value_or(0)), + null_count, metadata.statistics.distinct_count, metadata.statistics.__isset.max && metadata.statistics.__isset.min, - metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count, + metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); } diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index c06fc77dc53..17d7d8bebc4 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -524,8 +524,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { return; } - if (column_index_.__isset.null_counts && stats.has_null_count) { - column_index_.null_counts.emplace_back(stats.null_count); + if (column_index_.__isset.null_counts && stats.null_count) { + column_index_.null_counts.emplace_back(*stats.null_count); } else { column_index_.__isset.null_counts = false; column_index_.null_counts.clear(); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 3a7308c1c6b..9b3386055a4 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -815,9 +815,9 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]); ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); - ASSERT_EQ(stats.has_null_count, column_index->has_null_counts()); - if (stats.has_null_count) { - ASSERT_EQ(stats.null_count, column_index->null_counts()[0]); + ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); + if (stats.null_count) { + ASSERT_EQ(*stats.null_count, column_index->null_counts()[0]); } } diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index dfce57a00fc..5f2f31b25ab 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -174,7 +174,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte stats->is_min_value_exact.has_value() ? (stats->is_min_value_exact.value() ? "true" : "false") : "unknown"; - stream << ", Null Values: " << stats->null_count + stream << ", Null Values: " << stats->null_count.value_or(0) << ", Distinct Values: " << stats->distinct_count << std::endl << " Max (exact: " << max_exact << "): " << FormatStatValue(descr->physical_type(), max, descr->logical_type()) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 2e5f6fe37c4..a3651b2d57d 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -604,26 +604,27 @@ class TypedStatisticsImpl : public TypedStatistics { // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, bool has_min_max, - bool has_null_count, bool has_distinct_count, MemoryPool* pool) + std::optional null_count, int64_t distinct_count, + bool has_min_max, bool has_distinct_count, + MemoryPool* pool) : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, has_null_count, + distinct_count, has_min_max, has_distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool) {} TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, bool has_min_max, - bool has_null_count, bool has_distinct_count, + std::optional null_count, int64_t distinct_count, + bool has_min_max, bool has_distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::IncrementNumValues(num_values); - if (has_null_count) { - TypedStatisticsImpl::IncrementNullCount(null_count); + if (null_count) { + TypedStatisticsImpl::IncrementNullCount(*null_count); } else { - has_null_count_ = false; + statistics_.null_count = std::nullopt; } if (has_distinct_count) { SetDistinctCount(distinct_count); @@ -643,11 +644,10 @@ class TypedStatisticsImpl : public TypedStatistics { bool HasDistinctCount() const override { return has_distinct_count_; }; bool HasMinMax() const override { return has_min_max_; } - bool HasNullCount() const override { return has_null_count_; }; + bool HasNullCount() const override { return statistics_.null_count.has_value(); }; void IncrementNullCount(int64_t n) override { - statistics_.null_count += n; - has_null_count_ = true; + statistics_.null_count = statistics_.null_count.value_or(0) + n; } void IncrementNumValues(int64_t n) override { num_values_ += n; } @@ -700,12 +700,12 @@ class TypedStatisticsImpl : public TypedStatistics { void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); - // null_count is always valid when merging page statistics into - // column chunk statistics. - if (other.HasNullCount()) { - this->statistics_.null_count += other.null_count(); + // null_count is valid only if both sides have it. + if (this->HasNullCount() && other.HasNullCount()) { + this->statistics_.null_count = + *this->statistics_.null_count + other.null_count(); } else { - this->has_null_count_ = false; + this->statistics_.null_count = std::nullopt; } if (has_distinct_count_ && other.HasDistinctCount() && (distinct_count() == 0 || other.distinct_count() == 0)) { @@ -782,7 +782,7 @@ class TypedStatisticsImpl : public TypedStatistics { return s; } - int64_t null_count() const override { return statistics_.null_count; } + int64_t null_count() const override { return statistics_.null_count.value_or(0); } int64_t distinct_count() const override { return statistics_.distinct_count; } int64_t num_values() const override { return num_values_; } std::optional is_min_value_exact() const override { @@ -795,14 +795,13 @@ class TypedStatisticsImpl : public TypedStatistics { private: const ColumnDescriptor* descr_; bool has_min_max_ = false; - bool has_null_count_ = false; bool has_distinct_count_ = false; T min_; T max_; ::arrow::MemoryPool* pool_; // Number of non-null values. - // Please note that num_values_ is reliable when has_null_count_ is set. - // When has_null_count_ is not set, e.g. a page statistics created from + // Please note that num_values_ is reliable when null_count is set. + // When null_count is not set, e.g. a page statistics created from // a statistics thrift message which doesn't have the optional null_count, // `num_values_` may include null values. int64_t num_values_ = 0; @@ -836,7 +835,7 @@ class TypedStatisticsImpl : public TypedStatistics { // disabled by default. this->has_distinct_count_ = false; // Null count calculation is cheap and enabled by default. - this->has_null_count_ = true; + this->statistics_.null_count = this->statistics_.null_count.value_or(0); } void SetMinMaxPair(std::pair min_max) { @@ -1079,35 +1078,35 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, encoded_stats->has_min && encoded_stats->has_max, - encoded_stats->has_null_count, encoded_stats->has_distinct_count, + encoded_stats->has_distinct_count, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } -std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, - const std::string& encoded_min, - const std::string& encoded_max, - int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - bool has_null_count, bool has_distinct_count, - ::arrow::MemoryPool* pool) { - return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, has_null_count, has_distinct_count, +std::shared_ptr Statistics::Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, + bool has_distinct_count, ::arrow::MemoryPool* pool) { + return Statistics::Make(descr, encoded_min, encoded_max, num_values, + null_count, distinct_count, + has_min_max, has_distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool); } std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, bool has_null_count, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, bool has_distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { -#define MAKE_STATS(CAP_TYPE, KLASS) \ - case Type::CAP_TYPE: \ - return std::make_shared>( \ - descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \ - has_min_max, has_null_count, has_distinct_count, is_min_value_exact, \ - is_max_value_exact, pool) +#define MAKE_STATS(CAP_TYPE, KLASS) \ + case Type::CAP_TYPE: \ + return std::make_shared>( \ + descr, encoded_min, encoded_max, num_values, \ + null_count, distinct_count, has_min_max, \ + has_distinct_count, is_min_value_exact, is_max_value_exact, \ + pool) switch (descr->physical_type()) { MAKE_STATS(BOOLEAN, BooleanType); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index c80fb8e3b52..669fa353dad 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -131,12 +131,11 @@ class PARQUET_EXPORT EncodedStatistics { std::optional is_max_value_exact; std::optional is_min_value_exact; - int64_t null_count = 0; + std::optional null_count; int64_t distinct_count = 0; bool has_min = false; bool has_max = false; - bool has_null_count = false; bool has_distinct_count = false; // When all values in the statistics are null, it is set to true. @@ -171,9 +170,7 @@ class PARQUET_EXPORT EncodedStatistics { min_.clear(); } - bool is_set() const { - return has_min || has_max || has_null_count || has_distinct_count; - } + bool is_set() const { return has_min || has_max || null_count || has_distinct_count; } bool is_signed() const { return is_signed_; } @@ -193,7 +190,6 @@ class PARQUET_EXPORT EncodedStatistics { EncodedStatistics& set_null_count(int64_t value) { null_count = value; - has_null_count = true; return *this; } @@ -223,16 +219,15 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_min the encoded minimum value /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values - /// \param[in] null_count number of null values + /// \param[in] null_count number of null values (std::nullopt if not set) /// \param[in] distinct_count number of distinct values /// \param[in] has_min_max whether the min/max statistics are set - /// \param[in] has_null_count whether the null_count statistics are set /// \param[in] has_distinct_count whether the distinct_count statistics are set /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, bool has_null_count, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -242,18 +237,17 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_min the encoded minimum value /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values - /// \param[in] null_count number of null values + /// \param[in] null_count number of null values (std::nullopt if not set) /// \param[in] distinct_count number of distinct values /// \param[in] has_min_max whether the min/max statistics are set - /// \param[in] has_null_count whether the null_count statistics are set /// \param[in] has_distinct_count whether the distinct_count statistics are set /// \param[in] is_min_value_exact whether the min value is exact /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, bool has_null_count, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, bool has_distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -414,12 +408,12 @@ std::shared_ptr> MakeStatistics(const typename DType::c_t template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, bool has_null_count, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>(Statistics::Make( descr, encoded_min, encoded_max, num_values, null_count, distinct_count, - has_min_max, has_null_count, has_distinct_count, + has_min_max, has_distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); } @@ -427,15 +421,14 @@ std::shared_ptr> MakeStatistics( template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, bool has_null_count, + const std::string& encoded_max, int64_t num_values, std::optional null_count, + int64_t distinct_count, bool has_min_max, bool has_distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>( Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, has_null_count, has_distinct_count, + distinct_count, has_min_max, has_distinct_count, is_min_value_exact, is_max_value_exact, pool)); } - } // namespace parquet diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..c79029f979c 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -323,7 +323,7 @@ class TestStatistics : public PrimitiveTypedTest { auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true, + /*has_min_max=*/true, /*has_distinct_count=*/true, /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); auto statistics3 = MakeStatistics(this->schema_.Column(0)); @@ -338,7 +338,7 @@ class TestStatistics : public PrimitiveTypedTest { auto statistics4 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_null_count=*/true, /*has_distinct_count=*/true); + /*has_min_max=*/true, /*has_distinct_count=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); ASSERT_EQ(statistics1->min(), statistics2->min()); @@ -572,7 +572,7 @@ void TestStatistics::TestMinMaxEncode() { auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, - /*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true, + /*distinct_count=*/0, /*has_min_max=*/true, /*has_distinct_count=*/true, /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); @@ -758,7 +758,7 @@ class TestStatisticsHasFlag : public TestStatistics { auto encoded_stats1 = statistics1->Encode(); EXPECT_TRUE(statistics1->HasNullCount()); EXPECT_EQ(0, statistics1->null_count()); - EXPECT_TRUE(statistics1->Encode().has_null_count); + EXPECT_TRUE(statistics1->Encode().null_count.has_value()); } // Merge with null-count should also have null count VerifyMergedStatistics(*statistics1, *statistics1, @@ -766,7 +766,7 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_TRUE(merged_statistics->HasNullCount()); EXPECT_EQ(0, merged_statistics->null_count()); auto encoded = merged_statistics->Encode(); - EXPECT_TRUE(encoded.has_null_count); + EXPECT_TRUE(encoded.null_count.has_value()); EXPECT_EQ(0, encoded.null_count); }); @@ -774,34 +774,35 @@ class TestStatisticsHasFlag : public TestStatistics { std::shared_ptr> statistics2; { EncodedStatistics encoded_statistics2; - encoded_statistics2.has_null_count = false; + encoded_statistics2.null_count.reset(); statistics2 = std::dynamic_pointer_cast>( Statistics::Make(this->schema_.Column(0), &encoded_statistics2, /*num_values=*/1000)); - EXPECT_FALSE(statistics2->Encode().has_null_count); + EXPECT_FALSE(statistics2->Encode().null_count.has_value()); EXPECT_FALSE(statistics2->HasNullCount()); } // Merge without null-count should not have null count - VerifyMergedStatistics(*statistics1, *statistics2, - [](TypedStatistics* merged_statistics) { - EXPECT_FALSE(merged_statistics->HasNullCount()); - EXPECT_FALSE(merged_statistics->Encode().has_null_count); - }); + // BUG: Failing here + VerifyMergedStatistics( + *statistics1, *statistics2, [](TypedStatistics* merged_statistics) { + EXPECT_FALSE(merged_statistics->HasNullCount()); + EXPECT_FALSE(merged_statistics->Encode().null_count.has_value()); + }); } // statistics.all_null_value is used to build the page index. // If statistics doesn't have null count, all_null_value should be false. void TestMissingNullCount() { EncodedStatistics encoded_statistics; - encoded_statistics.has_null_count = false; + encoded_statistics.null_count.reset(); auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics, /*num_values=*/1000); auto typed_stats = std::dynamic_pointer_cast>(statistics); EXPECT_FALSE(typed_stats->HasNullCount()); auto encoded = typed_stats->Encode(); EXPECT_FALSE(encoded.all_null_value); - EXPECT_FALSE(encoded.has_null_count); + EXPECT_FALSE(encoded.null_count.has_value()); EXPECT_FALSE(encoded.has_distinct_count); EXPECT_FALSE(encoded.has_min); EXPECT_FALSE(encoded.has_max); @@ -1653,7 +1654,7 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_TRUE(column_chunk->is_stats_set()); std::shared_ptr enc_stats = column_chunk->encoded_statistics(); - ASSERT_TRUE(enc_stats->has_null_count); + ASSERT_TRUE(enc_stats->null_count.has_value()); ASSERT_FALSE(enc_stats->has_distinct_count); ASSERT_FALSE(enc_stats->has_min); ASSERT_FALSE(enc_stats->has_max); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 1ffe99eb3c9..ad1e24c92d4 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -491,8 +491,8 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { statistics.__set_max(stats.max()); } } - if (stats.has_null_count) { - statistics.__set_null_count(stats.null_count); + if (stats.null_count) { + statistics.__set_null_count(*stats.null_count); } if (stats.has_distinct_count) { statistics.__set_distinct_count(stats.distinct_count); From cb982bf159db8470d132c68fe6f61637a3bbfcae Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Sat, 21 Feb 2026 11:39:35 -0300 Subject: [PATCH 02/11] fix: remove has_distinct_count parameter in favor of std::optional --- cpp/src/parquet/metadata.cc | 17 ++++---- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/statistics.cc | 67 ++++++++++++++---------------- cpp/src/parquet/statistics.h | 43 ++++++++----------- cpp/src/parquet/statistics_test.cc | 21 +++++----- cpp/src/parquet/thrift_internal.h | 4 +- 6 files changed, 71 insertions(+), 83 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index bb9aba37c12..cd186ec8ba7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -105,23 +105,24 @@ static std::shared_ptr MakeTypedColumnStats( metadata.statistics.__isset.null_count ? std::optional(metadata.statistics.null_count) : std::nullopt; + std::optional distinct_count = + metadata.statistics.__isset.distinct_count + ? std::optional(metadata.statistics.distinct_count) + : std::nullopt; // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { return MakeStatistics( descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - (null_count.value_or(0)), - null_count, metadata.statistics.distinct_count, + metadata.num_values - null_count.value_or(0), null_count, distinct_count, metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); + min_exact, max_exact, pool); } // Default behavior return MakeStatistics( descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - (null_count.value_or(0)), - null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max && metadata.statistics.__isset.min, - metadata.statistics.__isset.distinct_count, - min_exact, max_exact, pool); + metadata.num_values - null_count.value_or(0), null_count, distinct_count, + metadata.statistics.__isset.max && metadata.statistics.__isset.min, min_exact, + max_exact, pool); } namespace { diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 5f2f31b25ab..ab79926a315 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -175,7 +175,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte ? (stats->is_min_value_exact.value() ? "true" : "false") : "unknown"; stream << ", Null Values: " << stats->null_count.value_or(0) - << ", Distinct Values: " << stats->distinct_count << std::endl + << ", Distinct Values: " << stats->distinct_count.value_or(0) << std::endl << " Max (exact: " << max_exact << "): " << FormatStatValue(descr->physical_type(), max, descr->logical_type()) << ", Min (exact: " << min_exact << "): " diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index a3651b2d57d..6414b474625 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -604,19 +604,18 @@ class TypedStatisticsImpl : public TypedStatistics { // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - std::optional null_count, int64_t distinct_count, - bool has_min_max, bool has_distinct_count, + std::optional null_count, + std::optional distinct_count, bool has_min_max, MemoryPool* pool) : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count, distinct_count, has_min_max, - has_distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool) {} TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - std::optional null_count, int64_t distinct_count, - bool has_min_max, bool has_distinct_count, + std::optional null_count, + std::optional distinct_count, bool has_min_max, std::optional is_min_value_exact, std::optional is_max_value_exact, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { @@ -626,10 +625,10 @@ class TypedStatisticsImpl : public TypedStatistics { } else { statistics_.null_count = std::nullopt; } - if (has_distinct_count) { - SetDistinctCount(distinct_count); + if (distinct_count) { + SetDistinctCount(*distinct_count); } else { - has_distinct_count_ = false; + statistics_.distinct_count = std::nullopt; } if (has_min_max) { @@ -642,7 +641,9 @@ class TypedStatisticsImpl : public TypedStatistics { has_min_max_ = has_min_max; } - bool HasDistinctCount() const override { return has_distinct_count_; }; + bool HasDistinctCount() const override { + return statistics_.distinct_count.has_value(); + }; bool HasMinMax() const override { return has_min_max_; } bool HasNullCount() const override { return statistics_.null_count.has_value(); }; @@ -702,19 +703,18 @@ class TypedStatisticsImpl : public TypedStatistics { this->num_values_ += other.num_values(); // null_count is valid only if both sides have it. if (this->HasNullCount() && other.HasNullCount()) { - this->statistics_.null_count = - *this->statistics_.null_count + other.null_count(); + this->statistics_.null_count = *this->statistics_.null_count + other.null_count(); } else { this->statistics_.null_count = std::nullopt; } - if (has_distinct_count_ && other.HasDistinctCount() && + if (this->HasDistinctCount() && other.HasDistinctCount() && (distinct_count() == 0 || other.distinct_count() == 0)) { // We can merge distinct counts if either side is zero. statistics_.distinct_count = - std::max(statistics_.distinct_count, other.distinct_count()); + std::max(*statistics_.distinct_count, other.distinct_count()); } else { - // Otherwise clear has_distinct_count_ as distinct count cannot be merged. - this->has_distinct_count_ = false; + // Otherwise clear distinct_count as distinct count cannot be merged. + this->statistics_.distinct_count = std::nullopt; } // Do not clear min/max here if the other side does not provide // min/max which may happen when other is an empty stats or all @@ -783,7 +783,9 @@ class TypedStatisticsImpl : public TypedStatistics { } int64_t null_count() const override { return statistics_.null_count.value_or(0); } - int64_t distinct_count() const override { return statistics_.distinct_count; } + int64_t distinct_count() const override { + return statistics_.distinct_count.value_or(0); + } int64_t num_values() const override { return num_values_; } std::optional is_min_value_exact() const override { return statistics_.is_min_value_exact; @@ -795,7 +797,6 @@ class TypedStatisticsImpl : public TypedStatistics { private: const ColumnDescriptor* descr_; bool has_min_max_ = false; - bool has_distinct_count_ = false; T min_; T max_; ::arrow::MemoryPool* pool_; @@ -818,7 +819,6 @@ class TypedStatisticsImpl : public TypedStatistics { void SetDistinctCount(int64_t n) { // distinct count can only be "set", and cannot be incremented. statistics_.distinct_count = n; - has_distinct_count_ = true; } void ResetCounts() { @@ -830,10 +830,10 @@ class TypedStatisticsImpl : public TypedStatistics { void ResetHasFlags() { // has_min_max_ will only be set when it meets any valid value. this->has_min_max_ = false; - // has_distinct_count_ will only be set once SetDistinctCount() + // distinct_count will only be set once SetDistinctCount() // is called because distinct count calculation is not cheap and // disabled by default. - this->has_distinct_count_ = false; + this->statistics_.distinct_count = std::nullopt; // Null count calculation is cheap and enabled by default. this->statistics_.null_count = this->statistics_.null_count.value_or(0); } @@ -1078,18 +1078,15 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, encoded_stats->has_min && encoded_stats->has_max, - encoded_stats->has_distinct_count, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, ::arrow::MemoryPool* pool) { - return Statistics::Make(descr, encoded_min, encoded_max, num_values, - null_count, distinct_count, - has_min_max, has_distinct_count, + std::optional distinct_count, bool has_min_max, ::arrow::MemoryPool* pool) { + return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool); } @@ -1097,16 +1094,14 @@ std::shared_ptr Statistics::Make( std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, std::optional is_min_value_exact, - std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { -#define MAKE_STATS(CAP_TYPE, KLASS) \ - case Type::CAP_TYPE: \ - return std::make_shared>( \ - descr, encoded_min, encoded_max, num_values, \ - null_count, distinct_count, has_min_max, \ - has_distinct_count, is_min_value_exact, is_max_value_exact, \ - pool) + std::optional distinct_count, bool has_min_max, + std::optional is_min_value_exact, std::optional is_max_value_exact, + ::arrow::MemoryPool* pool) { +#define MAKE_STATS(CAP_TYPE, KLASS) \ + case Type::CAP_TYPE: \ + return std::make_shared>( \ + descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \ + has_min_max, is_min_value_exact, is_max_value_exact, pool) switch (descr->physical_type()) { MAKE_STATS(BOOLEAN, BooleanType); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 669fa353dad..d4cb6c6c4c3 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -132,11 +132,10 @@ class PARQUET_EXPORT EncodedStatistics { std::optional is_min_value_exact; std::optional null_count; - int64_t distinct_count = 0; + std::optional distinct_count; bool has_min = false; bool has_max = false; - bool has_distinct_count = false; // When all values in the statistics are null, it is set to true. // Otherwise, at least one value is not null, or we are not sure at all. @@ -170,7 +169,7 @@ class PARQUET_EXPORT EncodedStatistics { min_.clear(); } - bool is_set() const { return has_min || has_max || null_count || has_distinct_count; } + bool is_set() const { return has_min || has_max || null_count || distinct_count; } bool is_signed() const { return is_signed_; } @@ -195,7 +194,6 @@ class PARQUET_EXPORT EncodedStatistics { EncodedStatistics& set_distinct_count(int64_t value) { distinct_count = value; - has_distinct_count = true; return *this; } }; @@ -220,16 +218,14 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values /// \param[in] null_count number of null values (std::nullopt if not set) - /// \param[in] distinct_count number of distinct values + /// \param[in] distinct_count number of distinct values (std::nullopt if not set) /// \param[in] has_min_max whether the min/max statistics are set - /// \param[in] has_distinct_count whether the distinct_count statistics are set /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + const std::string& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + bool has_min_max, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); /// \brief Create a new statistics instance given a column schema /// definition and preexisting state @@ -238,17 +234,16 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values /// \param[in] null_count number of null values (std::nullopt if not set) - /// \param[in] distinct_count number of distinct values + /// \param[in] distinct_count number of distinct values (std::nullopt if not set) /// \param[in] has_min_max whether the min/max statistics are set - /// \param[in] has_distinct_count whether the distinct_count statistics are set /// \param[in] is_min_value_exact whether the min value is exact /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, std::optional is_min_value_exact, + const std::string& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + bool has_min_max, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -409,11 +404,11 @@ template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + std::optional distinct_count, bool has_min_max, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>(Statistics::Make( descr, encoded_min, encoded_max, num_values, null_count, distinct_count, - has_min_max, has_distinct_count, + has_min_max, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); } @@ -422,13 +417,11 @@ template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, - int64_t distinct_count, bool has_min_max, - bool has_distinct_count, std::optional is_min_value_exact, - std::optional is_max_value_exact, + std::optional distinct_count, bool has_min_max, + std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, has_distinct_count, - is_min_value_exact, is_max_value_exact, pool)); + return std::static_pointer_cast>(Statistics::Make( + descr, encoded_min, encoded_max, num_values, null_count, distinct_count, + has_min_max, is_min_value_exact, is_max_value_exact, pool)); } } // namespace parquet diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index c79029f979c..b71b82b6dec 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -323,7 +323,7 @@ class TestStatistics : public PrimitiveTypedTest { auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_distinct_count=*/true, + /*has_min_max=*/true, /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); auto statistics3 = MakeStatistics(this->schema_.Column(0)); @@ -335,10 +335,10 @@ class TestStatistics : public PrimitiveTypedTest { std::string encoded_max_spaced = statistics3->EncodeMax(); // Use old API without is_{min/max}_value_exact - auto statistics4 = MakeStatistics( - this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), - /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*has_distinct_count=*/true); + auto statistics4 = MakeStatistics(this->schema_.Column(0), encoded_min, + encoded_max, this->values_.size(), + /*null_count=*/0, /*distinct_count=*/0, + /*has_min_max=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); ASSERT_EQ(statistics1->min(), statistics2->min()); @@ -573,7 +573,7 @@ void TestStatistics::TestMinMaxEncode() { this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, /*has_min_max=*/true, - /*has_distinct_count=*/true, /*is_min_value_exact=*/true, + /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); @@ -639,7 +639,6 @@ class TestStatisticsHasFlag : public TestStatistics { const std::vector>& subsequent) { EncodedStatistics encoded_statistics; if (initial) { - encoded_statistics.has_distinct_count = true; encoded_statistics.distinct_count = *initial; } std::shared_ptr> statistics = @@ -649,7 +648,6 @@ class TestStatisticsHasFlag : public TestStatistics { for (const auto& distinct_count : subsequent) { EncodedStatistics next_encoded_statistics; if (distinct_count) { - next_encoded_statistics.has_distinct_count = true; next_encoded_statistics.distinct_count = *distinct_count; } std::shared_ptr> next_statistics = @@ -659,7 +657,8 @@ class TestStatisticsHasFlag : public TestStatistics { statistics->Merge(*next_statistics); } EncodedStatistics final_statistics = statistics->Encode(); - EXPECT_EQ(statistics->HasDistinctCount(), final_statistics.has_distinct_count); + EXPECT_EQ(statistics->HasDistinctCount(), + final_statistics.distinct_count.has_value()); if (statistics->HasDistinctCount()) { EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count); return statistics->distinct_count(); @@ -803,7 +802,7 @@ class TestStatisticsHasFlag : public TestStatistics { auto encoded = typed_stats->Encode(); EXPECT_FALSE(encoded.all_null_value); EXPECT_FALSE(encoded.null_count.has_value()); - EXPECT_FALSE(encoded.has_distinct_count); + EXPECT_FALSE(encoded.distinct_count.has_value()); EXPECT_FALSE(encoded.has_min); EXPECT_FALSE(encoded.has_max); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); @@ -1655,7 +1654,7 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); ASSERT_TRUE(enc_stats->null_count.has_value()); - ASSERT_FALSE(enc_stats->has_distinct_count); + ASSERT_FALSE(enc_stats->distinct_count.has_value()); ASSERT_FALSE(enc_stats->has_min); ASSERT_FALSE(enc_stats->has_max); ASSERT_EQ(1, enc_stats->null_count); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index ad1e24c92d4..b499550221e 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -494,8 +494,8 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { if (stats.null_count) { statistics.__set_null_count(*stats.null_count); } - if (stats.has_distinct_count) { - statistics.__set_distinct_count(stats.distinct_count); + if (stats.distinct_count) { + statistics.__set_distinct_count(*stats.distinct_count); } return statistics; From ffce9a1ff21279031fa652b9972c316e87b86cf4 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Mon, 23 Feb 2026 21:18:26 -0300 Subject: [PATCH 03/11] fix: remove has_min in favor of std::optional --- .../parquet/arrow/arrow_reader_writer_test.cc | 8 +-- cpp/src/parquet/arrow/reader_internal.cc | 10 +-- cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/file_deserialize_test.cc | 4 +- cpp/src/parquet/metadata.cc | 4 +- cpp/src/parquet/metadata_test.cc | 18 +++--- cpp/src/parquet/page_index.cc | 4 +- cpp/src/parquet/page_index_test.cc | 4 +- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/statistics.cc | 63 ++++++++++++++----- cpp/src/parquet/statistics.h | 31 +++++---- cpp/src/parquet/statistics_test.cc | 39 ++++++------ cpp/src/parquet/thrift_internal.h | 6 +- 13 files changed, 115 insertions(+), 80 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e2384972cf5..c968d0fa773 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4735,10 +4735,10 @@ TEST_P(TestArrowWriteDictionary, Statistics) { auto expect_has_min_max = expected_has_min_max_by_page[case_index][row_group_index][page_index]; - EXPECT_EQ(stats.has_min, expect_has_min_max); + EXPECT_EQ(stats.min.has_value(), expect_has_min_max); EXPECT_EQ(stats.has_max, expect_has_min_max); if (expect_has_min_max) { - EXPECT_EQ(stats.min(), + EXPECT_EQ(stats.min.value(), expected_min_by_page[case_index][row_group_index][page_index]); EXPECT_EQ(stats.max(), expected_max_by_page[case_index][row_group_index][page_index]); @@ -4839,7 +4839,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { ASSERT_EQ(3, data_page->num_values()); const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); - EXPECT_EQ(rg0_min_values[i], stats.min()); + EXPECT_EQ(rg0_min_values[i], stats.min.value()); EXPECT_EQ(rg0_max_values[i], stats.max()); } ASSERT_EQ(rg0_page_reader->NextPage(), nullptr); @@ -4853,7 +4853,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { ASSERT_EQ(3, data_page->num_values()); const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); - EXPECT_EQ("b", stats.min()); + EXPECT_EQ("b", stats.min.value()); EXPECT_EQ("c", stats.max()); } ASSERT_EQ(rg1_page_reader->NextPage(), nullptr); diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 12f36fe39cf..2720e0c03d4 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -169,7 +169,7 @@ template Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - *min = ::arrow::MakeScalar(static_cast(statistics.min())); + *min = ::arrow::MakeScalar(static_cast(statistics.min().value())); *max = ::arrow::MakeScalar(static_cast(statistics.max())); return Status::OK(); } @@ -179,7 +179,7 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics, std::shared_ptr type, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min())); + ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min().value())); ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max())); return Status::OK(); } @@ -227,7 +227,7 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), + return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max(), logical_type, min, max); default: break; @@ -252,7 +252,7 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), + return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max(), logical_type, min, max); default: break; @@ -389,7 +389,7 @@ void AttachStatistics(::arrow::ArrayData* data, if (statistics->HasMinMax()) { const auto* typed_statistics = checked_cast*>(statistics.get()); - const ArrowCType min = typed_statistics->min(); + const ArrowCType min = typed_statistics->min().value(); const ArrowCType max = typed_statistics->max(); if constexpr (std::is_same_v) { array_statistics->min = static_cast(min); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 157e73ffec4..d02cbf0e97a 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { auto metadata_accessor = ColumnChunkMetaData::Make( metadata_->contents(), this->descr_, default_reader_properties(), &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); - return {encoded_stats.has_min, encoded_stats.has_max}; + return {encoded_stats.min.has_value(), encoded_stats.has_max}; } std::vector metadata_encodings() { diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 7fa5e2f167e..25ee2b0c785 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -72,7 +72,7 @@ static inline void CheckStatistics(const H& expected, const EncodedStatistics& a ASSERT_EQ(expected.statistics.max, actual.max()); } if (expected.statistics.__isset.min) { - ASSERT_EQ(expected.statistics.min, actual.min()); + ASSERT_EQ(expected.statistics.min, actual.min); } if (expected.statistics.__isset.null_count) { ASSERT_EQ(expected.statistics.null_count, actual.null_count); @@ -514,7 +514,7 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) { auto data_page = static_cast(current_page.get()); const EncodedStatistics encoded_statistics = data_page->statistics(); ASSERT_EQ(read_stats[i].max(), encoded_statistics.max()); - ASSERT_EQ(read_stats[i].min(), encoded_statistics.min()); + ASSERT_EQ(read_stats[i].min, encoded_statistics.min); ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count); ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count); ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index cd186ec8ba7..54efe2d2fd5 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1614,8 +1614,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) - bool max_equals_min = statistics.has_min && statistics.has_max - ? statistics.min() == statistics.max() + bool max_equals_min = statistics.min.has_value() && statistics.has_max + ? statistics.min.value() == statistics.max() : false; if (SortOrder::SIGNED != sort_order && !max_equals_min) { return false; diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179c..ab21cca8606 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -154,13 +154,14 @@ TEST(Metadata, TestBuildAccess) { auto rg1_column2 = rg1_accessor->ColumnChunk(1); ASSERT_EQ(true, rg1_column1->is_stats_set()); ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.min, + std::make_optional(rg1_column2->statistics()->EncodeMin())); ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min()); + ASSERT_EQ(stats_float.min, rg1_column2->encoded_statistics()->min); ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); + ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); ASSERT_EQ(0, rg1_column1->statistics()->null_count()); ASSERT_EQ(0, rg1_column2->statistics()->null_count()); @@ -205,13 +206,14 @@ TEST(Metadata, TestBuildAccess) { auto rg2_column2 = rg2_accessor->ColumnChunk(1); ASSERT_EQ(true, rg2_column1->is_stats_set()); ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.min, + std::make_optional(rg2_column2->statistics()->EncodeMin())); ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min()); + ASSERT_EQ(stats_float.min, rg2_column2->encoded_statistics()->min); ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); + ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); ASSERT_EQ(0, rg2_column1->statistics()->null_count()); ASSERT_EQ(0, rg2_column2->statistics()->null_count()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 17d7d8bebc4..c421cd31e63 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -511,10 +511,10 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.null_pages.emplace_back(true); column_index_.min_values.emplace_back(""); column_index_.max_values.emplace_back(""); - } else if (stats.has_min && stats.has_max) { + } else if (stats.min.has_value() && stats.has_max) { const size_t page_ordinal = column_index_.null_pages.size(); non_null_page_indices_.emplace_back(page_ordinal); - column_index_.min_values.emplace_back(stats.min()); + column_index_.min_values.emplace_back(stats.min.value()); column_index_.max_values.emplace_back(stats.max()); column_index_.null_pages.emplace_back(false); } else { diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 9b3386055a4..41c8ba5ab45 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -546,7 +546,7 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); - ASSERT_EQ(page_stats[i].min(), column_index->encoded_min_values()[i]); + ASSERT_EQ(page_stats[i].min.value_or(""), column_index->encoded_min_values()[i]); ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]); if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); @@ -813,7 +813,7 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_NE(nullptr, column_index); ASSERT_EQ(size_t{1}, column_index->null_pages().size()); ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); - ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]); + ASSERT_EQ(stats.min, column_index->encoded_min_values()[0]); ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); if (stats.null_count) { diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index ab79926a315..4429e3aeb14 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -165,7 +165,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte } stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { - std::string min = stats->min(), max = stats->max(); + std::string min = stats->min.value_or(""), max = stats->max(); std::string max_exact = stats->is_max_value_exact.has_value() ? (stats->is_max_value_exact.value() ? "true" : "false") diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 6414b474625..03cd10e2e07 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -594,7 +594,8 @@ class TypedStatisticsImpl : public TypedStatistics { TypedStatisticsImpl::IncrementNullCount(null_count); SetDistinctCount(distinct_count); - Copy(min, &min_, min_buffer_.get()); + min_.emplace(); + Copy(min, &min_.value(), min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); has_min_max_ = true; statistics_.is_min_value_exact = true; @@ -602,7 +603,8 @@ class TypedStatisticsImpl : public TypedStatistics { } // Create stats from a thrift Statistics object. - TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, + TypedStatisticsImpl(const ColumnDescriptor* descr, + const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, @@ -612,7 +614,8 @@ class TypedStatisticsImpl : public TypedStatistics { /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool) {} - TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, + TypedStatisticsImpl(const ColumnDescriptor* descr, + const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, @@ -632,8 +635,12 @@ class TypedStatisticsImpl : public TypedStatistics { } if (has_min_max) { - PlainDecode(encoded_min, &min_); - PlainDecode(encoded_max, &max_); + T min_val, max_val; + PlainDecode(encoded_min.value(), &min_val); + PlainDecode(encoded_max, &max_val); + min_.emplace(); + Copy(min_val, &min_.value(), min_buffer_.get()); + Copy(max_val, &max_, max_buffer_.get()); statistics_.is_min_value_exact = is_min_value_exact; statistics_.is_max_value_exact = is_max_value_exact; } @@ -699,6 +706,18 @@ class TypedStatisticsImpl : public TypedStatistics { SetMinMaxPair({arg_min, arg_max}); } + void SetMinMax(std::optional arg_min, const T& arg_max) override { + if (arg_min.has_value()) { + SetMinMaxPair({arg_min.value(), arg_max}); + } else { + // If min is not provided, we only update max if we have it. + // But SetMinMaxPair expects a pair. + // For now, if min is not provided, we might need a different logic or + // just ignore if the intention was to merge stats where min might be missing. + // However, HasMinMax() usually implies both are present in Parquet. + } + } + void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); // null_count is valid only if both sides have it. @@ -743,7 +762,7 @@ class TypedStatisticsImpl : public TypedStatistics { SetMinMaxPair(comparator_->GetMinMax(values)); } - const T& min() const override { return min_; } + std::optional min() const override { return min_; } const T& max() const override { return max_; } @@ -753,7 +772,7 @@ class TypedStatisticsImpl : public TypedStatistics { std::string EncodeMin() const override { std::string s; - if (HasMinMax()) this->PlainEncode(min_, &s); + if (HasMinMax()) this->PlainEncode(min_.value(), &s); return s; } @@ -797,7 +816,7 @@ class TypedStatisticsImpl : public TypedStatistics { private: const ColumnDescriptor* descr_; bool has_min_max_ = false; - T min_; + std::optional min_; T max_; ::arrow::MemoryPool* pool_; // Number of non-null values. @@ -849,10 +868,12 @@ class TypedStatisticsImpl : public TypedStatistics { if (!has_min_max_) { has_min_max_ = true; - Copy(min, &min_, min_buffer_.get()); + min_.emplace(); + Copy(min, &min_.value(), min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); } else { - Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get()); + Copy(comparator_->Compare(min_.value(), min) ? min_.value() : min, &min_.value(), + min_buffer_.get()); Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get()); } statistics_.is_min_value_exact = true; @@ -860,11 +881,15 @@ class TypedStatisticsImpl : public TypedStatistics { } }; +#define MinMaxSetCheck \ + DCHECK(has_min_max_&& min_.has_value() && other.min_.has_value()) \ + << "Min and Max must be set" template <> inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); - return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 && + MinMaxSetCheck; + return std::memcmp(min_.value().ptr, other.min_.value().ptr, len) == 0 && std::memcmp(max_.ptr, other.max_.ptr, len) == 0; } @@ -948,6 +973,14 @@ void TypedStatisticsImpl::PlainEncode(const T& src, dst->assign(reinterpret_cast(src.ptr), src.len); } +template <> +void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { + if (src.size() != static_cast(descr_->type_length())) { + throw ParquetException("Size of encoded FLBA does not match descriptor"); + } + dst->ptr = reinterpret_cast(src.c_str()); +} + template <> void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { @@ -1075,14 +1108,14 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, int64_t num_values, ::arrow::MemoryPool* pool) { DCHECK(encoded_stats != nullptr); - return Make(descr, encoded_stats->min(), encoded_stats->max(), num_values, + return Make(descr, encoded_stats->min, encoded_stats->max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, - encoded_stats->has_min && encoded_stats->has_max, + encoded_stats->min.has_value() && encoded_stats->has_max, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } std::shared_ptr Statistics::Make( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, ::arrow::MemoryPool* pool) { return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, @@ -1092,7 +1125,7 @@ std::shared_ptr Statistics::Make( } std::shared_ptr Statistics::Make( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, std::optional is_min_value_exact, std::optional is_max_value_exact, diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index d4cb6c6c4c3..6ffa764217b 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -119,14 +119,14 @@ std::shared_ptr> MakeComparator(const ColumnDescriptor* d /// \brief Structure represented encoded statistics to be written to /// and read from Parquet serialized metadata. class PARQUET_EXPORT EncodedStatistics { - std::string max_, min_; + std::string max_; bool is_signed_ = false; public: EncodedStatistics() = default; const std::string& max() const { return max_; } - const std::string& min() const { return min_; } + std::optional min; std::optional is_max_value_exact; std::optional is_min_value_exact; @@ -134,7 +134,6 @@ class PARQUET_EXPORT EncodedStatistics { std::optional null_count; std::optional distinct_count; - bool has_min = false; bool has_max = false; // When all values in the statistics are null, it is set to true. @@ -154,9 +153,8 @@ class PARQUET_EXPORT EncodedStatistics { max_.clear(); is_max_value_exact = std::nullopt; } - if (min_.length() > length) { - has_min = false; - min_.clear(); + if (this->min.has_value() && this->min->length() > length) { + this->min = std::nullopt; is_min_value_exact = std::nullopt; } } @@ -165,11 +163,10 @@ class PARQUET_EXPORT EncodedStatistics { void ClearMinMax() { has_max = false; max_.clear(); - has_min = false; - min_.clear(); + this->min = std::nullopt; } - bool is_set() const { return has_min || has_max || null_count || distinct_count; } + bool is_set() const { return this->min || has_max || null_count || distinct_count; } bool is_signed() const { return is_signed_; } @@ -182,8 +179,7 @@ class PARQUET_EXPORT EncodedStatistics { } EncodedStatistics& set_min(std::string value) { - min_ = std::move(value); - has_min = true; + this->min = std::move(value); return *this; } @@ -222,7 +218,7 @@ class PARQUET_EXPORT Statistics { /// \param[in] has_min_max whether the min/max statistics are set /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -240,7 +236,7 @@ class PARQUET_EXPORT Statistics { /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, std::optional is_min_value_exact, @@ -316,7 +312,7 @@ class TypedStatistics : public Statistics { using T = typename DType::c_type; /// \brief The current minimum value - virtual const T& min() const = 0; + virtual std::optional min() const = 0; /// \brief The current maximum value virtual const T& max() const = 0; @@ -356,6 +352,9 @@ class TypedStatistics : public Statistics { /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; + /// \brief Set min and max values to particular values, where min is optional + virtual void SetMinMax(std::optional min, const T& max) = 0; + /// \brief Increments the null count directly /// Use Update to extract the null count from data. Use this if you determine /// the null count through some other means (e.g. dictionary arrays where the @@ -402,7 +401,7 @@ std::shared_ptr> MakeStatistics(const typename DType::c_t /// \brief Typed version of Statistics::Make template std::shared_ptr> MakeStatistics( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { @@ -415,7 +414,7 @@ std::shared_ptr> MakeStatistics( /// \brief Typed version of Statistics::Make template std::shared_ptr> MakeStatistics( - const ColumnDescriptor* descr, const std::string& encoded_min, + const ColumnDescriptor* descr, const std::optional& encoded_min, const std::string& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, bool has_min_max, std::optional is_min_value_exact, std::optional is_max_value_exact, diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index b71b82b6dec..c7f157ded33 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -472,9 +472,9 @@ class TestStatistics : public PrimitiveTypedTest { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); EXPECT_EQ(null_count, enc_stats->null_count); - EXPECT_TRUE(enc_stats->has_min); + EXPECT_TRUE(enc_stats->min.has_value()); EXPECT_TRUE(enc_stats->has_max); - EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min()); + EXPECT_EQ(std::make_optional(expected_stats->EncodeMin()), enc_stats->min); EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max()); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true)); @@ -563,8 +563,8 @@ void TestStatistics::TestMinMaxEncode() { // encoded is same as unencoded ASSERT_EQ(encoded_min, - std::string(reinterpret_cast(statistics1->min().ptr), - statistics1->min().len)); + std::string(reinterpret_cast(statistics1->min().value().ptr), + statistics1->min().value().len)); ASSERT_EQ(encoded_max, std::string(reinterpret_cast(statistics1->max().ptr), statistics1->max().len)); @@ -712,7 +712,7 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/this->values_.size()); auto encoded_stats1 = statistics1->Encode(); EXPECT_FALSE(statistics1->HasMinMax()); - EXPECT_FALSE(encoded_stats1.has_min); + EXPECT_FALSE(encoded_stats1.min.has_value()); EXPECT_FALSE(encoded_stats1.has_max); EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt); EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt); @@ -724,7 +724,7 @@ class TestStatisticsHasFlag : public TestStatistics { statistics2->Update(this->values_ptr_, this->values_.size(), 0); auto encoded_stats2 = statistics2->Encode(); EXPECT_TRUE(statistics2->HasMinMax()); - EXPECT_TRUE(encoded_stats2.has_min); + EXPECT_TRUE(encoded_stats2.min.has_value()); EXPECT_TRUE(encoded_stats2.has_max); EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true)); @@ -732,7 +732,7 @@ class TestStatisticsHasFlag : public TestStatistics { VerifyMergedStatistics(*statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasMinMax()); - EXPECT_TRUE(merged_statistics->Encode().has_min); + EXPECT_TRUE(merged_statistics->Encode().min.has_value()); EXPECT_TRUE(merged_statistics->Encode().has_max); EXPECT_EQ(merged_statistics->Encode().is_min_value_exact, std::make_optional(true)); @@ -803,7 +803,7 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_FALSE(encoded.all_null_value); EXPECT_FALSE(encoded.null_count.has_value()); EXPECT_FALSE(encoded.distinct_count.has_value()); - EXPECT_FALSE(encoded.has_min); + EXPECT_FALSE(encoded.min.has_value()); EXPECT_FALSE(encoded.has_max); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); EXPECT_FALSE(encoded.is_max_value_exact.has_value()); @@ -1001,7 +1001,8 @@ class TestStatisticsSortOrder : public ::testing::Test { ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].min, + std::make_optional(cc_metadata->statistics()->EncodeMin())); EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true)); EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true)); @@ -1242,7 +1243,7 @@ void TestByteArrayStatisticsFromArrow() { auto stats = MakeStatistics(&descr); ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); - ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min().value()); ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); ASSERT_EQ(2, stats->null_count()); } @@ -1419,8 +1420,8 @@ class TestFloatStatistics : public ::testing::Test { stats->Update(values.data(), values.size(), /*null_count=*/0); ASSERT_TRUE(stats->HasMinMax()); - this->CheckEq(stats->min(), positive_zero_); - ASSERT_TRUE(this->signbit(stats->min())); + this->CheckEq(stats->min().value(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->min().value())); ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); this->CheckEq(stats->max(), positive_zero_); @@ -1655,7 +1656,7 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); ASSERT_TRUE(enc_stats->null_count.has_value()); ASSERT_FALSE(enc_stats->distinct_count.has_value()); - ASSERT_FALSE(enc_stats->has_min); + ASSERT_FALSE(enc_stats->min.has_value()); ASSERT_FALSE(enc_stats->has_max); ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); @@ -1723,7 +1724,7 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { column_chunk->encoded_statistics(); ASSERT_TRUE(encoded_statistics != NULL); ASSERT_EQ(0, encoded_statistics->null_count); - EXPECT_EQ("Al", encoded_statistics->min()); + EXPECT_EQ(std::make_optional("Al"), encoded_statistics->min); ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value()); switch (num_column) { @@ -1763,7 +1764,7 @@ TEST(TestEncodedStatistics, CopySafe) { ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value()); encoded_statistics.set_min("abc"); - ASSERT_TRUE(encoded_statistics.has_min); + ASSERT_TRUE(encoded_statistics.min.has_value()); encoded_statistics.is_min_value_exact = true; ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value()); @@ -1773,7 +1774,7 @@ TEST(TestEncodedStatistics, CopySafe) { copy_statistics.is_max_value_exact = false; copy_statistics.is_min_value_exact = false; - EXPECT_EQ("abc", encoded_statistics.min()); + EXPECT_EQ(std::make_optional("abc"), encoded_statistics.min); EXPECT_EQ("abc", encoded_statistics.max()); EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true)); @@ -1782,15 +1783,15 @@ TEST(TestEncodedStatistics, CopySafe) { TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EncodedStatistics encoded_statistics; encoded_statistics.set_min("a"); - ASSERT_TRUE(encoded_statistics.has_min); + ASSERT_TRUE(encoded_statistics.min.has_value()); encoded_statistics.set_max("abc"); ASSERT_TRUE(encoded_statistics.has_max); encoded_statistics.ApplyStatSizeLimits(2); - ASSERT_TRUE(encoded_statistics.has_min); - ASSERT_EQ("a", encoded_statistics.min()); + ASSERT_TRUE(encoded_statistics.min.has_value()); + ASSERT_EQ(std::make_optional("a"), encoded_statistics.min); ASSERT_FALSE(encoded_statistics.has_max); NodePtr node = diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index b499550221e..388a6509b36 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -469,15 +469,15 @@ static inline format::GeospatialStatistics ToThrift( static inline format::Statistics ToThrift(const EncodedStatistics& stats) { format::Statistics statistics; - if (stats.has_min) { - statistics.__set_min_value(stats.min()); + if (stats.min.has_value()) { + statistics.__set_min_value(stats.min.value()); if (stats.is_min_value_exact.has_value()) { statistics.__set_is_min_value_exact(stats.is_min_value_exact.value()); } // If the order is SIGNED, then the old min value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_min(stats.min()); + statistics.__set_min(stats.min.value()); } } if (stats.has_max) { From 1755f6eebbd318be8520ce3b720dc81c36e680e2 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Tue, 24 Feb 2026 00:02:47 -0300 Subject: [PATCH 04/11] fix: use std::optional for max --- .../parquet/arrow/arrow_reader_writer_test.cc | 8 +- cpp/src/parquet/arrow/reader_internal.cc | 10 +- cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/file_deserialize_test.cc | 4 +- cpp/src/parquet/metadata.cc | 34 +++-- cpp/src/parquet/metadata_test.cc | 16 +-- cpp/src/parquet/page_index.cc | 4 +- cpp/src/parquet/page_index_test.cc | 4 +- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/statistics.cc | 131 ++++++++++-------- cpp/src/parquet/statistics.h | 48 +++---- cpp/src/parquet/statistics_test.cc | 56 ++++---- cpp/src/parquet/thrift_internal.h | 6 +- 13 files changed, 170 insertions(+), 155 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index c968d0fa773..27ae5b5ccdd 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4736,11 +4736,11 @@ TEST_P(TestArrowWriteDictionary, Statistics) { auto expect_has_min_max = expected_has_min_max_by_page[case_index][row_group_index][page_index]; EXPECT_EQ(stats.min.has_value(), expect_has_min_max); - EXPECT_EQ(stats.has_max, expect_has_min_max); + EXPECT_EQ(stats.max.has_value(), expect_has_min_max); if (expect_has_min_max) { EXPECT_EQ(stats.min.value(), expected_min_by_page[case_index][row_group_index][page_index]); - EXPECT_EQ(stats.max(), + EXPECT_EQ(stats.max.value(), expected_max_by_page[case_index][row_group_index][page_index]); } @@ -4840,7 +4840,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); EXPECT_EQ(rg0_min_values[i], stats.min.value()); - EXPECT_EQ(rg0_max_values[i], stats.max()); + EXPECT_EQ(rg0_max_values[i], stats.max.value()); } ASSERT_EQ(rg0_page_reader->NextPage(), nullptr); @@ -4854,7 +4854,7 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); EXPECT_EQ("b", stats.min.value()); - EXPECT_EQ("c", stats.max()); + EXPECT_EQ("c", stats.max.value()); } ASSERT_EQ(rg1_page_reader->NextPage(), nullptr); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 2720e0c03d4..0341223d82d 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -170,7 +170,7 @@ Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { *min = ::arrow::MakeScalar(static_cast(statistics.min().value())); - *max = ::arrow::MakeScalar(static_cast(statistics.max())); + *max = ::arrow::MakeScalar(static_cast(statistics.max().value())); return Status::OK(); } @@ -180,7 +180,7 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min().value())); - ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max())); + ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max().value())); return Status::OK(); } @@ -227,7 +227,7 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max(), + return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max().value(), logical_type, min, max); default: break; @@ -252,7 +252,7 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max(), + return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max().value(), logical_type, min, max); default: break; @@ -390,7 +390,7 @@ void AttachStatistics(::arrow::ArrayData* data, const auto* typed_statistics = checked_cast*>(statistics.get()); const ArrowCType min = typed_statistics->min().value(); - const ArrowCType max = typed_statistics->max(); + const ArrowCType max = typed_statistics->max().value(); if constexpr (std::is_same_v) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index d02cbf0e97a..b392a4d4cef 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { auto metadata_accessor = ColumnChunkMetaData::Make( metadata_->contents(), this->descr_, default_reader_properties(), &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); - return {encoded_stats.min.has_value(), encoded_stats.has_max}; + return {encoded_stats.min.has_value(), encoded_stats.max.has_value()}; } std::vector metadata_encodings() { diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 25ee2b0c785..fd02ad6700a 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -69,7 +69,7 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats = template static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) { if (expected.statistics.__isset.max) { - ASSERT_EQ(expected.statistics.max, actual.max()); + ASSERT_EQ(expected.statistics.max, actual.max); } if (expected.statistics.__isset.min) { ASSERT_EQ(expected.statistics.min, actual.min); @@ -513,7 +513,7 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) { CheckDataPageHeader(this->data_page_headers_[i], current_page.get())); auto data_page = static_cast(current_page.get()); const EncodedStatistics encoded_statistics = data_page->statistics(); - ASSERT_EQ(read_stats[i].max(), encoded_statistics.max()); + ASSERT_EQ(read_stats[i].max, encoded_statistics.max); ASSERT_EQ(read_stats[i].min, encoded_statistics.min); ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count); ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 54efe2d2fd5..04ff3cca4f3 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -109,20 +109,30 @@ static std::shared_ptr MakeTypedColumnStats( metadata.statistics.__isset.distinct_count ? std::optional(metadata.statistics.distinct_count) : std::nullopt; + std::optional min_val = metadata.statistics.__isset.min + ? std::make_optional(metadata.statistics.min) + : std::nullopt; + std::optional max_val = metadata.statistics.__isset.max + ? std::make_optional(metadata.statistics.max) + : std::nullopt; // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return MakeStatistics( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - null_count.value_or(0), null_count, distinct_count, - metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - min_exact, max_exact, pool); + std::optional min_value = + metadata.statistics.__isset.min_value + ? std::make_optional(metadata.statistics.min_value) + : std::nullopt; + std::optional max_value = + metadata.statistics.__isset.max_value + ? std::make_optional(metadata.statistics.max_value) + : std::nullopt; + return MakeStatistics(descr, min_value, max_value, + metadata.num_values - null_count.value_or(0), null_count, + distinct_count, min_exact, max_exact, pool); } // Default behavior - return MakeStatistics( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - null_count.value_or(0), null_count, distinct_count, - metadata.statistics.__isset.max && metadata.statistics.__isset.min, min_exact, - max_exact, pool); + return MakeStatistics(descr, min_val, max_val, + metadata.num_values - null_count.value_or(0), null_count, + distinct_count, min_exact, max_exact, pool); } namespace { @@ -1614,8 +1624,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) - bool max_equals_min = statistics.min.has_value() && statistics.has_max - ? statistics.min.value() == statistics.max() + bool max_equals_min = statistics.min.has_value() && statistics.max.has_value() + ? statistics.min.value() == statistics.max.value() : false; if (SortOrder::SIGNED != sort_order && !max_equals_min) { return false; diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index ab21cca8606..99958596b72 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -156,13 +156,13 @@ TEST(Metadata, TestBuildAccess) { ASSERT_EQ(true, rg1_column2->is_stats_set()); ASSERT_EQ(stats_float.min, std::make_optional(rg1_column2->statistics()->EncodeMin())); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.max, rg1_column2->statistics()->EncodeMax()); ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.max, rg1_column1->statistics()->EncodeMax()); ASSERT_EQ(stats_float.min, rg1_column2->encoded_statistics()->min); - ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max()); + ASSERT_EQ(stats_float.max, rg1_column2->encoded_statistics()->max); ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); + ASSERT_EQ(stats_int.max, rg1_column1->encoded_statistics()->max); ASSERT_EQ(0, rg1_column1->statistics()->null_count()); ASSERT_EQ(0, rg1_column2->statistics()->null_count()); ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); @@ -208,13 +208,13 @@ TEST(Metadata, TestBuildAccess) { ASSERT_EQ(true, rg2_column2->is_stats_set()); ASSERT_EQ(stats_float.min, std::make_optional(rg2_column2->statistics()->EncodeMin())); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.max, rg2_column2->statistics()->EncodeMax()); ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.max, rg1_column1->statistics()->EncodeMax()); ASSERT_EQ(stats_float.min, rg2_column2->encoded_statistics()->min); - ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max()); + ASSERT_EQ(stats_float.max, rg2_column2->encoded_statistics()->max); ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); + ASSERT_EQ(stats_int.max, rg1_column1->encoded_statistics()->max); ASSERT_EQ(0, rg2_column1->statistics()->null_count()); ASSERT_EQ(0, rg2_column2->statistics()->null_count()); ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index c421cd31e63..8eb73d3534b 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.null_pages.emplace_back(true); column_index_.min_values.emplace_back(""); column_index_.max_values.emplace_back(""); - } else if (stats.min.has_value() && stats.has_max) { + } else if (stats.min.has_value() && stats.max.has_value()) { const size_t page_ordinal = column_index_.null_pages.size(); non_null_page_indices_.emplace_back(page_ordinal); column_index_.min_values.emplace_back(stats.min.value()); - column_index_.max_values.emplace_back(stats.max()); + column_index_.max_values.emplace_back(stats.max.value()); column_index_.null_pages.emplace_back(false); } else { /// This is a non-null page but it lacks of meaningful min/max values. diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 41c8ba5ab45..78f81b190b3 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -547,7 +547,7 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); ASSERT_EQ(page_stats[i].min.value_or(""), column_index->encoded_min_values()[i]); - ASSERT_EQ(page_stats[i].max(), column_index->encoded_max_values()[i]); + ASSERT_EQ(page_stats[i].max.value_or(""), column_index->encoded_max_values()[i]); if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); } @@ -814,7 +814,7 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_EQ(size_t{1}, column_index->null_pages().size()); ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); ASSERT_EQ(stats.min, column_index->encoded_min_values()[0]); - ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); + ASSERT_EQ(stats.max, column_index->encoded_max_values()[0]); ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); if (stats.null_count) { ASSERT_EQ(*stats.null_count, column_index->null_counts()[0]); diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 4429e3aeb14..5ebcced4eae 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -165,7 +165,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte } stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { - std::string min = stats->min.value_or(""), max = stats->max(); + std::string min = stats->min.value_or(""), max = stats->max.value_or(""); std::string max_exact = stats->is_max_value_exact.has_value() ? (stats->is_max_value_exact.value() ? "true" : "false") diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 03cd10e2e07..f6ca034f11b 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -595,9 +595,9 @@ class TypedStatisticsImpl : public TypedStatistics { SetDistinctCount(distinct_count); min_.emplace(); + max_.emplace(); Copy(min, &min_.value(), min_buffer_.get()); - Copy(max, &max_, max_buffer_.get()); - has_min_max_ = true; + Copy(max, &max_.value(), max_buffer_.get()); statistics_.is_min_value_exact = true; statistics_.is_max_value_exact = true; } @@ -605,20 +605,19 @@ class TypedStatisticsImpl : public TypedStatistics { // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, + const std::optional& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, - MemoryPool* pool) + std::optional distinct_count, MemoryPool* pool) : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, + distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool) {} TypedStatisticsImpl(const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, + const std::optional& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, + std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { @@ -634,24 +633,26 @@ class TypedStatisticsImpl : public TypedStatistics { statistics_.distinct_count = std::nullopt; } - if (has_min_max) { - T min_val, max_val; + if (encoded_min.has_value()) { + T min_val; PlainDecode(encoded_min.value(), &min_val); - PlainDecode(encoded_max, &max_val); min_.emplace(); Copy(min_val, &min_.value(), min_buffer_.get()); - Copy(max_val, &max_, max_buffer_.get()); statistics_.is_min_value_exact = is_min_value_exact; + } + if (encoded_max.has_value()) { + T max_val; + PlainDecode(encoded_max.value(), &max_val); + max_.emplace(); + Copy(max_val, &max_.value(), max_buffer_.get()); statistics_.is_max_value_exact = is_max_value_exact; } - - has_min_max_ = has_min_max; } bool HasDistinctCount() const override { return statistics_.distinct_count.has_value(); }; - bool HasMinMax() const override { return has_min_max_; } + bool HasMinMax() const override { return min_.has_value() && max_.has_value(); } bool HasNullCount() const override { return statistics_.null_count.has_value(); }; void IncrementNullCount(int64_t n) override { @@ -682,11 +683,7 @@ class TypedStatisticsImpl : public TypedStatistics { } const auto& other = checked_cast(raw_other); - - if (has_min_max_ != other.has_min_max_) return false; - if (has_min_max_) { - if (!MinMaxEqual(other)) return false; - } + if (!MinMaxEqual(other)) return false; return null_count() == other.null_count() && distinct_count() == other.distinct_count() && @@ -700,21 +697,39 @@ class TypedStatisticsImpl : public TypedStatistics { void Reset() override { ResetCounts(); ResetHasFlags(); + min_ = std::nullopt; + max_ = std::nullopt; } void SetMinMax(const T& arg_min, const T& arg_max) override { SetMinMaxPair({arg_min, arg_max}); } - void SetMinMax(std::optional arg_min, const T& arg_max) override { - if (arg_min.has_value()) { - SetMinMaxPair({arg_min.value(), arg_max}); - } else { - // If min is not provided, we only update max if we have it. - // But SetMinMaxPair expects a pair. - // For now, if min is not provided, we might need a different logic or - // just ignore if the intention was to merge stats where min might be missing. - // However, HasMinMax() usually implies both are present in Parquet. + void SetMinMax(std::optional arg_min, std::optional arg_max) override { + if (arg_min.has_value() && arg_max.has_value()) { + SetMinMaxPair({arg_min.value(), arg_max.value()}); + } else if (arg_min.has_value()) { + // Only min is provided. + if (!min_.has_value()) { + min_.emplace(); + Copy(arg_min.value(), &min_.value(), min_buffer_.get()); + } else if (comparator_) { + Copy(comparator_->Compare(min_.value(), arg_min.value()) ? min_.value() + : arg_min.value(), + &min_.value(), min_buffer_.get()); + } + statistics_.is_min_value_exact = true; + } else if (arg_max.has_value()) { + // Only max is provided. + if (!max_.has_value()) { + max_.emplace(); + Copy(arg_max.value(), &max_.value(), max_buffer_.get()); + } else if (comparator_) { + Copy(comparator_->Compare(max_.value(), arg_max.value()) ? arg_max.value() + : max_.value(), + &max_.value(), max_buffer_.get()); + } + statistics_.is_max_value_exact = true; } } @@ -764,7 +779,7 @@ class TypedStatisticsImpl : public TypedStatistics { std::optional min() const override { return min_; } - const T& max() const override { return max_; } + std::optional max() const override { return max_; } Type::type physical_type() const override { return descr_->physical_type(); } @@ -772,22 +787,24 @@ class TypedStatisticsImpl : public TypedStatistics { std::string EncodeMin() const override { std::string s; - if (HasMinMax()) this->PlainEncode(min_.value(), &s); + if (min_.has_value()) this->PlainEncode(min_.value(), &s); return s; } std::string EncodeMax() const override { std::string s; - if (HasMinMax()) this->PlainEncode(max_, &s); + if (max_.has_value()) this->PlainEncode(max_.value(), &s); return s; } EncodedStatistics Encode() override { EncodedStatistics s; - if (HasMinMax()) { + if (min_.has_value()) { s.set_min(this->EncodeMin()); - s.set_max(this->EncodeMax()); s.is_min_value_exact = this->is_min_value_exact(); + } + if (max_.has_value()) { + s.set_max(this->EncodeMax()); s.is_max_value_exact = this->is_max_value_exact(); } if (HasNullCount()) { @@ -815,9 +832,8 @@ class TypedStatisticsImpl : public TypedStatistics { private: const ColumnDescriptor* descr_; - bool has_min_max_ = false; std::optional min_; - T max_; + std::optional max_; ::arrow::MemoryPool* pool_; // Number of non-null values. // Please note that num_values_ is reliable when null_count is set. @@ -847,8 +863,6 @@ class TypedStatisticsImpl : public TypedStatistics { } void ResetHasFlags() { - // has_min_max_ will only be set when it meets any valid value. - this->has_min_max_ = false; // distinct_count will only be set once SetDistinctCount() // is called because distinct count calculation is not cheap and // disabled by default. @@ -865,32 +879,35 @@ class TypedStatisticsImpl : public TypedStatistics { auto min = maybe_min_max.value().first; auto max = maybe_min_max.value().second; - - if (!has_min_max_) { - has_min_max_ = true; + if (!HasMinMax()) { min_.emplace(); + max_.emplace(); Copy(min, &min_.value(), min_buffer_.get()); - Copy(max, &max_, max_buffer_.get()); + Copy(max, &max_.value(), max_buffer_.get()); } else { Copy(comparator_->Compare(min_.value(), min) ? min_.value() : min, &min_.value(), min_buffer_.get()); - Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get()); + Copy(comparator_->Compare(max_.value(), max) ? max : max_.value(), &max_.value(), + max_buffer_.get()); } statistics_.is_min_value_exact = true; statistics_.is_max_value_exact = true; } }; -#define MinMaxSetCheck \ - DCHECK(has_min_max_&& min_.has_value() && other.min_.has_value()) \ - << "Min and Max must be set" template <> inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); - MinMaxSetCheck; - return std::memcmp(min_.value().ptr, other.min_.value().ptr, len) == 0 && - std::memcmp(max_.ptr, other.max_.ptr, len) == 0; + if (min_.has_value() != other.min_.has_value()) return false; + if (min_.has_value()) { + if (std::memcmp(min_.value().ptr, other.min_.value().ptr, len) != 0) return false; + } + if (max_.has_value() != other.max_.has_value()) return false; + if (max_.has_value()) { + if (std::memcmp(max_.value().ptr, other.max_.value().ptr, len) != 0) return false; + } + return true; } template @@ -1108,33 +1125,33 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, int64_t num_values, ::arrow::MemoryPool* pool) { DCHECK(encoded_stats != nullptr); - return Make(descr, encoded_stats->min, encoded_stats->max(), num_values, + return Make(descr, encoded_stats->min, encoded_stats->max, num_values, encoded_stats->null_count, encoded_stats->distinct_count, - encoded_stats->min.has_value() && encoded_stats->has_max, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, ::arrow::MemoryPool* pool) { + const std::optional& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, + ::arrow::MemoryPool* pool) { return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, + distinct_count, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool); } std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, + const std::optional& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { #define MAKE_STATS(CAP_TYPE, KLASS) \ case Type::CAP_TYPE: \ return std::make_shared>( \ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \ - has_min_max, is_min_value_exact, is_max_value_exact, pool) + is_min_value_exact, is_max_value_exact, pool) switch (descr->physical_type()) { MAKE_STATS(BOOLEAN, BooleanType); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 6ffa764217b..2b4e6511d5a 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -119,13 +119,12 @@ std::shared_ptr> MakeComparator(const ColumnDescriptor* d /// \brief Structure represented encoded statistics to be written to /// and read from Parquet serialized metadata. class PARQUET_EXPORT EncodedStatistics { - std::string max_; bool is_signed_ = false; public: EncodedStatistics() = default; - const std::string& max() const { return max_; } + std::optional max; std::optional min; std::optional is_max_value_exact; @@ -134,8 +133,6 @@ class PARQUET_EXPORT EncodedStatistics { std::optional null_count; std::optional distinct_count; - bool has_max = false; - // When all values in the statistics are null, it is set to true. // Otherwise, at least one value is not null, or we are not sure at all. // Page index requires this information to decide whether a data page @@ -148,9 +145,8 @@ class PARQUET_EXPORT EncodedStatistics { // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. void ApplyStatSizeLimits(size_t length) { - if (max_.length() > length) { - has_max = false; - max_.clear(); + if (this->max.has_value() && this->max->length() > length) { + this->max = std::nullopt; is_max_value_exact = std::nullopt; } if (this->min.has_value() && this->min->length() > length) { @@ -161,20 +157,18 @@ class PARQUET_EXPORT EncodedStatistics { // Clear Min Max. void ClearMinMax() { - has_max = false; - max_.clear(); + this->max = std::nullopt; this->min = std::nullopt; } - bool is_set() const { return this->min || has_max || null_count || distinct_count; } + bool is_set() const { return this->min || this->max || null_count || distinct_count; } bool is_signed() const { return is_signed_; } void set_is_signed(bool is_signed) { is_signed_ = is_signed; } EncodedStatistics& set_max(std::string value) { - max_ = std::move(value); - has_max = true; + this->max = std::move(value); return *this; } @@ -215,13 +209,12 @@ class PARQUET_EXPORT Statistics { /// \param[in] num_values total number of values /// \param[in] null_count number of null values (std::nullopt if not set) /// \param[in] distinct_count number of distinct values (std::nullopt if not set) - /// \param[in] has_min_max whether the min/max statistics are set /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, + const std::optional& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, - bool has_min_max, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); /// \brief Create a new statistics instance given a column schema /// definition and preexisting state @@ -231,16 +224,14 @@ class PARQUET_EXPORT Statistics { /// \param[in] num_values total number of values /// \param[in] null_count number of null values (std::nullopt if not set) /// \param[in] distinct_count number of distinct values (std::nullopt if not set) - /// \param[in] has_min_max whether the min/max statistics are set /// \param[in] is_min_value_exact whether the min value is exact /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, + const std::optional& encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, - bool has_min_max, std::optional is_min_value_exact, - std::optional is_max_value_exact, + std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); // Helper function to convert EncodedStatistics to Statistics. @@ -315,7 +306,7 @@ class TypedStatistics : public Statistics { virtual std::optional min() const = 0; /// \brief The current maximum value - virtual const T& max() const = 0; + virtual std::optional max() const = 0; /// \brief Update state with state of another Statistics object virtual void Merge(const TypedStatistics& other) = 0; @@ -353,7 +344,7 @@ class TypedStatistics : public Statistics { virtual void SetMinMax(const T& min, const T& max) = 0; /// \brief Set min and max values to particular values, where min is optional - virtual void SetMinMax(std::optional min, const T& max) = 0; + virtual void SetMinMax(std::optional min, std::optional max) = 0; /// \brief Increments the null count directly /// Use Update to extract the null count from data. Use this if you determine @@ -402,12 +393,11 @@ std::shared_ptr> MakeStatistics(const typename DType::c_t template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, + const std::optional& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>(Statistics::Make( descr, encoded_min, encoded_max, num_values, null_count, distinct_count, - has_min_max, /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); } @@ -415,12 +405,12 @@ std::shared_ptr> MakeStatistics( template std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::string& encoded_max, int64_t num_values, std::optional null_count, - std::optional distinct_count, bool has_min_max, + const std::optional& encoded_max, int64_t num_values, + std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>(Statistics::Make( - descr, encoded_min, encoded_max, num_values, null_count, distinct_count, - has_min_max, is_min_value_exact, is_max_value_exact, pool)); + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, is_min_value_exact, is_max_value_exact, pool)); } } // namespace parquet diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index c7f157ded33..48555e20fce 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -323,7 +323,6 @@ class TestStatistics : public PrimitiveTypedTest { auto statistics2 = MakeStatistics( this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true, /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); auto statistics3 = MakeStatistics(this->schema_.Column(0)); @@ -337,8 +336,7 @@ class TestStatistics : public PrimitiveTypedTest { // Use old API without is_{min/max}_value_exact auto statistics4 = MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), - /*null_count=*/0, /*distinct_count=*/0, - /*has_min_max=*/true); + /*null_count=*/0, /*distinct_count=*/0); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); ASSERT_EQ(statistics1->min(), statistics2->min()); @@ -473,9 +471,9 @@ class TestStatistics : public PrimitiveTypedTest { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); EXPECT_EQ(null_count, enc_stats->null_count); EXPECT_TRUE(enc_stats->min.has_value()); - EXPECT_TRUE(enc_stats->has_max); + EXPECT_TRUE(enc_stats->max.has_value()); EXPECT_EQ(std::make_optional(expected_stats->EncodeMin()), enc_stats->min); - EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max()); + EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true)); } @@ -566,15 +564,15 @@ void TestStatistics::TestMinMaxEncode() { std::string(reinterpret_cast(statistics1->min().value().ptr), statistics1->min().value().len)); ASSERT_EQ(encoded_max, - std::string(reinterpret_cast(statistics1->max().ptr), - statistics1->max().len)); + std::string(reinterpret_cast(statistics1->max().value().ptr), + statistics1->max().value().len)); - auto statistics2 = MakeStatistics( - this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), - /*null_count=*/0, - /*distinct_count=*/0, /*has_min_max=*/true, - /*is_min_value_exact=*/true, - /*is_max_value_exact=*/true); + auto statistics2 = MakeStatistics(this->schema_.Column(0), encoded_min, + encoded_max, this->values_.size(), + /*null_count=*/0, + /*distinct_count=*/0, + /*is_min_value_exact=*/true, + /*is_max_value_exact=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); @@ -713,7 +711,7 @@ class TestStatisticsHasFlag : public TestStatistics { auto encoded_stats1 = statistics1->Encode(); EXPECT_FALSE(statistics1->HasMinMax()); EXPECT_FALSE(encoded_stats1.min.has_value()); - EXPECT_FALSE(encoded_stats1.has_max); + EXPECT_FALSE(encoded_stats1.max.has_value()); EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt); EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt); } @@ -725,7 +723,7 @@ class TestStatisticsHasFlag : public TestStatistics { auto encoded_stats2 = statistics2->Encode(); EXPECT_TRUE(statistics2->HasMinMax()); EXPECT_TRUE(encoded_stats2.min.has_value()); - EXPECT_TRUE(encoded_stats2.has_max); + EXPECT_TRUE(encoded_stats2.max.has_value()); EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true)); } @@ -733,7 +731,7 @@ class TestStatisticsHasFlag : public TestStatistics { [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasMinMax()); EXPECT_TRUE(merged_statistics->Encode().min.has_value()); - EXPECT_TRUE(merged_statistics->Encode().has_max); + EXPECT_TRUE(merged_statistics->Encode().max.has_value()); EXPECT_EQ(merged_statistics->Encode().is_min_value_exact, std::make_optional(true)); EXPECT_EQ(merged_statistics->Encode().is_max_value_exact, @@ -804,7 +802,7 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_FALSE(encoded.null_count.has_value()); EXPECT_FALSE(encoded.distinct_count.has_value()); EXPECT_FALSE(encoded.min.has_value()); - EXPECT_FALSE(encoded.has_max); + EXPECT_FALSE(encoded.max.has_value()); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); EXPECT_FALSE(encoded.is_max_value_exact.has_value()); } @@ -1003,7 +1001,7 @@ class TestStatisticsSortOrder : public ::testing::Test { rg_metadata->ColumnChunk(i); EXPECT_EQ(stats_[i].min, std::make_optional(cc_metadata->statistics()->EncodeMin())); - EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].max, cc_metadata->statistics()->EncodeMax()); EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true)); EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true)); } @@ -1424,8 +1422,8 @@ class TestFloatStatistics : public ::testing::Test { ASSERT_TRUE(this->signbit(stats->min().value())); ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); - this->CheckEq(stats->max(), positive_zero_); - ASSERT_FALSE(this->signbit(stats->max())); + this->CheckEq(stats->max().value(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->max().value())); ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_)); } @@ -1657,7 +1655,7 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_TRUE(enc_stats->null_count.has_value()); ASSERT_FALSE(enc_stats->distinct_count.has_value()); ASSERT_FALSE(enc_stats->min.has_value()); - ASSERT_FALSE(enc_stats->has_max); + ASSERT_FALSE(enc_stats->max.has_value()); ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); @@ -1730,26 +1728,26 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { switch (num_column) { case 2: // Max couldn't truncate the utf-8 string longer than 2 bytes - EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max()); + EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max.value()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 3: // Max couldn't truncate 0xFFFF binary string - EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max()); + EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max.value()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 4: case 5: // Min and Max are not truncated, fit on 2 bytes - EXPECT_EQ("Ke", encoded_statistics->max()); + EXPECT_EQ("Ke", encoded_statistics->max.value()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.value()); break; default: // Max truncated to 2 bytes on columns 0 and 1 - EXPECT_EQ("Kf", encoded_statistics->max()); + EXPECT_EQ("Kf", encoded_statistics->max.value()); ASSERT_FALSE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); } @@ -1759,7 +1757,7 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { TEST(TestEncodedStatistics, CopySafe) { EncodedStatistics encoded_statistics; encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.has_max); + ASSERT_TRUE(encoded_statistics.max.has_value()); encoded_statistics.is_max_value_exact = true; ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value()); @@ -1775,7 +1773,7 @@ TEST(TestEncodedStatistics, CopySafe) { copy_statistics.is_min_value_exact = false; EXPECT_EQ(std::make_optional("abc"), encoded_statistics.min); - EXPECT_EQ("abc", encoded_statistics.max()); + EXPECT_EQ("abc", encoded_statistics.max.value()); EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true)); } @@ -1786,13 +1784,13 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) { ASSERT_TRUE(encoded_statistics.min.has_value()); encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.has_max); + ASSERT_TRUE(encoded_statistics.max.has_value()); encoded_statistics.ApplyStatSizeLimits(2); ASSERT_TRUE(encoded_statistics.min.has_value()); ASSERT_EQ(std::make_optional("a"), encoded_statistics.min); - ASSERT_FALSE(encoded_statistics.has_max); + ASSERT_FALSE(encoded_statistics.max.has_value()); NodePtr node = PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 388a6509b36..8beead7bc1f 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -480,15 +480,15 @@ static inline format::Statistics ToThrift(const EncodedStatistics& stats) { statistics.__set_min(stats.min.value()); } } - if (stats.has_max) { - statistics.__set_max_value(stats.max()); + if (stats.max.has_value()) { + statistics.__set_max_value(stats.max.value()); if (stats.is_max_value_exact.has_value()) { statistics.__set_is_max_value_exact(stats.is_max_value_exact.value()); } // If the order is SIGNED, then the old max value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_max(stats.max()); + statistics.__set_max(stats.max.value()); } } if (stats.null_count) { From 4145d144306d14a1507e09f0edd44366efa30d0f Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Tue, 24 Feb 2026 00:21:53 -0300 Subject: [PATCH 05/11] refactor --- cpp/src/arrow/dataset/file_parquet_test.cc | 4 ++-- cpp/src/parquet/page_index.cc | 4 ++-- cpp/src/parquet/page_index_test.cc | 2 +- cpp/src/parquet/printer.cc | 2 +- cpp/src/parquet/statistics.cc | 7 ++++--- cpp/src/parquet/statistics.h | 13 ++++++++----- cpp/src/parquet/statistics_test.cc | 3 +-- 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index adec960f28f..0534dca52ef 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -942,7 +942,7 @@ TEST(TestParquetStatistics, NoNullCount) { encoded_stats.set_min(int32_to_parquet_stats(1)); encoded_stats.set_max(int32_to_parquet_stats(100)); encoded_stats.all_null_value = false; - encoded_stats.null_count.reset(); + encoded_stats.null_count = std::nullopt; auto stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/10); auto stat_expression = @@ -963,7 +963,7 @@ TEST(TestParquetStatistics, NoNullCount) { ASSERT_TRUE(stat_expression.has_value()); EXPECT_EQ(stat_expression->ToString(), "is_null(x, {nan_is_null=false})"); - encoded_stats.null_count.reset(); + encoded_stats.null_count = std::nullopt; encoded_stats.all_null_value = false; stats = ::parquet::Statistics::Make(&descr, &encoded_stats, /*num_values=*/0); stat_expression = ParquetFileFragment::EvaluateStatisticsAsExpression(*field, *stats); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 8eb73d3534b..39aaf8671c5 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -524,8 +524,8 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { return; } - if (column_index_.__isset.null_counts && stats.null_count) { - column_index_.null_counts.emplace_back(*stats.null_count); + if (column_index_.__isset.null_counts && stats.null_count.has_value()) { + column_index_.null_counts.emplace_back(stats.null_count.value()); } else { column_index_.__isset.null_counts = false; column_index_.null_counts.clear(); diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 78f81b190b3..74e9de287fd 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -817,7 +817,7 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_EQ(stats.max, column_index->encoded_max_values()[0]); ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); if (stats.null_count) { - ASSERT_EQ(*stats.null_count, column_index->null_counts()[0]); + ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]); } } diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 5ebcced4eae..8c46373a38a 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -165,7 +165,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte } stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { - std::string min = stats->min.value_or(""), max = stats->max.value_or(""); + std::string min = stats->Min().value_or(""), max = stats->Max().value_or(""); std::string max_exact = stats->is_max_value_exact.has_value() ? (stats->is_max_value_exact.value() ? "true" : "false") diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index f6ca034f11b..42574e54323 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -737,7 +737,7 @@ class TypedStatisticsImpl : public TypedStatistics { this->num_values_ += other.num_values(); // null_count is valid only if both sides have it. if (this->HasNullCount() && other.HasNullCount()) { - this->statistics_.null_count = *this->statistics_.null_count + other.null_count(); + this->statistics_.null_count = this->statistics_.null_count.value() + other.null_count(); } else { this->statistics_.null_count = std::nullopt; } @@ -745,7 +745,7 @@ class TypedStatisticsImpl : public TypedStatistics { (distinct_count() == 0 || other.distinct_count() == 0)) { // We can merge distinct counts if either side is zero. statistics_.distinct_count = - std::max(*statistics_.distinct_count, other.distinct_count()); + std::max(statistics_.DistinctCount().value(), other.DistinctCount().value()); } else { // Otherwise clear distinct_count as distinct count cannot be merged. this->statistics_.distinct_count = std::nullopt; @@ -879,6 +879,7 @@ class TypedStatisticsImpl : public TypedStatistics { auto min = maybe_min_max.value().first; auto max = maybe_min_max.value().second; + if (!HasMinMax()) { min_.emplace(); max_.emplace(); @@ -1125,7 +1126,7 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, int64_t num_values, ::arrow::MemoryPool* pool) { DCHECK(encoded_stats != nullptr); - return Make(descr, encoded_stats->min, encoded_stats->max, num_values, + return Make(descr, encoded_stats->Min(), encoded_stats->Max(), num_values, encoded_stats->null_count, encoded_stats->distinct_count, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 2b4e6511d5a..a2f5dc4cedd 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -161,7 +161,10 @@ class PARQUET_EXPORT EncodedStatistics { this->min = std::nullopt; } - bool is_set() const { return this->min || this->max || null_count || distinct_count; } + bool is_set() const { + return this->min.has_value() || this->max.has_value() || null_count.has_value() || + distinct_count.has_value(); + } bool is_signed() const { return is_signed_; } @@ -207,8 +210,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_min the encoded minimum value /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values - /// \param[in] null_count number of null values (std::nullopt if not set) - /// \param[in] distinct_count number of distinct values (std::nullopt if not set) + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( const ColumnDescriptor* descr, const std::optional& encoded_min, @@ -222,8 +225,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] encoded_min the encoded minimum value /// \param[in] encoded_max the encoded maximum value /// \param[in] num_values total number of values - /// \param[in] null_count number of null values (std::nullopt if not set) - /// \param[in] distinct_count number of distinct values (std::nullopt if not set) + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values /// \param[in] is_min_value_exact whether the min value is exact /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 48555e20fce..48751014174 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -780,7 +780,6 @@ class TestStatisticsHasFlag : public TestStatistics { } // Merge without null-count should not have null count - // BUG: Failing here VerifyMergedStatistics( *statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_FALSE(merged_statistics->HasNullCount()); @@ -792,7 +791,7 @@ class TestStatisticsHasFlag : public TestStatistics { // If statistics doesn't have null count, all_null_value should be false. void TestMissingNullCount() { EncodedStatistics encoded_statistics; - encoded_statistics.null_count.reset(); + encoded_statistics.null_count = std::nullopt; auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics, /*num_values=*/1000); auto typed_stats = std::dynamic_pointer_cast>(statistics); From 97d75448f83c94587193cf77a693d2c3d297780e Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Tue, 24 Feb 2026 23:20:21 -0300 Subject: [PATCH 06/11] refactor: max min --- .../parquet/arrow/arrow_reader_writer_test.cc | 16 +-- cpp/src/parquet/arrow/reader_internal.cc | 16 +-- cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/file_deserialize_test.cc | 8 +- cpp/src/parquet/metadata.cc | 26 ++-- cpp/src/parquet/metadata_test.cc | 34 +++-- cpp/src/parquet/page_index.cc | 6 +- cpp/src/parquet/page_index_test.cc | 8 +- cpp/src/parquet/statistics.cc | 122 ++++++------------ cpp/src/parquet/statistics.h | 48 +++---- cpp/src/parquet/statistics_test.cc | 81 ++++++------ cpp/src/parquet/thrift_internal.h | 16 +-- 12 files changed, 173 insertions(+), 210 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 27ae5b5ccdd..d1f4e7902b3 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4735,12 +4735,12 @@ TEST_P(TestArrowWriteDictionary, Statistics) { auto expect_has_min_max = expected_has_min_max_by_page[case_index][row_group_index][page_index]; - EXPECT_EQ(stats.min.has_value(), expect_has_min_max); - EXPECT_EQ(stats.max.has_value(), expect_has_min_max); + EXPECT_EQ(stats.min_.has_value(), expect_has_min_max); + EXPECT_EQ(stats.max_.has_value(), expect_has_min_max); if (expect_has_min_max) { - EXPECT_EQ(stats.min.value(), + EXPECT_EQ(stats.min(), expected_min_by_page[case_index][row_group_index][page_index]); - EXPECT_EQ(stats.max.value(), + EXPECT_EQ(stats.max(), expected_max_by_page[case_index][row_group_index][page_index]); } @@ -4839,8 +4839,8 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { ASSERT_EQ(3, data_page->num_values()); const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); - EXPECT_EQ(rg0_min_values[i], stats.min.value()); - EXPECT_EQ(rg0_max_values[i], stats.max.value()); + EXPECT_EQ(rg0_min_values[i], stats.min()); + EXPECT_EQ(rg0_max_values[i], stats.max()); } ASSERT_EQ(rg0_page_reader->NextPage(), nullptr); @@ -4853,8 +4853,8 @@ TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) { ASSERT_EQ(3, data_page->num_values()); const auto& stats = data_page->statistics(); EXPECT_EQ(1, stats.null_count); - EXPECT_EQ("b", stats.min.value()); - EXPECT_EQ("c", stats.max.value()); + EXPECT_EQ("b", stats.min()); + EXPECT_EQ("c", stats.max()); } ASSERT_EQ(rg1_page_reader->NextPage(), nullptr); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 0341223d82d..12f36fe39cf 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -169,8 +169,8 @@ template Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - *min = ::arrow::MakeScalar(static_cast(statistics.min().value())); - *max = ::arrow::MakeScalar(static_cast(statistics.max().value())); + *min = ::arrow::MakeScalar(static_cast(statistics.min())); + *max = ::arrow::MakeScalar(static_cast(statistics.max())); return Status::OK(); } @@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics, std::shared_ptr type, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min().value())); - ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max().value())); + ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min())); + ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max())); return Status::OK(); } @@ -227,7 +227,7 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max().value(), + return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), logical_type, min, max); default: break; @@ -252,7 +252,7 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min().value(), statistics.max().value(), + return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), logical_type, min, max); default: break; @@ -389,8 +389,8 @@ void AttachStatistics(::arrow::ArrayData* data, if (statistics->HasMinMax()) { const auto* typed_statistics = checked_cast*>(statistics.get()); - const ArrowCType min = typed_statistics->min().value(); - const ArrowCType max = typed_statistics->max().value(); + const ArrowCType min = typed_statistics->min(); + const ArrowCType max = typed_statistics->max(); if constexpr (std::is_same_v) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index b392a4d4cef..ce5af028eb9 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { auto metadata_accessor = ColumnChunkMetaData::Make( metadata_->contents(), this->descr_, default_reader_properties(), &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); - return {encoded_stats.min.has_value(), encoded_stats.max.has_value()}; + return {encoded_stats.min_.has_value(), encoded_stats.max_.has_value()}; } std::vector metadata_encodings() { diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index fd02ad6700a..7fa5e2f167e 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats = template static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) { if (expected.statistics.__isset.max) { - ASSERT_EQ(expected.statistics.max, actual.max); + ASSERT_EQ(expected.statistics.max, actual.max()); } if (expected.statistics.__isset.min) { - ASSERT_EQ(expected.statistics.min, actual.min); + ASSERT_EQ(expected.statistics.min, actual.min()); } if (expected.statistics.__isset.null_count) { ASSERT_EQ(expected.statistics.null_count, actual.null_count); @@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) { CheckDataPageHeader(this->data_page_headers_[i], current_page.get())); auto data_page = static_cast(current_page.get()); const EncodedStatistics encoded_statistics = data_page->statistics(); - ASSERT_EQ(read_stats[i].max, encoded_statistics.max); - ASSERT_EQ(read_stats[i].min, encoded_statistics.min); + ASSERT_EQ(read_stats[i].max(), encoded_statistics.max()); + ASSERT_EQ(read_stats[i].min(), encoded_statistics.min()); ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count); ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count); ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 04ff3cca4f3..3053aef889d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -109,21 +109,23 @@ static std::shared_ptr MakeTypedColumnStats( metadata.statistics.__isset.distinct_count ? std::optional(metadata.statistics.distinct_count) : std::nullopt; - std::optional min_val = metadata.statistics.__isset.min - ? std::make_optional(metadata.statistics.min) - : std::nullopt; - std::optional max_val = metadata.statistics.__isset.max - ? std::make_optional(metadata.statistics.max) - : std::nullopt; + std::optional min_val = + metadata.statistics.__isset.min + ? std::optional(metadata.statistics.min) + : std::nullopt; + std::optional max_val = + metadata.statistics.__isset.max + ? std::optional(metadata.statistics.max) + : std::nullopt; // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - std::optional min_value = + std::optional min_value = metadata.statistics.__isset.min_value - ? std::make_optional(metadata.statistics.min_value) + ? std::optional(metadata.statistics.min_value) : std::nullopt; - std::optional max_value = + std::optional max_value = metadata.statistics.__isset.max_value - ? std::make_optional(metadata.statistics.max_value) + ? std::optional(metadata.statistics.max_value) : std::nullopt; return MakeStatistics(descr, min_value, max_value, metadata.num_values - null_count.value_or(0), null_count, @@ -1624,8 +1626,8 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) - bool max_equals_min = statistics.min.has_value() && statistics.max.has_value() - ? statistics.min.value() == statistics.max.value() + bool max_equals_min = statistics.min_.has_value() && statistics.max_.has_value() + ? statistics.min() == statistics.max() : false; if (SortOrder::SIGNED != sort_order && !max_equals_min) { return false; diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 99958596b72..572f053179c 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -154,15 +154,14 @@ TEST(Metadata, TestBuildAccess) { auto rg1_column2 = rg1_accessor->ColumnChunk(1); ASSERT_EQ(true, rg1_column1->is_stats_set()); ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min, - std::make_optional(rg1_column2->statistics()->EncodeMin())); - ASSERT_EQ(stats_float.max, rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); - ASSERT_EQ(stats_int.max, rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min, rg1_column2->encoded_statistics()->min); - ASSERT_EQ(stats_float.max, rg1_column2->encoded_statistics()->max); - ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); - ASSERT_EQ(stats_int.max, rg1_column1->encoded_statistics()->max); + ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min()); + ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max()); + ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); + ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); ASSERT_EQ(0, rg1_column1->statistics()->null_count()); ASSERT_EQ(0, rg1_column2->statistics()->null_count()); ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); @@ -206,15 +205,14 @@ TEST(Metadata, TestBuildAccess) { auto rg2_column2 = rg2_accessor->ColumnChunk(1); ASSERT_EQ(true, rg2_column1->is_stats_set()); ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min, - std::make_optional(rg2_column2->statistics()->EncodeMin())); - ASSERT_EQ(stats_float.max, rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min, std::make_optional(rg1_column1->statistics()->EncodeMin())); - ASSERT_EQ(stats_int.max, rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min, rg2_column2->encoded_statistics()->min); - ASSERT_EQ(stats_float.max, rg2_column2->encoded_statistics()->max); - ASSERT_EQ(stats_int.min, rg1_column1->encoded_statistics()->min); - ASSERT_EQ(stats_int.max, rg1_column1->encoded_statistics()->max); + ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min()); + ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max()); + ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); + ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); ASSERT_EQ(0, rg2_column1->statistics()->null_count()); ASSERT_EQ(0, rg2_column2->statistics()->null_count()); ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 39aaf8671c5..0f90c45358d 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.null_pages.emplace_back(true); column_index_.min_values.emplace_back(""); column_index_.max_values.emplace_back(""); - } else if (stats.min.has_value() && stats.max.has_value()) { + } else if (stats.min_.has_value() && stats.max_.has_value()) { const size_t page_ordinal = column_index_.null_pages.size(); non_null_page_indices_.emplace_back(page_ordinal); - column_index_.min_values.emplace_back(stats.min.value()); - column_index_.max_values.emplace_back(stats.max.value()); + column_index_.min_values.emplace_back(stats.min()); + column_index_.max_values.emplace_back(stats.max()); column_index_.null_pages.emplace_back(false); } else { /// This is a non-null page but it lacks of meaningful min/max values. diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 74e9de287fd..b92bd7cccc5 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -546,8 +546,8 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); - ASSERT_EQ(page_stats[i].min.value_or(""), column_index->encoded_min_values()[i]); - ASSERT_EQ(page_stats[i].max.value_or(""), column_index->encoded_max_values()[i]); + ASSERT_EQ(page_stats[i].min_.value_or(""), column_index->encoded_min_values()[i]); + ASSERT_EQ(page_stats[i].max_.value_or(""), column_index->encoded_max_values()[i]); if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); } @@ -813,8 +813,8 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_NE(nullptr, column_index); ASSERT_EQ(size_t{1}, column_index->null_pages().size()); ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); - ASSERT_EQ(stats.min, column_index->encoded_min_values()[0]); - ASSERT_EQ(stats.max, column_index->encoded_max_values()[0]); + ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]); + ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); if (stats.null_count) { ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]); diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 42574e54323..34cc529e0f6 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -604,8 +605,8 @@ class TypedStatisticsImpl : public TypedStatistics { // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, - const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, MemoryPool* pool) : TypedStatisticsImpl(descr, encoded_min, encoded_max, num_values, null_count, @@ -614,37 +615,35 @@ class TypedStatisticsImpl : public TypedStatistics { /*is_max_value_exact=*/std::nullopt, pool) {} TypedStatisticsImpl(const ColumnDescriptor* descr, - const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::IncrementNumValues(num_values); - if (null_count) { - TypedStatisticsImpl::IncrementNullCount(*null_count); + if (null_count.has_value()) { + TypedStatisticsImpl::IncrementNullCount(null_count.value()); } else { statistics_.null_count = std::nullopt; } - if (distinct_count) { - SetDistinctCount(*distinct_count); + if (distinct_count.has_value()) { + SetDistinctCount(distinct_count.value()); } else { statistics_.distinct_count = std::nullopt; } - if (encoded_min.has_value()) { - T min_val; - PlainDecode(encoded_min.value(), &min_val); + if (encoded_min.has_value() && encoded_max.has_value()) { + T min, max; + PlainDecode(encoded_min.value(), &min); + PlainDecode(encoded_max.value(), &max); min_.emplace(); - Copy(min_val, &min_.value(), min_buffer_.get()); - statistics_.is_min_value_exact = is_min_value_exact; - } - if (encoded_max.has_value()) { - T max_val; - PlainDecode(encoded_max.value(), &max_val); max_.emplace(); - Copy(max_val, &max_.value(), max_buffer_.get()); + // Copy the decoded result to avoid dangling pointer problem + Copy(min, &min_.value(), min_buffer_.get()); + Copy(max, &max_.value(), max_buffer_.get()); + statistics_.is_min_value_exact = is_min_value_exact; statistics_.is_max_value_exact = is_max_value_exact; } } @@ -683,7 +682,8 @@ class TypedStatisticsImpl : public TypedStatistics { } const auto& other = checked_cast(raw_other); - if (!MinMaxEqual(other)) return false; + if (this->HasMinMax() != other.HasMinMax()) return false; + if (this->HasMinMax() && !MinMaxEqual(other)) return false; return null_count() == other.null_count() && distinct_count() == other.distinct_count() && @@ -697,47 +697,17 @@ class TypedStatisticsImpl : public TypedStatistics { void Reset() override { ResetCounts(); ResetHasFlags(); - min_ = std::nullopt; - max_ = std::nullopt; } void SetMinMax(const T& arg_min, const T& arg_max) override { SetMinMaxPair({arg_min, arg_max}); } - void SetMinMax(std::optional arg_min, std::optional arg_max) override { - if (arg_min.has_value() && arg_max.has_value()) { - SetMinMaxPair({arg_min.value(), arg_max.value()}); - } else if (arg_min.has_value()) { - // Only min is provided. - if (!min_.has_value()) { - min_.emplace(); - Copy(arg_min.value(), &min_.value(), min_buffer_.get()); - } else if (comparator_) { - Copy(comparator_->Compare(min_.value(), arg_min.value()) ? min_.value() - : arg_min.value(), - &min_.value(), min_buffer_.get()); - } - statistics_.is_min_value_exact = true; - } else if (arg_max.has_value()) { - // Only max is provided. - if (!max_.has_value()) { - max_.emplace(); - Copy(arg_max.value(), &max_.value(), max_buffer_.get()); - } else if (comparator_) { - Copy(comparator_->Compare(max_.value(), arg_max.value()) ? arg_max.value() - : max_.value(), - &max_.value(), max_buffer_.get()); - } - statistics_.is_max_value_exact = true; - } - } - void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); // null_count is valid only if both sides have it. if (this->HasNullCount() && other.HasNullCount()) { - this->statistics_.null_count = this->statistics_.null_count.value() + other.null_count(); + this->statistics_.null_count.value() += other.null_count(); } else { this->statistics_.null_count = std::nullopt; } @@ -777,9 +747,9 @@ class TypedStatisticsImpl : public TypedStatistics { SetMinMaxPair(comparator_->GetMinMax(values)); } - std::optional min() const override { return min_; } + const T& min() const override { return min_.value(); } - std::optional max() const override { return max_; } + const T& max() const override { return max_.value(); } Type::type physical_type() const override { return descr_->physical_type(); } @@ -787,24 +757,22 @@ class TypedStatisticsImpl : public TypedStatistics { std::string EncodeMin() const override { std::string s; - if (min_.has_value()) this->PlainEncode(min_.value(), &s); + if (HasMinMax()) this->PlainEncode(min_.value(), &s); return s; } std::string EncodeMax() const override { std::string s; - if (max_.has_value()) this->PlainEncode(max_.value(), &s); + if (HasMinMax()) this->PlainEncode(max_.value(), &s); return s; } EncodedStatistics Encode() override { EncodedStatistics s; - if (min_.has_value()) { + if (HasMinMax()) { s.set_min(this->EncodeMin()); - s.is_min_value_exact = this->is_min_value_exact(); - } - if (max_.has_value()) { s.set_max(this->EncodeMax()); + s.is_min_value_exact = this->is_min_value_exact(); s.is_max_value_exact = this->is_max_value_exact(); } if (HasNullCount()) { @@ -847,7 +815,7 @@ class TypedStatisticsImpl : public TypedStatistics { LogicalType::Type::type logical_type_ = LogicalType::Type::NONE; void PlainEncode(const T& src, std::string* dst) const; - void PlainDecode(const std::string& src, T* dst) const; + void PlainDecode(std::string_view src, T* dst) const; void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; } @@ -863,6 +831,9 @@ class TypedStatisticsImpl : public TypedStatistics { } void ResetHasFlags() { + // min_ and max_ will only be set when it meets any valid value. + this->min_ = std::nullopt; + this->max_ = std::nullopt; // distinct_count will only be set once SetDistinctCount() // is called because distinct count calculation is not cheap and // disabled by default. @@ -975,9 +946,9 @@ void TypedStatisticsImpl::PlainEncode(const T& src, std::string* dst) con } template -void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { +void TypedStatisticsImpl::PlainDecode(std::string_view src, T* dst) const { auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_); - decoder->SetData(1, reinterpret_cast(src.c_str()), + decoder->SetData(1, reinterpret_cast(src.data()), static_cast(src.size())); int decoded_values = decoder->Decode(dst, 1); if (decoded_values != 1) { @@ -992,18 +963,9 @@ void TypedStatisticsImpl::PlainEncode(const T& src, } template <> -void TypedStatisticsImpl::PlainDecode(const std::string& src, T* dst) const { - if (src.size() != static_cast(descr_->type_length())) { - throw ParquetException("Size of encoded FLBA does not match descriptor"); - } - dst->ptr = reinterpret_cast(src.c_str()); -} - -template <> -void TypedStatisticsImpl::PlainDecode(const std::string& src, - T* dst) const { +void TypedStatisticsImpl::PlainDecode(std::string_view src, T* dst) const { dst->len = static_cast(src.size()); - dst->ptr = reinterpret_cast(src.c_str()); + dst->ptr = reinterpret_cast(src.data()); } std::shared_ptr DoMakeComparator(Type::type physical_type, @@ -1131,11 +1093,13 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } -std::shared_ptr Statistics::Make( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, - std::optional null_count, std::optional distinct_count, - ::arrow::MemoryPool* pool) { +std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, + std::optional encoded_min, + std::optional encoded_max, + int64_t num_values, + std::optional null_count, + std::optional distinct_count, + ::arrow::MemoryPool* pool) { return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, distinct_count, /*is_min_value_exact=*/std::nullopt, @@ -1143,8 +1107,8 @@ std::shared_ptr Statistics::Make( } std::shared_ptr Statistics::Make( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index a2f5dc4cedd..15a8ad7bb25 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "parquet/platform.h" @@ -124,8 +125,10 @@ class PARQUET_EXPORT EncodedStatistics { public: EncodedStatistics() = default; - std::optional max; - std::optional min; + std::optional max_, min_; + + const std::string& max() const { return max_.value(); } + const std::string& min() const { return min_.value(); } std::optional is_max_value_exact; std::optional is_min_value_exact; @@ -145,24 +148,24 @@ class PARQUET_EXPORT EncodedStatistics { // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. void ApplyStatSizeLimits(size_t length) { - if (this->max.has_value() && this->max->length() > length) { - this->max = std::nullopt; + if (max_.has_value() && max_->length() > length) { + max_ = std::nullopt; is_max_value_exact = std::nullopt; } - if (this->min.has_value() && this->min->length() > length) { - this->min = std::nullopt; + if (min_.has_value() && min_->length() > length) { + min_ = std::nullopt; is_min_value_exact = std::nullopt; } } // Clear Min Max. void ClearMinMax() { - this->max = std::nullopt; - this->min = std::nullopt; + max_ = std::nullopt; + min_ = std::nullopt; } bool is_set() const { - return this->min.has_value() || this->max.has_value() || null_count.has_value() || + return min_.has_value() || max_.has_value() || null_count.has_value() || distinct_count.has_value(); } @@ -171,12 +174,12 @@ class PARQUET_EXPORT EncodedStatistics { void set_is_signed(bool is_signed) { is_signed_ = is_signed; } EncodedStatistics& set_max(std::string value) { - this->max = std::move(value); + max_ = std::move(value); return *this; } EncodedStatistics& set_min(std::string value) { - this->min = std::move(value); + min_ = std::move(value); return *this; } @@ -214,8 +217,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] distinct_count number of distinct values /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -231,8 +234,8 @@ class PARQUET_EXPORT Statistics { /// \param[in] is_max_value_exact whether the max value is exact /// \param[in] pool a memory pool to use for any memory allocations, optional static std::shared_ptr Make( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); @@ -306,10 +309,10 @@ class TypedStatistics : public Statistics { using T = typename DType::c_type; /// \brief The current minimum value - virtual std::optional min() const = 0; + virtual const T& min() const = 0; /// \brief The current maximum value - virtual std::optional max() const = 0; + virtual const T& max() const = 0; /// \brief Update state with state of another Statistics object virtual void Merge(const TypedStatistics& other) = 0; @@ -346,9 +349,6 @@ class TypedStatistics : public Statistics { /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; - /// \brief Set min and max values to particular values, where min is optional - virtual void SetMinMax(std::optional min, std::optional max) = 0; - /// \brief Increments the null count directly /// Use Update to extract the null count from data. Use this if you determine /// the null count through some other means (e.g. dictionary arrays where the @@ -395,8 +395,8 @@ std::shared_ptr> MakeStatistics(const typename DType::c_t /// \brief Typed version of Statistics::Make template std::shared_ptr> MakeStatistics( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { return std::static_pointer_cast>(Statistics::Make( @@ -407,8 +407,8 @@ std::shared_ptr> MakeStatistics( /// \brief Typed version of Statistics::Make template std::shared_ptr> MakeStatistics( - const ColumnDescriptor* descr, const std::optional& encoded_min, - const std::optional& encoded_max, int64_t num_values, + const ColumnDescriptor* descr, std::optional encoded_min, + std::optional encoded_max, int64_t num_values, std::optional null_count, std::optional distinct_count, std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 48751014174..09db338c6d5 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -470,10 +470,10 @@ class TestStatistics : public PrimitiveTypedTest { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); EXPECT_EQ(null_count, enc_stats->null_count); - EXPECT_TRUE(enc_stats->min.has_value()); - EXPECT_TRUE(enc_stats->max.has_value()); - EXPECT_EQ(std::make_optional(expected_stats->EncodeMin()), enc_stats->min); - EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max); + EXPECT_TRUE(enc_stats->min_.has_value()); + EXPECT_TRUE(enc_stats->max_.has_value()); + EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min()); + EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max()); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true)); } @@ -561,11 +561,11 @@ void TestStatistics::TestMinMaxEncode() { // encoded is same as unencoded ASSERT_EQ(encoded_min, - std::string(reinterpret_cast(statistics1->min().value().ptr), - statistics1->min().value().len)); + std::string(reinterpret_cast(statistics1->min().ptr), + statistics1->min().len)); ASSERT_EQ(encoded_max, - std::string(reinterpret_cast(statistics1->max().value().ptr), - statistics1->max().value().len)); + std::string(reinterpret_cast(statistics1->max().ptr), + statistics1->max().len)); auto statistics2 = MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), @@ -710,8 +710,8 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/this->values_.size()); auto encoded_stats1 = statistics1->Encode(); EXPECT_FALSE(statistics1->HasMinMax()); - EXPECT_FALSE(encoded_stats1.min.has_value()); - EXPECT_FALSE(encoded_stats1.max.has_value()); + EXPECT_FALSE(encoded_stats1.min_.has_value()); + EXPECT_FALSE(encoded_stats1.max_.has_value()); EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt); EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt); } @@ -722,16 +722,16 @@ class TestStatisticsHasFlag : public TestStatistics { statistics2->Update(this->values_ptr_, this->values_.size(), 0); auto encoded_stats2 = statistics2->Encode(); EXPECT_TRUE(statistics2->HasMinMax()); - EXPECT_TRUE(encoded_stats2.min.has_value()); - EXPECT_TRUE(encoded_stats2.max.has_value()); + EXPECT_TRUE(encoded_stats2.min_.has_value()); + EXPECT_TRUE(encoded_stats2.max_.has_value()); EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true)); } VerifyMergedStatistics(*statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasMinMax()); - EXPECT_TRUE(merged_statistics->Encode().min.has_value()); - EXPECT_TRUE(merged_statistics->Encode().max.has_value()); + EXPECT_TRUE(merged_statistics->Encode().min_.has_value()); + EXPECT_TRUE(merged_statistics->Encode().max_.has_value()); EXPECT_EQ(merged_statistics->Encode().is_min_value_exact, std::make_optional(true)); EXPECT_EQ(merged_statistics->Encode().is_max_value_exact, @@ -771,7 +771,7 @@ class TestStatisticsHasFlag : public TestStatistics { std::shared_ptr> statistics2; { EncodedStatistics encoded_statistics2; - encoded_statistics2.null_count.reset(); + encoded_statistics2.null_count = std::nullopt; statistics2 = std::dynamic_pointer_cast>( Statistics::Make(this->schema_.Column(0), &encoded_statistics2, /*num_values=*/1000)); @@ -800,8 +800,8 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_FALSE(encoded.all_null_value); EXPECT_FALSE(encoded.null_count.has_value()); EXPECT_FALSE(encoded.distinct_count.has_value()); - EXPECT_FALSE(encoded.min.has_value()); - EXPECT_FALSE(encoded.max.has_value()); + EXPECT_FALSE(encoded.min_.has_value()); + EXPECT_FALSE(encoded.max_.has_value()); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); EXPECT_FALSE(encoded.is_max_value_exact.has_value()); } @@ -998,9 +998,8 @@ class TestStatisticsSortOrder : public ::testing::Test { ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min, - std::make_optional(cc_metadata->statistics()->EncodeMin())); - EXPECT_EQ(stats_[i].max, cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true)); EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true)); } @@ -1240,7 +1239,7 @@ void TestByteArrayStatisticsFromArrow() { auto stats = MakeStatistics(&descr); ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); - ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min().value()); + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); ASSERT_EQ(2, stats->null_count()); } @@ -1417,12 +1416,12 @@ class TestFloatStatistics : public ::testing::Test { stats->Update(values.data(), values.size(), /*null_count=*/0); ASSERT_TRUE(stats->HasMinMax()); - this->CheckEq(stats->min().value(), positive_zero_); - ASSERT_TRUE(this->signbit(stats->min().value())); + this->CheckEq(stats->min(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->min())); ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); - this->CheckEq(stats->max().value(), positive_zero_); - ASSERT_FALSE(this->signbit(stats->max().value())); + this->CheckEq(stats->max(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->max())); ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_)); } @@ -1653,8 +1652,8 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); ASSERT_TRUE(enc_stats->null_count.has_value()); ASSERT_FALSE(enc_stats->distinct_count.has_value()); - ASSERT_FALSE(enc_stats->min.has_value()); - ASSERT_FALSE(enc_stats->max.has_value()); + ASSERT_FALSE(enc_stats->min_.has_value()); + ASSERT_FALSE(enc_stats->max_.has_value()); ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); @@ -1721,32 +1720,32 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { column_chunk->encoded_statistics(); ASSERT_TRUE(encoded_statistics != NULL); ASSERT_EQ(0, encoded_statistics->null_count); - EXPECT_EQ(std::make_optional("Al"), encoded_statistics->min); + EXPECT_EQ("Al", encoded_statistics->min()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value()); switch (num_column) { case 2: // Max couldn't truncate the utf-8 string longer than 2 bytes - EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max.value()); + EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 3: // Max couldn't truncate 0xFFFF binary string - EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max.value()); + EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 4: case 5: // Min and Max are not truncated, fit on 2 bytes - EXPECT_EQ("Ke", encoded_statistics->max.value()); + EXPECT_EQ("Ke", encoded_statistics->max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.value()); break; default: // Max truncated to 2 bytes on columns 0 and 1 - EXPECT_EQ("Kf", encoded_statistics->max.value()); + EXPECT_EQ("Kf", encoded_statistics->max()); ASSERT_FALSE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); } @@ -1756,12 +1755,12 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { TEST(TestEncodedStatistics, CopySafe) { EncodedStatistics encoded_statistics; encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.max.has_value()); + ASSERT_TRUE(encoded_statistics.max_.has_value()); encoded_statistics.is_max_value_exact = true; ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value()); encoded_statistics.set_min("abc"); - ASSERT_TRUE(encoded_statistics.min.has_value()); + ASSERT_TRUE(encoded_statistics.min_.has_value()); encoded_statistics.is_min_value_exact = true; ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value()); @@ -1771,8 +1770,8 @@ TEST(TestEncodedStatistics, CopySafe) { copy_statistics.is_max_value_exact = false; copy_statistics.is_min_value_exact = false; - EXPECT_EQ(std::make_optional("abc"), encoded_statistics.min); - EXPECT_EQ("abc", encoded_statistics.max.value()); + EXPECT_EQ("abc", encoded_statistics.min()); + EXPECT_EQ("abc", encoded_statistics.max()); EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true)); } @@ -1780,16 +1779,16 @@ TEST(TestEncodedStatistics, CopySafe) { TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EncodedStatistics encoded_statistics; encoded_statistics.set_min("a"); - ASSERT_TRUE(encoded_statistics.min.has_value()); + ASSERT_TRUE(encoded_statistics.min_.has_value()); encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.max.has_value()); + ASSERT_TRUE(encoded_statistics.max_.has_value()); encoded_statistics.ApplyStatSizeLimits(2); - ASSERT_TRUE(encoded_statistics.min.has_value()); - ASSERT_EQ(std::make_optional("a"), encoded_statistics.min); - ASSERT_FALSE(encoded_statistics.max.has_value()); + ASSERT_TRUE(encoded_statistics.min_.has_value()); + ASSERT_EQ("a", encoded_statistics.min()); + ASSERT_FALSE(encoded_statistics.max_.has_value()); NodePtr node = PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY); diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 8beead7bc1f..06281e8a2c0 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -469,33 +469,33 @@ static inline format::GeospatialStatistics ToThrift( static inline format::Statistics ToThrift(const EncodedStatistics& stats) { format::Statistics statistics; - if (stats.min.has_value()) { - statistics.__set_min_value(stats.min.value()); + if (stats.min_.has_value()) { + statistics.__set_min_value(stats.min()); if (stats.is_min_value_exact.has_value()) { statistics.__set_is_min_value_exact(stats.is_min_value_exact.value()); } // If the order is SIGNED, then the old min value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_min(stats.min.value()); + statistics.__set_min(stats.min()); } } - if (stats.max.has_value()) { - statistics.__set_max_value(stats.max.value()); + if (stats.max_.has_value()) { + statistics.__set_max_value(stats.max()); if (stats.is_max_value_exact.has_value()) { statistics.__set_is_max_value_exact(stats.is_max_value_exact.value()); } // If the order is SIGNED, then the old max value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_max(stats.max.value()); + statistics.__set_max(stats.max()); } } if (stats.null_count) { - statistics.__set_null_count(*stats.null_count); + statistics.__set_null_count(stats.null_count.value()); } if (stats.distinct_count) { - statistics.__set_distinct_count(*stats.distinct_count); + statistics.__set_distinct_count(stats.distinct_count.value()); } return statistics; From 26dcb88fe376c99a4e08f771cf16c1835969e722 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Wed, 25 Feb 2026 21:34:57 -0300 Subject: [PATCH 07/11] fix: deprecate min()/max() --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- .../parquet/arrow/arrow_statistics_test.cc | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 24 +++--- cpp/src/parquet/file_deserialize_test.cc | 8 +- cpp/src/parquet/metadata.cc | 2 +- cpp/src/parquet/metadata_test.cc | 48 +++++------ cpp/src/parquet/page_index.cc | 8 +- cpp/src/parquet/page_index_test.cc | 4 +- cpp/src/parquet/printer.cc | 4 +- cpp/src/parquet/statistics.cc | 28 ++++-- cpp/src/parquet/statistics.h | 42 ++++++++- cpp/src/parquet/statistics_test.cc | 86 +++++++++---------- cpp/src/parquet/thrift_internal.h | 16 ++-- 13 files changed, 160 insertions(+), 114 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 6e0b1ce5b96..727202b6cd3 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -370,7 +370,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); - bool may_have_null = !statistics.HasNullCount() || statistics.null_count() > 0; + bool may_have_null = !statistics.HasNullCount() || statistics.NullCount() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { // If there are no non-null values, column `field_ref` in the fragment diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 27a76fd72be..6f7990af0d0 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -89,7 +89,7 @@ TEST_P(ParameterizedStatisticsTest, NoNullCountWrittenForRepeatedFields) { auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); std::shared_ptr metadata = parquet_reader->metadata(); std::shared_ptr stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics(); - EXPECT_EQ(stats->null_count(), GetParam().expected_null_count); + EXPECT_EQ(stats->NullCount(), GetParam().expected_null_count); EXPECT_EQ(stats->num_values(), GetParam().expected_value_count); ASSERT_TRUE(stats->HasMinMax()); EXPECT_EQ(stats->EncodeMin(), GetParam().expected_min); diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 12f36fe39cf..1a9806f1aa4 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -169,8 +169,8 @@ template Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - *min = ::arrow::MakeScalar(static_cast(statistics.min())); - *max = ::arrow::MakeScalar(static_cast(statistics.max())); + *min = ::arrow::MakeScalar(static_cast(statistics.Min().value())); + *max = ::arrow::MakeScalar(static_cast(statistics.Max().value())); return Status::OK(); } @@ -179,8 +179,8 @@ Status MakeMinMaxTypedScalar(const StatisticsType& statistics, std::shared_ptr type, std::shared_ptr<::arrow::Scalar>* min, std::shared_ptr<::arrow::Scalar>* max) { - ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min())); - ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max())); + ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.Min().value())); + ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.Max().value())); return Status::OK(); } @@ -227,8 +227,9 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), - logical_type, min, max); + return ExtractDecimalMinMaxFromInteger(statistics.Min().value(), + statistics.Max().value(), logical_type, min, + max); default: break; } @@ -252,8 +253,9 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), - logical_type, min, max); + return ExtractDecimalMinMaxFromInteger(statistics.Min().value(), + statistics.Max().value(), logical_type, min, + max); default: break; } @@ -384,13 +386,13 @@ void AttachStatistics(::arrow::ArrayData* data, } if (statistics) { if (statistics->HasDistinctCount()) { - array_statistics->distinct_count = statistics->distinct_count(); + array_statistics->distinct_count = statistics->DistinctCount().value(); } if (statistics->HasMinMax()) { const auto* typed_statistics = checked_cast*>(statistics.get()); - const ArrowCType min = typed_statistics->min(); - const ArrowCType max = typed_statistics->max(); + const ArrowCType min = typed_statistics->Min().value(); + const ArrowCType max = typed_statistics->Max().value(); if constexpr (std::is_same_v) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); diff --git a/cpp/src/parquet/file_deserialize_test.cc b/cpp/src/parquet/file_deserialize_test.cc index 7fa5e2f167e..f584fffe2bb 100644 --- a/cpp/src/parquet/file_deserialize_test.cc +++ b/cpp/src/parquet/file_deserialize_test.cc @@ -69,10 +69,10 @@ static inline void AddDummyStats(int stat_size, H& header, bool fill_all_stats = template static inline void CheckStatistics(const H& expected, const EncodedStatistics& actual) { if (expected.statistics.__isset.max) { - ASSERT_EQ(expected.statistics.max, actual.max()); + ASSERT_EQ(expected.statistics.max, actual.Max()); } if (expected.statistics.__isset.min) { - ASSERT_EQ(expected.statistics.min, actual.min()); + ASSERT_EQ(expected.statistics.min, actual.Min()); } if (expected.statistics.__isset.null_count) { ASSERT_EQ(expected.statistics.null_count, actual.null_count); @@ -513,8 +513,8 @@ TYPED_TEST(PageFilterTest, TestPageFilterCallback) { CheckDataPageHeader(this->data_page_headers_[i], current_page.get())); auto data_page = static_cast(current_page.get()); const EncodedStatistics encoded_statistics = data_page->statistics(); - ASSERT_EQ(read_stats[i].max(), encoded_statistics.max()); - ASSERT_EQ(read_stats[i].min(), encoded_statistics.min()); + ASSERT_EQ(read_stats[i].Max(), encoded_statistics.Max()); + ASSERT_EQ(read_stats[i].Min(), encoded_statistics.Min()); ASSERT_EQ(read_stats[i].null_count, encoded_statistics.null_count); ASSERT_EQ(read_stats[i].distinct_count, encoded_statistics.distinct_count); ASSERT_EQ(read_num_values[i], this->data_page_headers_[i].num_values); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 3053aef889d..45ccb02b8fe 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1626,7 +1626,7 @@ bool ApplicationVersion::HasCorrectStatistics(Type::type col_type, (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) { // Only SIGNED are valid unless max and min are the same // (in which case the sort order does not matter) - bool max_equals_min = statistics.min_.has_value() && statistics.max_.has_value() + bool max_equals_min = statistics.HasMin() && statistics.HasMax() ? statistics.min() == statistics.max() : false; if (SortOrder::SIGNED != sort_order && !max_equals_min) { diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179c..842594cb2c7 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -154,18 +154,18 @@ TEST(Metadata, TestBuildAccess) { auto rg1_column2 = rg1_accessor->ColumnChunk(1); ASSERT_EQ(true, rg1_column1->is_stats_set()); ASSERT_EQ(true, rg1_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg1_column2->encoded_statistics()->min()); - ASSERT_EQ(stats_float.max(), rg1_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); - ASSERT_EQ(0, rg1_column1->statistics()->null_count()); - ASSERT_EQ(0, rg1_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); + ASSERT_EQ(stats_float.Min(), rg1_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.Max(), rg1_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.Min(), rg1_column2->encoded_statistics()->Min()); + ASSERT_EQ(stats_float.Max(), rg1_column2->encoded_statistics()->Max()); + ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min()); + ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max()); + ASSERT_EQ(0, rg1_column1->statistics()->NullCount()); + ASSERT_EQ(0, rg1_column2->statistics()->NullCount()); + ASSERT_EQ(nrows, rg1_column1->statistics()->DistinctCount()); + ASSERT_EQ(nrows, rg1_column2->statistics()->DistinctCount()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); ASSERT_EQ(nrows / 2, rg1_column1->num_values()); @@ -205,18 +205,18 @@ TEST(Metadata, TestBuildAccess) { auto rg2_column2 = rg2_accessor->ColumnChunk(1); ASSERT_EQ(true, rg2_column1->is_stats_set()); ASSERT_EQ(true, rg2_column2->is_stats_set()); - ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); - ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); - ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); - ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); - ASSERT_EQ(stats_float.min(), rg2_column2->encoded_statistics()->min()); - ASSERT_EQ(stats_float.max(), rg2_column2->encoded_statistics()->max()); - ASSERT_EQ(stats_int.min(), rg1_column1->encoded_statistics()->min()); - ASSERT_EQ(stats_int.max(), rg1_column1->encoded_statistics()->max()); - ASSERT_EQ(0, rg2_column1->statistics()->null_count()); - ASSERT_EQ(0, rg2_column2->statistics()->null_count()); - ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); - ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); + ASSERT_EQ(stats_float.Min(), rg2_column2->statistics()->EncodeMin()); + ASSERT_EQ(stats_float.Max(), rg2_column2->statistics()->EncodeMax()); + ASSERT_EQ(stats_int.Min(), rg1_column1->statistics()->EncodeMin()); + ASSERT_EQ(stats_int.Max(), rg1_column1->statistics()->EncodeMax()); + ASSERT_EQ(stats_float.Min(), rg2_column2->encoded_statistics()->Min()); + ASSERT_EQ(stats_float.Max(), rg2_column2->encoded_statistics()->Max()); + ASSERT_EQ(stats_int.Min(), rg1_column1->encoded_statistics()->Min()); + ASSERT_EQ(stats_int.Max(), rg1_column1->encoded_statistics()->Max()); + ASSERT_EQ(0, rg2_column1->statistics()->NullCount()); + ASSERT_EQ(0, rg2_column2->statistics()->NullCount()); + ASSERT_EQ(nrows, rg2_column1->statistics()->DistinctCount()); + ASSERT_EQ(nrows, rg2_column2->statistics()->DistinctCount()); ASSERT_EQ(nrows / 2, rg2_column1->num_values()); ASSERT_EQ(nrows / 2, rg2_column2->num_values()); ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 0f90c45358d..b850a923a4c 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -511,11 +511,11 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { column_index_.null_pages.emplace_back(true); column_index_.min_values.emplace_back(""); column_index_.max_values.emplace_back(""); - } else if (stats.min_.has_value() && stats.max_.has_value()) { + } else if (stats.HasMin() && stats.HasMax()) { const size_t page_ordinal = column_index_.null_pages.size(); non_null_page_indices_.emplace_back(page_ordinal); - column_index_.min_values.emplace_back(stats.min()); - column_index_.max_values.emplace_back(stats.max()); + column_index_.min_values.emplace_back(stats.Min().value()); + column_index_.max_values.emplace_back(stats.Max().value()); column_index_.null_pages.emplace_back(false); } else { /// This is a non-null page but it lacks of meaningful min/max values. @@ -524,7 +524,7 @@ class ColumnIndexBuilderImpl final : public ColumnIndexBuilder { return; } - if (column_index_.__isset.null_counts && stats.null_count.has_value()) { + if (column_index_.__isset.null_counts && stats.HasNullCount()) { column_index_.null_counts.emplace_back(stats.null_count.value()); } else { column_index_.__isset.null_counts = false; diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index b92bd7cccc5..6446547baff 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -813,8 +813,8 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_NE(nullptr, column_index); ASSERT_EQ(size_t{1}, column_index->null_pages().size()); ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); - ASSERT_EQ(stats.min(), column_index->encoded_min_values()[0]); - ASSERT_EQ(stats.max(), column_index->encoded_max_values()[0]); + ASSERT_EQ(stats.Min(), column_index->encoded_min_values()[0]); + ASSERT_EQ(stats.Max(), column_index->encoded_max_values()[0]); ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); if (stats.null_count) { ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]); diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 8c46373a38a..0d07196f825 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -335,11 +335,11 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list selected if (column_chunk->is_stats_set()) { stream << R"("True", "Stats": {)"; if (stats->HasNullCount()) { - stream << R"("NumNulls": ")" << stats->null_count() << "\""; + stream << R"("NumNulls": ")" << stats->NullCount().value() << "\""; } if (stats->HasDistinctCount()) { stream << ", " - << R"("DistinctValues": ")" << stats->distinct_count() << "\""; + << R"("DistinctValues": ")" << stats->DistinctCount().value() << "\""; } if (stats->HasMinMax()) { std::string min = stats->EncodeMin(), max = stats->EncodeMax(); diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 34cc529e0f6..6be427fa4bd 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -651,11 +651,18 @@ class TypedStatisticsImpl : public TypedStatistics { bool HasDistinctCount() const override { return statistics_.distinct_count.has_value(); }; - bool HasMinMax() const override { return min_.has_value() && max_.has_value(); } + bool HasMin() const override { return min_.has_value(); } + bool HasMax() const override { return max_.has_value(); } + bool HasMinMax() const override { return HasMin() && HasMax(); } bool HasNullCount() const override { return statistics_.null_count.has_value(); }; + std::optional NullCount() const override { return statistics_.null_count; } + std::optional DistinctCount() const override { + return statistics_.distinct_count; + } + void IncrementNullCount(int64_t n) override { - statistics_.null_count = statistics_.null_count.value_or(0) + n; + statistics_.null_count = NullCount().value_or(0) + n; } void IncrementNumValues(int64_t n) override { num_values_ += n; } @@ -685,8 +692,7 @@ class TypedStatisticsImpl : public TypedStatistics { if (this->HasMinMax() != other.HasMinMax()) return false; if (this->HasMinMax() && !MinMaxEqual(other)) return false; - return null_count() == other.null_count() && - distinct_count() == other.distinct_count() && + return NullCount() == other.NullCount() && DistinctCount() == other.DistinctCount() && num_values() == other.num_values() && is_min_value_exact() == other.is_min_value_exact() && is_max_value_exact() == other.is_max_value_exact(); @@ -707,12 +713,12 @@ class TypedStatisticsImpl : public TypedStatistics { this->num_values_ += other.num_values(); // null_count is valid only if both sides have it. if (this->HasNullCount() && other.HasNullCount()) { - this->statistics_.null_count.value() += other.null_count(); + this->statistics_.null_count = NullCount().value() + other.NullCount().value(); } else { this->statistics_.null_count = std::nullopt; } if (this->HasDistinctCount() && other.HasDistinctCount() && - (distinct_count() == 0 || other.distinct_count() == 0)) { + (DistinctCount().value() == 0 || other.DistinctCount().value() == 0)) { // We can merge distinct counts if either side is zero. statistics_.distinct_count = std::max(statistics_.DistinctCount().value(), other.DistinctCount().value()); @@ -724,7 +730,7 @@ class TypedStatisticsImpl : public TypedStatistics { // min/max which may happen when other is an empty stats or all // its values are null and/or NaN. if (other.HasMinMax()) { - SetMinMax(other.min(), other.max()); + SetMinMax(other.Min().value(), other.Max().value()); } } @@ -749,8 +755,12 @@ class TypedStatisticsImpl : public TypedStatistics { const T& min() const override { return min_.value(); } + std::optional Min() const override { return min_; } + const T& max() const override { return max_.value(); } + std::optional Max() const override { return max_; } + Type::type physical_type() const override { return descr_->physical_type(); } const ColumnDescriptor* descr() const override { return descr_; } @@ -776,12 +786,12 @@ class TypedStatisticsImpl : public TypedStatistics { s.is_max_value_exact = this->is_max_value_exact(); } if (HasNullCount()) { - s.set_null_count(this->null_count()); + s.set_null_count(this->NullCount().value()); // num_values_ is reliable and it means number of non-null values. s.all_null_value = num_values_ == 0; } if (HasDistinctCount()) { - s.set_distinct_count(this->distinct_count()); + s.set_distinct_count(this->DistinctCount().value()); } return s; } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 15a8ad7bb25..3b3aeeeb803 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -127,15 +127,28 @@ class PARQUET_EXPORT EncodedStatistics { std::optional max_, min_; + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead") const std::string& max() const { return max_.value(); } + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead") const std::string& min() const { return min_.value(); } + std::optional Max() const { return max_; } + std::optional Min() const { return min_; } + std::optional DistinctCount() const { return distinct_count; } + std::optional NullCount() const { return null_count; } + + bool HasMax() const { return max_.has_value(); } + bool HasMin() const { return min_.has_value(); } + std::optional is_max_value_exact; std::optional is_min_value_exact; std::optional null_count; std::optional distinct_count; + bool HasNullCount() const { return null_count.has_value(); } + bool HasDistinctCount() const { return distinct_count.has_value(); } + // When all values in the statistics are null, it is set to true. // Otherwise, at least one value is not null, or we are not sure at all. // Page index requires this information to decide whether a data page @@ -148,11 +161,11 @@ class PARQUET_EXPORT EncodedStatistics { // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. void ApplyStatSizeLimits(size_t length) { - if (max_.has_value() && max_->length() > length) { + if (HasMax() && max_->length() > length) { max_ = std::nullopt; is_max_value_exact = std::nullopt; } - if (min_.has_value() && min_->length() > length) { + if (HasMin() && min_->length() > length) { min_ = std::nullopt; is_min_value_exact = std::nullopt; } @@ -165,8 +178,7 @@ class PARQUET_EXPORT EncodedStatistics { } bool is_set() const { - return min_.has_value() || max_.has_value() || null_count.has_value() || - distinct_count.has_value(); + return HasMin() || HasMax() || HasNullCount() || HasDistinctCount(); } bool is_signed() const { return is_signed_; } @@ -252,17 +264,31 @@ class PARQUET_EXPORT Statistics { virtual bool HasNullCount() const = 0; /// \brief The number of null values, may not be set + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use NullCount() instead") virtual int64_t null_count() const = 0; + /// \brief The number of null values, may not be set + virtual std::optional NullCount() const = 0; + /// \brief Return true if the count of distinct values is set virtual bool HasDistinctCount() const = 0; /// \brief The number of distinct values, may not be set + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use DistinctCount() instead") virtual int64_t distinct_count() const = 0; + /// \brief The number of distinct values, may not be set + virtual std::optional DistinctCount() const = 0; + /// \brief The number of non-null values in the column virtual int64_t num_values() const = 0; + /// \brief Return true if the minimum value statistic is set. + virtual bool HasMin() const = 0; + + /// \brief Return true if the maximum value statistic is set. + virtual bool HasMax() const = 0; + /// \brief Return true if both min and max statistics are set. Obtain /// with TypedStatistics::min and max virtual bool HasMinMax() const = 0; @@ -309,11 +335,19 @@ class TypedStatistics : public Statistics { using T = typename DType::c_type; /// \brief The current minimum value + PARQUET_DEPRECATED("Use Min() instead") virtual const T& min() const = 0; + /// \brief The current minimum value + virtual std::optional Min() const = 0; + /// \brief The current maximum value + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead") virtual const T& max() const = 0; + /// \brief The current maximum value + virtual std::optional Max() const = 0; + /// \brief Update state with state of another Statistics object virtual void Merge(const TypedStatistics& other) = 0; diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 09db338c6d5..8a40d1316d7 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -339,20 +339,20 @@ class TestStatistics : public PrimitiveTypedTest { /*null_count=*/0, /*distinct_count=*/0); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); - ASSERT_EQ(statistics1->min(), statistics2->min()); - ASSERT_EQ(statistics1->max(), statistics2->max()); + ASSERT_EQ(statistics1->Min(), statistics2->Min()); + ASSERT_EQ(statistics1->Max(), statistics2->Max()); ASSERT_EQ(statistics1->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics1->is_max_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics2->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics2->is_max_value_exact(), std::make_optional(true)); ASSERT_EQ(encoded_min_spaced, statistics2->EncodeMin()); ASSERT_EQ(encoded_max_spaced, statistics2->EncodeMax()); - ASSERT_EQ(statistics3->min(), statistics2->min()); - ASSERT_EQ(statistics3->max(), statistics2->max()); + ASSERT_EQ(statistics3->Min(), statistics2->Min()); + ASSERT_EQ(statistics3->Max(), statistics2->Max()); ASSERT_EQ(statistics3->is_min_value_exact(), std::make_optional(true)); ASSERT_EQ(statistics3->is_max_value_exact(), std::make_optional(true)); - ASSERT_EQ(statistics4->min(), statistics2->min()); - ASSERT_EQ(statistics4->max(), statistics2->max()); + ASSERT_EQ(statistics4->Min(), statistics2->Min()); + ASSERT_EQ(statistics4->Max(), statistics2->Max()); ASSERT_EQ(statistics4->is_min_value_exact(), std::nullopt); ASSERT_EQ(statistics4->is_max_value_exact(), std::nullopt); } @@ -368,9 +368,9 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_TRUE(statistics->HasNullCount()); ASSERT_FALSE(statistics->HasMinMax()); ASSERT_FALSE(statistics->HasDistinctCount()); - ASSERT_EQ(0, statistics->null_count()); + ASSERT_EQ(std::make_optional(0), statistics->NullCount()); ASSERT_EQ(0, statistics->num_values()); - ASSERT_EQ(0, statistics->distinct_count()); + ASSERT_EQ(std::nullopt, statistics->DistinctCount()); ASSERT_EQ("", statistics->EncodeMin()); ASSERT_EQ("", statistics->EncodeMax()); } @@ -393,10 +393,10 @@ class TestStatistics : public PrimitiveTypedTest { total->Merge(*statistics1); total->Merge(*statistics2); - ASSERT_EQ(num_null[0] + num_null[1], total->null_count()); + ASSERT_EQ(num_null[0] + num_null[1], total->NullCount()); ASSERT_EQ(this->values_.size() * 2 - num_null[0] - num_null[1], total->num_values()); - ASSERT_EQ(total->min(), std::min(statistics1->min(), statistics2->min())); - ASSERT_EQ(total->max(), std::max(statistics1->max(), statistics2->max())); + ASSERT_EQ(total->Min(), std::min(statistics1->Min(), statistics2->Min())); + ASSERT_EQ(total->Max(), std::max(statistics1->Max(), statistics2->Max())); } void TestEquals() { @@ -462,7 +462,7 @@ class TestStatistics : public PrimitiveTypedTest { if (!column_chunk->is_stats_set()) return; std::shared_ptr stats = column_chunk->statistics(); // check values after serialization + deserialization - EXPECT_EQ(null_count, stats->null_count()); + EXPECT_EQ(null_count, stats->NullCount()); EXPECT_EQ(num_values - null_count, stats->num_values()); EXPECT_TRUE(expected_stats->HasMinMax()); EXPECT_EQ(expected_stats->EncodeMin(), stats->EncodeMin()); @@ -472,8 +472,8 @@ class TestStatistics : public PrimitiveTypedTest { EXPECT_EQ(null_count, enc_stats->null_count); EXPECT_TRUE(enc_stats->min_.has_value()); EXPECT_TRUE(enc_stats->max_.has_value()); - EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->min()); - EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->max()); + EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->Min()); + EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->Max()); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); EXPECT_EQ(enc_stats->is_max_value_exact, std::make_optional(true)); } @@ -561,11 +561,11 @@ void TestStatistics::TestMinMaxEncode() { // encoded is same as unencoded ASSERT_EQ(encoded_min, - std::string(reinterpret_cast(statistics1->min().ptr), - statistics1->min().len)); + std::string(reinterpret_cast(statistics1->Min()->ptr), + statistics1->Min()->len)); ASSERT_EQ(encoded_max, - std::string(reinterpret_cast(statistics1->max().ptr), - statistics1->max().len)); + std::string(reinterpret_cast(statistics1->Max()->ptr), + statistics1->Max()->len)); auto statistics2 = MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), @@ -576,8 +576,8 @@ void TestStatistics::TestMinMaxEncode() { ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); - ASSERT_EQ(statistics1->min(), statistics2->min()); - ASSERT_EQ(statistics1->max(), statistics2->max()); + ASSERT_EQ(statistics1->Min(), statistics2->Min()); + ASSERT_EQ(statistics1->Max(), statistics2->Max()); } using Types = ::testing::Types { } EncodedStatistics final_statistics = statistics->Encode(); EXPECT_EQ(statistics->HasDistinctCount(), - final_statistics.distinct_count.has_value()); + final_statistics.HasDistinctCount()); if (statistics->HasDistinctCount()) { - EXPECT_EQ(statistics->distinct_count(), final_statistics.distinct_count); - return statistics->distinct_count(); + EXPECT_EQ(statistics->DistinctCount(), final_statistics.distinct_count); + return statistics->DistinctCount(); } return std::nullopt; } @@ -754,14 +754,14 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/0); auto encoded_stats1 = statistics1->Encode(); EXPECT_TRUE(statistics1->HasNullCount()); - EXPECT_EQ(0, statistics1->null_count()); + EXPECT_EQ(0, statistics1->NullCount()); EXPECT_TRUE(statistics1->Encode().null_count.has_value()); } // Merge with null-count should also have null count VerifyMergedStatistics(*statistics1, *statistics1, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasNullCount()); - EXPECT_EQ(0, merged_statistics->null_count()); + EXPECT_EQ(0, merged_statistics->NullCount()); auto encoded = merged_statistics->Encode(); EXPECT_TRUE(encoded.null_count.has_value()); EXPECT_EQ(0, encoded.null_count); @@ -998,8 +998,8 @@ class TestStatisticsSortOrder : public ::testing::Test { ARROW_SCOPED_TRACE("Statistics for field #", i); std::shared_ptr cc_metadata = rg_metadata->ColumnChunk(i); - EXPECT_EQ(stats_[i].min(), cc_metadata->statistics()->EncodeMin()); - EXPECT_EQ(stats_[i].max(), cc_metadata->statistics()->EncodeMax()); + EXPECT_EQ(stats_[i].Min(), cc_metadata->statistics()->EncodeMin()); + EXPECT_EQ(stats_[i].Max(), cc_metadata->statistics()->EncodeMax()); EXPECT_EQ(stats_[i].is_max_value_exact, std::make_optional(true)); EXPECT_EQ(stats_[i].is_min_value_exact, std::make_optional(true)); } @@ -1239,9 +1239,9 @@ void TestByteArrayStatisticsFromArrow() { auto stats = MakeStatistics(&descr); ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); - ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); - ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); - ASSERT_EQ(2, stats->null_count()); + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->Min()); + ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->Max()); + ASSERT_EQ(2, stats->NullCount()); } TEST(TestByteArrayStatisticsFromArrow, StringType) { @@ -1416,12 +1416,12 @@ class TestFloatStatistics : public ::testing::Test { stats->Update(values.data(), values.size(), /*null_count=*/0); ASSERT_TRUE(stats->HasMinMax()); - this->CheckEq(stats->min(), positive_zero_); - ASSERT_TRUE(this->signbit(stats->min())); + this->CheckEq(stats->Min().value(), positive_zero_); + ASSERT_TRUE(this->signbit(stats->Min().value())); ASSERT_EQ(stats->EncodeMin(), EncodeValue(negative_zero_)); - this->CheckEq(stats->max(), positive_zero_); - ASSERT_FALSE(this->signbit(stats->max())); + this->CheckEq(stats->Max().value(), positive_zero_); + ASSERT_FALSE(this->signbit(stats->Max().value())); ASSERT_EQ(stats->EncodeMax(), EncodeValue(positive_zero_)); } @@ -1681,7 +1681,7 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) { std::shared_ptr stats = column_chunk->statistics(); ASSERT_TRUE(stats != NULL); - ASSERT_EQ(0, stats->null_count()); + ASSERT_EQ(0, stats->NullCount()); ASSERT_EQ(12, stats->num_values()); ASSERT_EQ(0x00, stats->EncodeMin()[0]); ASSERT_EQ(0x0b, stats->EncodeMax()[0]); @@ -1720,32 +1720,32 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { column_chunk->encoded_statistics(); ASSERT_TRUE(encoded_statistics != NULL); ASSERT_EQ(0, encoded_statistics->null_count); - EXPECT_EQ("Al", encoded_statistics->min()); + EXPECT_EQ("Al", encoded_statistics->Min()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.has_value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.has_value()); switch (num_column) { case 2: // Max couldn't truncate the utf-8 string longer than 2 bytes - EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->max()); + EXPECT_EQ("🚀Kevin Bacon", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 3: // Max couldn't truncate 0xFFFF binary string - EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->max()); + EXPECT_EQ("\xFF\xFF\x1\x2", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); break; case 4: case 5: // Min and Max are not truncated, fit on 2 bytes - EXPECT_EQ("Ke", encoded_statistics->max()); + EXPECT_EQ("Ke", encoded_statistics->Max()); ASSERT_TRUE(encoded_statistics->is_max_value_exact.value()); ASSERT_TRUE(encoded_statistics->is_min_value_exact.value()); break; default: // Max truncated to 2 bytes on columns 0 and 1 - EXPECT_EQ("Kf", encoded_statistics->max()); + EXPECT_EQ("Kf", encoded_statistics->Max()); ASSERT_FALSE(encoded_statistics->is_max_value_exact.value()); ASSERT_FALSE(encoded_statistics->is_min_value_exact.value()); } @@ -1770,8 +1770,8 @@ TEST(TestEncodedStatistics, CopySafe) { copy_statistics.is_max_value_exact = false; copy_statistics.is_min_value_exact = false; - EXPECT_EQ("abc", encoded_statistics.min()); - EXPECT_EQ("abc", encoded_statistics.max()); + EXPECT_EQ("abc", encoded_statistics.Min()); + EXPECT_EQ("abc", encoded_statistics.Max()); EXPECT_EQ(encoded_statistics.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_statistics.is_max_value_exact, std::make_optional(true)); } @@ -1787,7 +1787,7 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) { encoded_statistics.ApplyStatSizeLimits(2); ASSERT_TRUE(encoded_statistics.min_.has_value()); - ASSERT_EQ("a", encoded_statistics.min()); + ASSERT_EQ("a", encoded_statistics.Min()); ASSERT_FALSE(encoded_statistics.max_.has_value()); NodePtr node = diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 06281e8a2c0..4d5b184dc16 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -469,32 +469,32 @@ static inline format::GeospatialStatistics ToThrift( static inline format::Statistics ToThrift(const EncodedStatistics& stats) { format::Statistics statistics; - if (stats.min_.has_value()) { - statistics.__set_min_value(stats.min()); + if (stats.HasMin()) { + statistics.__set_min_value(stats.Min().value()); if (stats.is_min_value_exact.has_value()) { statistics.__set_is_min_value_exact(stats.is_min_value_exact.value()); } // If the order is SIGNED, then the old min value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_min(stats.min()); + statistics.__set_min(stats.Min().value()); } } - if (stats.max_.has_value()) { - statistics.__set_max_value(stats.max()); + if (stats.HasMax()) { + statistics.__set_max_value(stats.Max().value()); if (stats.is_max_value_exact.has_value()) { statistics.__set_is_max_value_exact(stats.is_max_value_exact.value()); } // If the order is SIGNED, then the old max value must be set too. // This for backward compatibility if (stats.is_signed()) { - statistics.__set_max(stats.max()); + statistics.__set_max(stats.Max().value()); } } - if (stats.null_count) { + if (stats.HasNullCount()) { statistics.__set_null_count(stats.null_count.value()); } - if (stats.distinct_count) { + if (stats.HasDistinctCount()) { statistics.__set_distinct_count(stats.distinct_count.value()); } From ef63fccd8df46d95ea36ed7f61451f031f54b827 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Wed, 25 Feb 2026 23:32:41 -0300 Subject: [PATCH 08/11] fix: return old methods --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- cpp/src/parquet/statistics.cc | 37 +++++++++++ cpp/src/parquet/statistics.h | 93 ++++++++++++++++++++++++--- cpp/src/parquet/statistics_test.cc | 2 +- 4 files changed, 123 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 727202b6cd3..974e2ae1258 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -370,7 +370,7 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr const parquet::Statistics& statistics) { auto field_expr = compute::field_ref(field_ref); - bool may_have_null = !statistics.HasNullCount() || statistics.NullCount() > 0; + bool may_have_null = !statistics.HasNullCount() || statistics.NullCount().value() > 0; // Optimize for corner case where all values are nulls if (statistics.num_values() == 0) { // If there are no non-null values, column `field_ref` in the fragment diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 6be427fa4bd..0183bff65ee 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -1103,6 +1103,19 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, encoded_stats->is_min_value_exact, encoded_stats->is_max_value_exact, pool); } +std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, + const std::string& encoded_min, + const std::string& encoded_max, + int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + bool has_null_count, bool has_distinct_count, + ::arrow::MemoryPool* pool) { + return Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, has_null_count, has_distinct_count, + /*is_min_value_exact=*/std::nullopt, + /*is_max_value_exact=*/std::nullopt, pool); +} + std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, std::optional encoded_min, std::optional encoded_max, @@ -1116,6 +1129,30 @@ std::shared_ptr Statistics::Make(const ColumnDescriptor* descr, /*is_max_value_exact=*/std::nullopt, pool); } +std::shared_ptr Statistics::Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, bool has_null_count, + bool has_distinct_count, std::optional is_min_value_exact, + std::optional is_max_value_exact, ::arrow::MemoryPool* pool) { + std::optional min = std::nullopt; + std::optional max = std::nullopt; + if (has_min_max) { + min = encoded_min; + max = encoded_max; + } + std::optional null_cnt = std::nullopt; + if (has_null_count) { + null_cnt = null_count; + } + std::optional distinct_cnt = std::nullopt; + if (has_distinct_count) { + distinct_cnt = distinct_count; + } + return Statistics::Make(descr, min, max, num_values, null_cnt, distinct_cnt, + is_min_value_exact, is_max_value_exact, pool); +} + std::shared_ptr Statistics::Make( const ColumnDescriptor* descr, std::optional encoded_min, std::optional encoded_max, int64_t num_values, diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 3b3aeeeb803..b962a4f15d0 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -126,10 +126,12 @@ class PARQUET_EXPORT EncodedStatistics { EncodedStatistics() = default; std::optional max_, min_; + std::optional null_count; + std::optional distinct_count; - PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead.") const std::string& max() const { return max_.value(); } - PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead.") const std::string& min() const { return min_.value(); } std::optional Max() const { return max_; } @@ -143,9 +145,6 @@ class PARQUET_EXPORT EncodedStatistics { std::optional is_max_value_exact; std::optional is_min_value_exact; - std::optional null_count; - std::optional distinct_count; - bool HasNullCount() const { return null_count.has_value(); } bool HasDistinctCount() const { return distinct_count.has_value(); } @@ -234,6 +233,27 @@ class PARQUET_EXPORT Statistics { std::optional null_count, std::optional distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema + /// definition and preexisting state + /// \param[in] descr the column schema + /// \param[in] encoded_min the encoded minimum value + /// \param[in] encoded_max the encoded maximum value + /// \param[in] num_values total number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + /// \param[in] has_min_max whether the min/max statistics are set + /// \param[in] has_null_count whether the null_count statistics are set + /// \param[in] has_distinct_count whether the distinct_count statistics are set + /// \param[in] pool a memory pool to use for any memory allocations, optional + /// \deprecated Deprecated in 24.0.0. Use std::optional version instead. + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") + static std::shared_ptr Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, bool has_null_count, + bool has_distinct_count, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema /// definition and preexisting state /// \param[in] descr the column schema @@ -252,6 +272,30 @@ class PARQUET_EXPORT Statistics { std::optional is_min_value_exact, std::optional is_max_value_exact, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + /// \brief Create a new statistics instance given a column schema + /// definition and preexisting state + /// \param[in] descr the column schema + /// \param[in] encoded_min the encoded minimum value + /// \param[in] encoded_max the encoded maximum value + /// \param[in] num_values total number of values + /// \param[in] null_count number of null values + /// \param[in] distinct_count number of distinct values + /// \param[in] has_min_max whether the min/max statistics are set + /// \param[in] has_null_count whether the null_count statistics are set + /// \param[in] has_distinct_count whether the distinct_count statistics are set + /// \param[in] is_min_value_exact whether the min value is exact + /// \param[in] is_max_value_exact whether the max value is exact + /// \param[in] pool a memory pool to use for any memory allocations, optional + /// \deprecated Deprecated in 24.0.0. Use std::optional version instead. + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") + static std::shared_ptr Make( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, bool has_null_count, + bool has_distinct_count, std::optional is_min_value_exact, + std::optional is_max_value_exact, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + // Helper function to convert EncodedStatistics to Statistics. // EncodedStatistics does not contain number of non-null values, and it can be // passed using the num_values parameter. @@ -264,7 +308,7 @@ class PARQUET_EXPORT Statistics { virtual bool HasNullCount() const = 0; /// \brief The number of null values, may not be set - PARQUET_DEPRECATED("Deprecated in 24.0.0. Use NullCount() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use NullCount() instead.") virtual int64_t null_count() const = 0; /// \brief The number of null values, may not be set @@ -274,7 +318,7 @@ class PARQUET_EXPORT Statistics { virtual bool HasDistinctCount() const = 0; /// \brief The number of distinct values, may not be set - PARQUET_DEPRECATED("Deprecated in 24.0.0. Use DistinctCount() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use DistinctCount() instead.") virtual int64_t distinct_count() const = 0; /// \brief The number of distinct values, may not be set @@ -335,14 +379,14 @@ class TypedStatistics : public Statistics { using T = typename DType::c_type; /// \brief The current minimum value - PARQUET_DEPRECATED("Use Min() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Min() instead.") virtual const T& min() const = 0; /// \brief The current minimum value virtual std::optional Min() const = 0; /// \brief The current maximum value - PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead") + PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead.") virtual const T& max() const = 0; /// \brief The current maximum value @@ -438,6 +482,20 @@ std::shared_ptr> MakeStatistics( /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); } +/// \brief Typed version of Statistics::Make +template +PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, bool has_null_count, + bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>(Statistics::Make( + descr, encoded_min, encoded_max, num_values, null_count, distinct_count, + has_min_max, has_null_count, has_distinct_count, + /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool)); +} + /// \brief Typed version of Statistics::Make template std::shared_ptr> MakeStatistics( @@ -450,4 +508,21 @@ std::shared_ptr> MakeStatistics( Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, distinct_count, is_min_value_exact, is_max_value_exact, pool)); } + +/// \brief Typed version of Statistics::Make +template +PARQUET_DEPRECATED("Deprecated in 24.0.0. Use std::optional version instead.") +std::shared_ptr> MakeStatistics( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, bool has_null_count, + bool has_distinct_count, std::optional is_min_value_exact, + std::optional is_max_value_exact, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::static_pointer_cast>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, has_null_count, has_distinct_count, + is_min_value_exact, is_max_value_exact, pool)); +} + } // namespace parquet diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8a40d1316d7..4e1c11c0353 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -755,7 +755,7 @@ class TestStatisticsHasFlag : public TestStatistics { auto encoded_stats1 = statistics1->Encode(); EXPECT_TRUE(statistics1->HasNullCount()); EXPECT_EQ(0, statistics1->NullCount()); - EXPECT_TRUE(statistics1->Encode().null_count.has_value()); + EXPECT_TRUE(statistics1->Encode().HasNullCount()); } // Merge with null-count should also have null count VerifyMergedStatistics(*statistics1, *statistics1, From 3719f01a2ee630d852e3e8aafb0a23bc99913890 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 26 Feb 2026 00:38:37 -0300 Subject: [PATCH 09/11] refactor --- cpp/src/parquet/arrow/reader_internal.cc | 10 +++--- cpp/src/parquet/column_writer_test.cc | 2 +- cpp/src/parquet/page_index_test.cc | 8 ++--- cpp/src/parquet/statistics_test.cc | 40 ++++++++++++------------ 4 files changed, 29 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 1a9806f1aa4..152f35f8e29 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -227,9 +227,8 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.Min().value(), - statistics.Max().value(), logical_type, min, - max); + return ExtractDecimalMinMaxFromInteger( + statistics.Min().value(), statistics.Max().value(), logical_type, min, max); default: break; } @@ -253,9 +252,8 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); case LogicalType::Type::DECIMAL: - return ExtractDecimalMinMaxFromInteger(statistics.Min().value(), - statistics.Max().value(), logical_type, min, - max); + return ExtractDecimalMinMaxFromInteger( + statistics.Min().value(), statistics.Max().value(), logical_type, min, max); default: break; } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index ce5af028eb9..3f11b9a1dbd 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -375,7 +375,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { auto metadata_accessor = ColumnChunkMetaData::Make( metadata_->contents(), this->descr_, default_reader_properties(), &app_version); auto encoded_stats = metadata_accessor->statistics()->Encode(); - return {encoded_stats.min_.has_value(), encoded_stats.max_.has_value()}; + return {encoded_stats.HasMin(), encoded_stats.HasMax()}; } std::vector metadata_encodings() { diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 6446547baff..b21265fbf0d 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -546,8 +546,8 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, for (size_t i = 0; i < num_pages; ++i) { ASSERT_EQ(page_stats[i].all_null_value, column_index->null_pages()[i]); - ASSERT_EQ(page_stats[i].min_.value_or(""), column_index->encoded_min_values()[i]); - ASSERT_EQ(page_stats[i].max_.value_or(""), column_index->encoded_max_values()[i]); + ASSERT_EQ(page_stats[i].Min().value_or(""), column_index->encoded_min_values()[i]); + ASSERT_EQ(page_stats[i].Max().value_or(""), column_index->encoded_max_values()[i]); if (has_null_counts) { ASSERT_EQ(page_stats[i].null_count, column_index->null_counts()[i]); } @@ -815,8 +815,8 @@ class PageIndexBuilderTest : public ::testing::Test { ASSERT_EQ(stats.all_null_value, column_index->null_pages()[0]); ASSERT_EQ(stats.Min(), column_index->encoded_min_values()[0]); ASSERT_EQ(stats.Max(), column_index->encoded_max_values()[0]); - ASSERT_EQ(stats.null_count.has_value(), column_index->has_null_counts()); - if (stats.null_count) { + ASSERT_EQ(stats.HasNullCount(), column_index->has_null_counts()); + if (stats.HasNullCount()) { ASSERT_EQ(stats.null_count.value(), column_index->null_counts()[0]); } } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 4e1c11c0353..c8c33959ef4 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -722,16 +722,16 @@ class TestStatisticsHasFlag : public TestStatistics { statistics2->Update(this->values_ptr_, this->values_.size(), 0); auto encoded_stats2 = statistics2->Encode(); EXPECT_TRUE(statistics2->HasMinMax()); - EXPECT_TRUE(encoded_stats2.min_.has_value()); - EXPECT_TRUE(encoded_stats2.max_.has_value()); + EXPECT_TRUE(encoded_stats2.HasMin()); + EXPECT_TRUE(encoded_stats2.HasMax()); EXPECT_EQ(encoded_stats2.is_min_value_exact, std::make_optional(true)); EXPECT_EQ(encoded_stats2.is_max_value_exact, std::make_optional(true)); } VerifyMergedStatistics(*statistics1, *statistics2, [](TypedStatistics* merged_statistics) { EXPECT_TRUE(merged_statistics->HasMinMax()); - EXPECT_TRUE(merged_statistics->Encode().min_.has_value()); - EXPECT_TRUE(merged_statistics->Encode().max_.has_value()); + EXPECT_TRUE(merged_statistics->Encode().HasMin()); + EXPECT_TRUE(merged_statistics->Encode().HasMax()); EXPECT_EQ(merged_statistics->Encode().is_min_value_exact, std::make_optional(true)); EXPECT_EQ(merged_statistics->Encode().is_max_value_exact, @@ -763,7 +763,7 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_TRUE(merged_statistics->HasNullCount()); EXPECT_EQ(0, merged_statistics->NullCount()); auto encoded = merged_statistics->Encode(); - EXPECT_TRUE(encoded.null_count.has_value()); + EXPECT_TRUE(encoded.HasNullCount()); EXPECT_EQ(0, encoded.null_count); }); @@ -775,7 +775,7 @@ class TestStatisticsHasFlag : public TestStatistics { statistics2 = std::dynamic_pointer_cast>( Statistics::Make(this->schema_.Column(0), &encoded_statistics2, /*num_values=*/1000)); - EXPECT_FALSE(statistics2->Encode().null_count.has_value()); + EXPECT_FALSE(statistics2->Encode().HasNullCount()); EXPECT_FALSE(statistics2->HasNullCount()); } @@ -798,10 +798,10 @@ class TestStatisticsHasFlag : public TestStatistics { EXPECT_FALSE(typed_stats->HasNullCount()); auto encoded = typed_stats->Encode(); EXPECT_FALSE(encoded.all_null_value); - EXPECT_FALSE(encoded.null_count.has_value()); - EXPECT_FALSE(encoded.distinct_count.has_value()); - EXPECT_FALSE(encoded.min_.has_value()); - EXPECT_FALSE(encoded.max_.has_value()); + EXPECT_FALSE(encoded.HasNullCount()); + EXPECT_FALSE(encoded.HasDistinctCount()); + EXPECT_FALSE(encoded.HasMin()); + EXPECT_FALSE(encoded.HasMax()); EXPECT_FALSE(encoded.is_min_value_exact.has_value()); EXPECT_FALSE(encoded.is_max_value_exact.has_value()); } @@ -1650,10 +1650,10 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_TRUE(column_chunk->is_stats_set()); std::shared_ptr enc_stats = column_chunk->encoded_statistics(); - ASSERT_TRUE(enc_stats->null_count.has_value()); - ASSERT_FALSE(enc_stats->distinct_count.has_value()); - ASSERT_FALSE(enc_stats->min_.has_value()); - ASSERT_FALSE(enc_stats->max_.has_value()); + ASSERT_TRUE(enc_stats->HasNullCount()); + ASSERT_FALSE(enc_stats->HasDistinctCount()); + ASSERT_FALSE(enc_stats->HasMin()); + ASSERT_FALSE(enc_stats->HasMax()); ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); @@ -1755,12 +1755,12 @@ TEST(TestEncodedStatistics, TruncatedMinMax) { TEST(TestEncodedStatistics, CopySafe) { EncodedStatistics encoded_statistics; encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.max_.has_value()); + ASSERT_TRUE(encoded_statistics.HasMax()); encoded_statistics.is_max_value_exact = true; ASSERT_TRUE(encoded_statistics.is_max_value_exact.has_value()); encoded_statistics.set_min("abc"); - ASSERT_TRUE(encoded_statistics.min_.has_value()); + ASSERT_TRUE(encoded_statistics.HasMin()); encoded_statistics.is_min_value_exact = true; ASSERT_TRUE(encoded_statistics.is_min_value_exact.has_value()); @@ -1779,16 +1779,16 @@ TEST(TestEncodedStatistics, CopySafe) { TEST(TestEncodedStatistics, ApplyStatSizeLimits) { EncodedStatistics encoded_statistics; encoded_statistics.set_min("a"); - ASSERT_TRUE(encoded_statistics.min_.has_value()); + ASSERT_TRUE(encoded_statistics.HasMin()); encoded_statistics.set_max("abc"); - ASSERT_TRUE(encoded_statistics.max_.has_value()); + ASSERT_TRUE(encoded_statistics.HasMax()); encoded_statistics.ApplyStatSizeLimits(2); - ASSERT_TRUE(encoded_statistics.min_.has_value()); + ASSERT_TRUE(encoded_statistics.HasMin()); ASSERT_EQ("a", encoded_statistics.Min()); - ASSERT_FALSE(encoded_statistics.max_.has_value()); + ASSERT_FALSE(encoded_statistics.HasMax()); NodePtr node = PrimitiveNode::Make("StringColumn", Repetition::REQUIRED, Type::BYTE_ARRAY); From 967939b2d1c64cf3ce8f1d2cab56bd395435dc5a Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 26 Feb 2026 09:15:33 -0300 Subject: [PATCH 10/11] reduce changes --- .../parquet/arrow/arrow_reader_writer_test.cc | 4 +-- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/statistics.cc | 11 +++---- cpp/src/parquet/statistics_test.cc | 33 +++++++++---------- 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index d1f4e7902b3..a4f0e7d344e 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -4735,8 +4735,8 @@ TEST_P(TestArrowWriteDictionary, Statistics) { auto expect_has_min_max = expected_has_min_max_by_page[case_index][row_group_index][page_index]; - EXPECT_EQ(stats.min_.has_value(), expect_has_min_max); - EXPECT_EQ(stats.max_.has_value(), expect_has_min_max); + EXPECT_EQ(stats.HasMin(), expect_has_min_max); + EXPECT_EQ(stats.HasMax(), expect_has_min_max); if (expect_has_min_max) { EXPECT_EQ(stats.min(), expected_min_by_page[case_index][row_group_index][page_index]); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 61aca8c8e58..4301c256155 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1084,7 +1084,7 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size, // page_stats.null_count is not set when page_statistics_ is nullptr. It is only used // here for safety check. - DCHECK(!page_stats.null_count.has_value() || page_stats.null_count == null_count); + DCHECK(!page_stats.HasNullCount() || page_stats.null_count == null_count); // Write the page to OutputStream eagerly if there is no dictionary or // if dictionary encoding has fallen back to PLAIN diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 0183bff65ee..1e360a0af2c 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -645,6 +645,9 @@ class TypedStatisticsImpl : public TypedStatistics { Copy(max, &max_.value(), max_buffer_.get()); statistics_.is_min_value_exact = is_min_value_exact; statistics_.is_max_value_exact = is_max_value_exact; + } else { + min_ = std::nullopt; + max_ = std::nullopt; } } @@ -882,13 +885,9 @@ inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); if (min_.has_value() != other.min_.has_value()) return false; - if (min_.has_value()) { - if (std::memcmp(min_.value().ptr, other.min_.value().ptr, len) != 0) return false; - } + if (min_.has_value() && std::memcmp(min_->ptr, other.min_->ptr, len) != 0) return false; if (max_.has_value() != other.max_.has_value()) return false; - if (max_.has_value()) { - if (std::memcmp(max_.value().ptr, other.max_.value().ptr, len) != 0) return false; - } + if (max_.has_value() && std::memcmp(max_->ptr, other.max_->ptr, len) != 0) return false; return true; } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index c8c33959ef4..86fc2c8e975 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -368,7 +368,7 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_TRUE(statistics->HasNullCount()); ASSERT_FALSE(statistics->HasMinMax()); ASSERT_FALSE(statistics->HasDistinctCount()); - ASSERT_EQ(std::make_optional(0), statistics->NullCount()); + ASSERT_EQ(0, statistics->NullCount()); ASSERT_EQ(0, statistics->num_values()); ASSERT_EQ(std::nullopt, statistics->DistinctCount()); ASSERT_EQ("", statistics->EncodeMin()); @@ -470,8 +470,8 @@ class TestStatistics : public PrimitiveTypedTest { std::shared_ptr enc_stats = column_chunk->encoded_statistics(); EXPECT_EQ(null_count, enc_stats->null_count); - EXPECT_TRUE(enc_stats->min_.has_value()); - EXPECT_TRUE(enc_stats->max_.has_value()); + EXPECT_TRUE(enc_stats->HasMin()); + EXPECT_TRUE(enc_stats->HasMax()); EXPECT_EQ(expected_stats->EncodeMin(), enc_stats->Min()); EXPECT_EQ(expected_stats->EncodeMax(), enc_stats->Max()); EXPECT_EQ(enc_stats->is_min_value_exact, std::make_optional(true)); @@ -567,12 +567,10 @@ void TestStatistics::TestMinMaxEncode() { std::string(reinterpret_cast(statistics1->Max()->ptr), statistics1->Max()->len)); - auto statistics2 = MakeStatistics(this->schema_.Column(0), encoded_min, - encoded_max, this->values_.size(), - /*null_count=*/0, - /*distinct_count=*/0, - /*is_min_value_exact=*/true, - /*is_max_value_exact=*/true); + auto statistics2 = MakeStatistics( + this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), + /*null_count=*/0, /*distinct_count=*/0, + /*is_min_value_exact=*/true, /*is_max_value_exact=*/true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); @@ -655,8 +653,7 @@ class TestStatisticsHasFlag : public TestStatistics { statistics->Merge(*next_statistics); } EncodedStatistics final_statistics = statistics->Encode(); - EXPECT_EQ(statistics->HasDistinctCount(), - final_statistics.HasDistinctCount()); + EXPECT_EQ(statistics->HasDistinctCount(), final_statistics.HasDistinctCount()); if (statistics->HasDistinctCount()) { EXPECT_EQ(statistics->DistinctCount(), final_statistics.distinct_count); return statistics->DistinctCount(); @@ -710,8 +707,8 @@ class TestStatisticsHasFlag : public TestStatistics { /*null_count=*/this->values_.size()); auto encoded_stats1 = statistics1->Encode(); EXPECT_FALSE(statistics1->HasMinMax()); - EXPECT_FALSE(encoded_stats1.min_.has_value()); - EXPECT_FALSE(encoded_stats1.max_.has_value()); + EXPECT_FALSE(encoded_stats1.HasMin()); + EXPECT_FALSE(encoded_stats1.HasMax()); EXPECT_EQ(encoded_stats1.is_max_value_exact, std::nullopt); EXPECT_EQ(encoded_stats1.is_min_value_exact, std::nullopt); } @@ -780,11 +777,11 @@ class TestStatisticsHasFlag : public TestStatistics { } // Merge without null-count should not have null count - VerifyMergedStatistics( - *statistics1, *statistics2, [](TypedStatistics* merged_statistics) { - EXPECT_FALSE(merged_statistics->HasNullCount()); - EXPECT_FALSE(merged_statistics->Encode().null_count.has_value()); - }); + VerifyMergedStatistics(*statistics1, *statistics2, + [](TypedStatistics* merged_statistics) { + EXPECT_FALSE(merged_statistics->HasNullCount()); + EXPECT_FALSE(merged_statistics->Encode().HasNullCount()); + }); } // statistics.all_null_value is used to build the page index. From e39f26c8f3024c8b0e1f5eb5b6552b37e0b29176 Mon Sep 17 00:00:00 2001 From: Alvaro-Kothe Date: Thu, 26 Feb 2026 21:57:47 -0300 Subject: [PATCH 11/11] refactor: reduce changes --- cpp/src/parquet/printer.cc | 4 ++-- cpp/src/parquet/statistics.cc | 18 +++++++++--------- cpp/src/parquet/statistics.h | 17 +++++++++-------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 0d07196f825..feae3bf58eb 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -174,8 +174,8 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte stats->is_min_value_exact.has_value() ? (stats->is_min_value_exact.value() ? "true" : "false") : "unknown"; - stream << ", Null Values: " << stats->null_count.value_or(0) - << ", Distinct Values: " << stats->distinct_count.value_or(0) << std::endl + stream << ", Null Values: " << stats->NullCount().value_or(0) + << ", Distinct Values: " << stats->DistinctCount().value_or(0) << std::endl << " Max (exact: " << max_exact << "): " << FormatStatValue(descr->physical_type(), max, descr->logical_type()) << ", Min (exact: " << min_exact << "): " diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 1e360a0af2c..c035bd92ec9 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -659,11 +659,6 @@ class TypedStatisticsImpl : public TypedStatistics { bool HasMinMax() const override { return HasMin() && HasMax(); } bool HasNullCount() const override { return statistics_.null_count.has_value(); }; - std::optional NullCount() const override { return statistics_.null_count; } - std::optional DistinctCount() const override { - return statistics_.distinct_count; - } - void IncrementNullCount(int64_t n) override { statistics_.null_count = NullCount().value_or(0) + n; } @@ -804,6 +799,11 @@ class TypedStatisticsImpl : public TypedStatistics { return statistics_.distinct_count.value_or(0); } int64_t num_values() const override { return num_values_; } + + std::optional NullCount() const override { return statistics_.null_count; } + std::optional DistinctCount() const override { + return statistics_.distinct_count; + } std::optional is_min_value_exact() const override { return statistics_.is_min_value_exact; } @@ -884,10 +884,10 @@ template <> inline bool TypedStatisticsImpl::MinMaxEqual( const TypedStatisticsImpl& other) const { uint32_t len = descr_->type_length(); - if (min_.has_value() != other.min_.has_value()) return false; - if (min_.has_value() && std::memcmp(min_->ptr, other.min_->ptr, len) != 0) return false; - if (max_.has_value() != other.max_.has_value()) return false; - if (max_.has_value() && std::memcmp(max_->ptr, other.max_->ptr, len) != 0) return false; + if (this->HasMin() != other.HasMin()) return false; + if (this->HasMin() && std::memcmp(Min()->ptr, other.Min()->ptr, len) != 0) return false; + if (this->HasMax() != other.HasMax()) return false; + if (this->HasMax() && std::memcmp(Max()->ptr, other.Max()->ptr, len) != 0) return false; return true; } diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index b962a4f15d0..5b7fb2bad43 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -120,14 +120,12 @@ std::shared_ptr> MakeComparator(const ColumnDescriptor* d /// \brief Structure represented encoded statistics to be written to /// and read from Parquet serialized metadata. class PARQUET_EXPORT EncodedStatistics { + std::optional max_, min_; bool is_signed_ = false; public: EncodedStatistics() = default; - std::optional max_, min_; - std::optional null_count; - std::optional distinct_count; PARQUET_DEPRECATED("Deprecated in 24.0.0. Use Max() instead.") const std::string& max() const { return max_.value(); } @@ -136,18 +134,21 @@ class PARQUET_EXPORT EncodedStatistics { std::optional Max() const { return max_; } std::optional Min() const { return min_; } - std::optional DistinctCount() const { return distinct_count; } - std::optional NullCount() const { return null_count; } - - bool HasMax() const { return max_.has_value(); } - bool HasMin() const { return min_.has_value(); } std::optional is_max_value_exact; std::optional is_min_value_exact; + std::optional null_count; + std::optional distinct_count; + + bool HasMax() const { return max_.has_value(); } + bool HasMin() const { return min_.has_value(); } bool HasNullCount() const { return null_count.has_value(); } bool HasDistinctCount() const { return distinct_count.has_value(); } + std::optional DistinctCount() const { return distinct_count; } + std::optional NullCount() const { return null_count; } + // When all values in the statistics are null, it is set to true. // Otherwise, at least one value is not null, or we are not sure at all. // Page index requires this information to decide whether a data page