From c7c7feed57128d0090ead07974dcb75e4fac00d0 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Tue, 19 May 2026 15:00:28 +0200 Subject: [PATCH 1/3] Core: Align content stats fields with latest Spec changes --- .../org/apache/iceberg/BaseContentStats.java | 42 +++-- .../org/apache/iceberg/BaseFieldStats.java | 122 ++++++--------- .../org/apache/iceberg/FieldStatistic.java | 90 ++++++----- .../java/org/apache/iceberg/FieldStats.java | 30 ++-- .../org/apache/iceberg/TestContentStats.java | 35 ++--- .../org/apache/iceberg/TestFieldStats.java | 57 +++---- .../org/apache/iceberg/TestStatsUtil.java | 147 +++++++++++++----- .../apache/iceberg/TestTrackedFileStruct.java | 20 +-- 8 files changed, 294 insertions(+), 249 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/BaseContentStats.java b/core/src/main/java/org/apache/iceberg/BaseContentStats.java index 45900b03e299..0a83f7eaa099 100644 --- a/core/src/main/java/org/apache/iceberg/BaseContentStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseContentStats.java @@ -124,25 +124,6 @@ public void set(int pos, T value) { FieldStats stat = fieldStats.get(pos); BaseFieldStats.Builder builder = BaseFieldStats.buildFrom(stat); Type type = stat.type(); - if (null != record.getField(FieldStatistic.VALUE_COUNT.fieldName())) { - builder.valueCount((Long) record.getField(FieldStatistic.VALUE_COUNT.fieldName())); - } - - if (null != record.getField(FieldStatistic.NAN_VALUE_COUNT.fieldName())) { - builder.nanValueCount((Long) record.getField(FieldStatistic.NAN_VALUE_COUNT.fieldName())); - } - - if (null != record.getField(FieldStatistic.NULL_VALUE_COUNT.fieldName())) { - builder.nullValueCount((Long) record.getField(FieldStatistic.NULL_VALUE_COUNT.fieldName())); - } - - if (null != record.getField(FieldStatistic.AVG_VALUE_SIZE.fieldName())) { - builder.avgValueSize((Integer) record.getField(FieldStatistic.AVG_VALUE_SIZE.fieldName())); - } - - if (null != record.getField(FieldStatistic.MAX_VALUE_SIZE.fieldName())) { - builder.maxValueSize((Integer) record.getField(FieldStatistic.MAX_VALUE_SIZE.fieldName())); - } Object lowerBound = record.getField(FieldStatistic.LOWER_BOUND.fieldName()); if (null != type && null != lowerBound) { @@ -164,9 +145,26 @@ public void set(int pos, T value) { builder.upperBound(type.typeId().javaClass().cast(upperBound)); } - if (null != record.getField(FieldStatistic.EXACT_BOUNDS.fieldName())) { - Boolean exactBounds = (Boolean) record.getField(FieldStatistic.EXACT_BOUNDS.fieldName()); - builder.hasExactBounds(null != exactBounds && exactBounds); + if (null != record.getField(FieldStatistic.TIGHT_BOUNDS.fieldName())) { + Boolean tightBounds = (Boolean) record.getField(FieldStatistic.TIGHT_BOUNDS.fieldName()); + builder.tightBounds(null != tightBounds && tightBounds); + } + + if (null != record.getField(FieldStatistic.VALUE_COUNT.fieldName())) { + builder.valueCount((Long) record.getField(FieldStatistic.VALUE_COUNT.fieldName())); + } + + if (null != record.getField(FieldStatistic.NULL_VALUE_COUNT.fieldName())) { + builder.nullValueCount((Long) record.getField(FieldStatistic.NULL_VALUE_COUNT.fieldName())); + } + + if (null != record.getField(FieldStatistic.NAN_VALUE_COUNT.fieldName())) { + builder.nanValueCount((Long) record.getField(FieldStatistic.NAN_VALUE_COUNT.fieldName())); + } + + if (null != record.getField(FieldStatistic.AVG_VALUE_SIZE_IN_BYTES.fieldName())) { + builder.avgValueSizeInBytes( + (Integer) record.getField(FieldStatistic.AVG_VALUE_SIZE_IN_BYTES.fieldName())); } BaseFieldStats newStat = builder.build(); diff --git a/core/src/main/java/org/apache/iceberg/BaseFieldStats.java b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java index 11da570b8faa..b2945f8c30ef 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFieldStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java @@ -32,38 +32,35 @@ class BaseFieldStats extends SupportsIndexProjection implements FieldStats private static final int[] IDENTITY_MAPPING = identityMapping(); private final int fieldId; private final Type type; + private final T lowerBound; + private final T upperBound; + private final boolean tightBounds; private final Long valueCount; private final Long nullValueCount; private final Long nanValueCount; - private final Integer avgValueSize; - private final Integer maxValueSize; - private final T lowerBound; - private final T upperBound; - private final boolean hasExactBounds; + private final Integer avgValueSizeInBytes; private BaseFieldStats( int fieldId, int[] fromProjectionPos, Type type, + T lowerBound, + T upperBound, + boolean tightBounds, Long valueCount, Long nullValueCount, Long nanValueCount, - Integer avgValueSize, - Integer maxValueSize, - T lowerBound, - T upperBound, - boolean hasExactBounds) { + Integer avgValueSizeInBytes) { super(fromProjectionPos != null ? fromProjectionPos : IDENTITY_MAPPING); this.fieldId = fieldId; this.type = type; + this.lowerBound = lowerBound; + this.upperBound = upperBound; + this.tightBounds = tightBounds; this.valueCount = valueCount; this.nullValueCount = nullValueCount; this.nanValueCount = nanValueCount; - this.avgValueSize = avgValueSize; - this.maxValueSize = maxValueSize; - this.lowerBound = lowerBound; - this.upperBound = upperBound; - this.hasExactBounds = hasExactBounds; + this.avgValueSizeInBytes = avgValueSizeInBytes; } private static int[] identityMapping() { @@ -77,7 +74,7 @@ private static int[] identityMapping() { } /** - * Computes a position mapping from the column-specific stats struct to the full 8-field struct. + * Computes a position mapping from the column-specific stats struct to the full stats struct. * Each entry maps a projected position to its base position (0-based) using the field ID offsets * from the column's base stats field ID. */ @@ -122,13 +119,8 @@ public Long nanValueCount() { } @Override - public Integer avgValueSize() { - return avgValueSize; - } - - @Override - public Integer maxValueSize() { - return maxValueSize; + public Integer avgValueSizeInBytes() { + return avgValueSizeInBytes; } @SuppressWarnings("unchecked") @@ -173,21 +165,20 @@ public T upperBound() { } @Override - public boolean hasExactBounds() { - return hasExactBounds; + public boolean tightBounds() { + return tightBounds; } @Override protected X internalGet(int pos, Class javaClass) { return switch (FieldStatistic.fromPosition(pos)) { + case LOWER_BOUND -> javaClass.cast(lowerBound()); + case UPPER_BOUND -> javaClass.cast(upperBound()); + case TIGHT_BOUNDS -> javaClass.cast(tightBounds); case VALUE_COUNT -> javaClass.cast(valueCount); case NULL_VALUE_COUNT -> javaClass.cast(nullValueCount); case NAN_VALUE_COUNT -> javaClass.cast(nanValueCount); - case AVG_VALUE_SIZE -> javaClass.cast(avgValueSize); - case MAX_VALUE_SIZE -> javaClass.cast(maxValueSize); - case LOWER_BOUND -> javaClass.cast(lowerBound()); - case UPPER_BOUND -> javaClass.cast(upperBound()); - case EXACT_BOUNDS -> javaClass.cast(hasExactBounds); + case AVG_VALUE_SIZE_IN_BYTES -> javaClass.cast(avgValueSizeInBytes); default -> throw new UnsupportedOperationException("Unknown field ordinal: " + pos); }; } @@ -202,14 +193,13 @@ public String toString() { return MoreObjects.toStringHelper(this) .add("fieldId", fieldId) .add("type", type) + .add("lowerBound", lowerBound) + .add("upperBound", upperBound) + .add("tightBounds", tightBounds) .add("valueCount", valueCount) .add("nullValueCount", nullValueCount) .add("nanValueCount", nanValueCount) - .add("avgValueSize", avgValueSize) - .add("maxValueSize", maxValueSize) - .add("lowerBound", lowerBound) - .add("upperBound", upperBound) - .add("hasExactBounds", hasExactBounds) + .add("avgValueSizeInBytes", avgValueSizeInBytes) .toString(); } @@ -221,15 +211,14 @@ public boolean equals(Object o) { BaseFieldStats that = (BaseFieldStats) o; return fieldId == that.fieldId + && tightBounds == that.tightBounds && Objects.equals(type, that.type) + && Objects.deepEquals(lowerBound, that.lowerBound) + && Objects.deepEquals(upperBound, that.upperBound) && Objects.equals(valueCount, that.valueCount) && Objects.equals(nullValueCount, that.nullValueCount) && Objects.equals(nanValueCount, that.nanValueCount) - && Objects.equals(avgValueSize, that.avgValueSize) - && Objects.equals(maxValueSize, that.maxValueSize) - && Objects.deepEquals(lowerBound, that.lowerBound) - && Objects.deepEquals(upperBound, that.upperBound) - && hasExactBounds == that.hasExactBounds; + && Objects.equals(avgValueSizeInBytes, that.avgValueSizeInBytes); } @Override @@ -237,14 +226,13 @@ public int hashCode() { return Objects.hash( fieldId, type, + lowerBound, + upperBound, + tightBounds, valueCount, nullValueCount, nanValueCount, - avgValueSize, - maxValueSize, - lowerBound, - upperBound, - hasExactBounds); + avgValueSizeInBytes); } public static Builder builder() { @@ -256,28 +244,26 @@ public static Builder buildFrom(FieldStats value) { return BaseFieldStats.builder() .type(value.type()) .fieldId(value.fieldId()) + .lowerBound(value.lowerBound()) + .upperBound(value.upperBound()) + .tightBounds(value.tightBounds()) .valueCount(value.valueCount()) .nullValueCount(value.nullValueCount()) .nanValueCount(value.nanValueCount()) - .avgValueSize(value.avgValueSize()) - .maxValueSize(value.maxValueSize()) - .lowerBound(value.lowerBound()) - .upperBound(value.upperBound()) - .hasExactBounds(value.hasExactBounds()); + .avgValueSizeInBytes(value.avgValueSizeInBytes()); } public static class Builder { private int fieldId; private int[] fromProjectionPos; private Type type; + private T lowerBound; + private T upperBound; + private boolean tightBounds; private Long valueCount; private Long nullValueCount; private Long nanValueCount; - private Integer avgValueSize; - private Integer maxValueSize; - private T lowerBound; - private T upperBound; - private boolean hasExactBounds; + private Integer avgValueSizeInBytes; private Builder() {} @@ -306,13 +292,8 @@ public Builder nanValueCount(Long newNanValueCount) { return this; } - public Builder avgValueSize(Integer newAvgValueSize) { - this.avgValueSize = newAvgValueSize; - return this; - } - - public Builder maxValueSize(Integer newMaxValueSize) { - this.maxValueSize = newMaxValueSize; + public Builder avgValueSizeInBytes(Integer newAvgValueSizeInBytes) { + this.avgValueSizeInBytes = newAvgValueSizeInBytes; return this; } @@ -331,13 +312,13 @@ public Builder fieldId(int newFieldId) { return this; } - public Builder hasExactBounds(boolean newHasExactBounds) { - this.hasExactBounds = newHasExactBounds; + public Builder tightBounds(boolean newTightBounds) { + this.tightBounds = newTightBounds; return this; } - public Builder hasExactBounds() { - this.hasExactBounds = true; + public Builder tightBounds() { + this.tightBounds = true; return this; } @@ -370,14 +351,13 @@ public BaseFieldStats build() { fieldId, fromProjectionPos, type, + lowerBound, + upperBound, + tightBounds, valueCount, nullValueCount, nanValueCount, - avgValueSize, - maxValueSize, - lowerBound, - upperBound, - hasExactBounds); + avgValueSizeInBytes); } } } diff --git a/core/src/main/java/org/apache/iceberg/FieldStatistic.java b/core/src/main/java/org/apache/iceberg/FieldStatistic.java index 85712384254c..e088f386efe0 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStatistic.java +++ b/core/src/main/java/org/apache/iceberg/FieldStatistic.java @@ -26,14 +26,13 @@ import org.apache.iceberg.types.Types; enum FieldStatistic { - VALUE_COUNT(1, "value_count"), - NULL_VALUE_COUNT(2, "null_value_count"), - NAN_VALUE_COUNT(3, "nan_value_count"), - AVG_VALUE_SIZE(4, "avg_value_size_in_bytes"), - MAX_VALUE_SIZE(5, "max_value_size_in_bytes"), - LOWER_BOUND(6, "lower_bound"), - UPPER_BOUND(7, "upper_bound"), - EXACT_BOUNDS(8, "exact_bounds"); + LOWER_BOUND(1, "lower_bound"), + UPPER_BOUND(2, "upper_bound"), + TIGHT_BOUNDS(3, "tight_bounds"), + VALUE_COUNT(4, "value_count"), + NULL_VALUE_COUNT(5, "null_value_count"), + NAN_VALUE_COUNT(6, "nan_value_count"), + AVG_VALUE_SIZE_IN_BYTES(7, "avg_value_size_in_bytes"); private final int offset; private final String fieldName; @@ -78,28 +77,52 @@ public String fieldName() { */ public static FieldStatistic fromPosition(int position) { return switch (position) { - case 0 -> VALUE_COUNT; - case 1 -> NULL_VALUE_COUNT; - case 2 -> NAN_VALUE_COUNT; - case 3 -> AVG_VALUE_SIZE; - case 4 -> MAX_VALUE_SIZE; - case 5 -> LOWER_BOUND; - case 6 -> UPPER_BOUND; - case 7 -> EXACT_BOUNDS; + case 0 -> LOWER_BOUND; + case 1 -> UPPER_BOUND; + case 2 -> TIGHT_BOUNDS; + case 3 -> VALUE_COUNT; + case 4 -> NULL_VALUE_COUNT; + case 5 -> NAN_VALUE_COUNT; + case 6 -> AVG_VALUE_SIZE_IN_BYTES; default -> throw new IllegalArgumentException("Invalid statistic position: " + position); }; } public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFieldId) { - List fields = Lists.newArrayListWithCapacity(8); + List fields = Lists.newArrayListWithCapacity(7); Type type = field.type(); + Type.TypeID typeId = type.typeId(); + boolean isGeo = typeId == Type.TypeID.GEOMETRY || typeId == Type.TypeID.GEOGRAPHY; + boolean isVariant = type.isVariantType(); + + fields.add( + optional( + baseFieldId + LOWER_BOUND.offset(), + LOWER_BOUND.fieldName(), + type, + "Lower bound stored as the field's type")); + fields.add( + optional( + baseFieldId + UPPER_BOUND.offset(), + UPPER_BOUND.fieldName(), + type, + "Upper bound stored as the field's type")); + + if (!isGeo && !isVariant) { + fields.add( + optional( + baseFieldId + TIGHT_BOUNDS.offset(), + TIGHT_BOUNDS.fieldName(), + Types.BooleanType.get(), + "When true, lower_bound and upper_bound must be equal to the min and max values")); + } fields.add( optional( baseFieldId + VALUE_COUNT.offset(), VALUE_COUNT.fieldName(), Types.LongType.get(), - "Total value count, including null and NaN")); + "Number of values in the column (including null and NaN values)")); if (field.isOptional()) { fields.add( @@ -107,44 +130,27 @@ public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFi baseFieldId + NULL_VALUE_COUNT.offset(), NULL_VALUE_COUNT.fieldName(), Types.LongType.get(), - "Total null value count")); + "Number of null values in the column")); } - if (type.typeId() == Type.TypeID.FLOAT || type.typeId() == Type.TypeID.DOUBLE) { + if (typeId == Type.TypeID.FLOAT || typeId == Type.TypeID.DOUBLE) { fields.add( optional( baseFieldId + NAN_VALUE_COUNT.offset(), NAN_VALUE_COUNT.fieldName(), Types.LongType.get(), - "Total NaN value count")); + "Number of NaN values in the column")); } - if (type.typeId() == Type.TypeID.STRING || type.typeId() == Type.TypeID.BINARY) { + if (typeId == Type.TypeID.STRING || typeId == Type.TypeID.BINARY || isVariant) { fields.add( optional( - baseFieldId + AVG_VALUE_SIZE.offset(), - AVG_VALUE_SIZE.fieldName(), + baseFieldId + AVG_VALUE_SIZE_IN_BYTES.offset(), + AVG_VALUE_SIZE_IN_BYTES.fieldName(), Types.IntegerType.get(), - "Avg value size in bytes of variable-length types (String, Binary)")); - fields.add( - optional( - baseFieldId + MAX_VALUE_SIZE.offset(), - MAX_VALUE_SIZE.fieldName(), - Types.IntegerType.get(), - "Max value size in bytes of variable-length types (String, Binary)")); + "Avg value size (uncompressed) in bytes to estimate memory consumption")); } - fields.add( - optional(baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(), type, "Lower bound")); - fields.add( - optional(baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(), type, "Upper bound")); - fields.add( - optional( - baseFieldId + EXACT_BOUNDS.offset(), - EXACT_BOUNDS.fieldName(), - Types.BooleanType.get(), - "Whether the upper/lower bound is exact or not")); - return Types.StructType.of(fields); } } diff --git a/core/src/main/java/org/apache/iceberg/FieldStats.java b/core/src/main/java/org/apache/iceberg/FieldStats.java index e42d774c7cee..885617db3b54 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStats.java +++ b/core/src/main/java/org/apache/iceberg/FieldStats.java @@ -27,6 +27,18 @@ interface FieldStats extends StructLike { /** The field type of the statistic */ Type type(); + /** The lower bound */ + T lowerBound(); + + /** The upper bound */ + T upperBound(); + + /** + * Whether {@link #lowerBound()} and {@link #upperBound()} are equal to the min and max values for + * the column. + */ + boolean tightBounds(); + /** The total value count, including null and NaN */ Long valueCount(); @@ -36,18 +48,8 @@ interface FieldStats extends StructLike { /** The total NaN value count */ Long nanValueCount(); - /** The avg value size of variable-length types (String, Binary) */ - Integer avgValueSize(); - - /** The max value size of variable-length types (String, Binary) */ - Integer maxValueSize(); - - /** The lower bound */ - T lowerBound(); - - /** The upper bound */ - T upperBound(); - - /** Whether the upper/lower bound is exact or not. */ - boolean hasExactBounds(); + /** + * The avg value size (uncompressed) in bytes for variable-length types (string, binary, variant) + */ + Integer avgValueSizeInBytes(); } diff --git a/core/src/test/java/org/apache/iceberg/TestContentStats.java b/core/src/test/java/org/apache/iceberg/TestContentStats.java index 0f06276d454b..74237a27dfa8 100644 --- a/core/src/test/java/org/apache/iceberg/TestContentStats.java +++ b/core/src/test/java/org/apache/iceberg/TestContentStats.java @@ -18,12 +18,11 @@ */ package org.apache.iceberg; -import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE_IN_BYTES; import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.TIGHT_BOUNDS; import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -244,7 +243,7 @@ public void setByPositionOptionalString() { Schema tableSchema = new Schema(optional(1, "s", Types.StringType.get())); Types.StructType rootStatsStruct = StatsUtil.contentStatsFor(tableSchema).type().asStructType(); Types.StructType statsStructForFieldId = rootStatsStruct.fields().get(0).type().asStructType(); - assertThat(statsStructForFieldId.fields()).hasSize(7); + assertThat(statsStructForFieldId.fields()).hasSize(6); GenericRecord record = GenericRecord.create(statsStructForFieldId); BaseFieldStats fieldStats = @@ -253,20 +252,18 @@ public void setByPositionOptionalString() { .fieldId(1) .valueCount(10L) .nullValueCount(2L) - .avgValueSize(3) - .maxValueSize(10) + .avgValueSizeInBytes(3) .lowerBound("aa") .upperBound("zzz") - .hasExactBounds() + .tightBounds() .build(); - record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount()); - record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount()); - record.setField(AVG_VALUE_SIZE.fieldName(), fieldStats.avgValueSize()); - record.setField(MAX_VALUE_SIZE.fieldName(), fieldStats.maxValueSize()); record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound()); record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound()); - record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds()); + record.setField(TIGHT_BOUNDS.fieldName(), fieldStats.tightBounds()); + record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount()); + record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount()); + record.setField(AVG_VALUE_SIZE_IN_BYTES.fieldName(), fieldStats.avgValueSizeInBytes()); BaseContentStats stats = new BaseContentStats(rootStatsStruct); stats.set(0, record); @@ -290,15 +287,15 @@ public void setByPositionOptionalDouble() { .nanValueCount(3L) .lowerBound(5.0) .upperBound(20.0) - .hasExactBounds() + .tightBounds() .build(); + record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound()); + record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound()); + record.setField(TIGHT_BOUNDS.fieldName(), fieldStats.tightBounds()); record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount()); record.setField(NULL_VALUE_COUNT.fieldName(), fieldStats.nullValueCount()); record.setField(NAN_VALUE_COUNT.fieldName(), fieldStats.nanValueCount()); - record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound()); - record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound()); - record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds()); BaseContentStats stats = new BaseContentStats(rootStatsStruct); stats.set(0, record); @@ -320,13 +317,13 @@ public void setByPositionRequiredInteger() { .valueCount(10L) .lowerBound(5) .upperBound(20) - .hasExactBounds() + .tightBounds() .build(); - record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount()); record.setField(LOWER_BOUND.fieldName(), fieldStats.lowerBound()); record.setField(UPPER_BOUND.fieldName(), fieldStats.upperBound()); - record.setField(EXACT_BOUNDS.fieldName(), fieldStats.hasExactBounds()); + record.setField(TIGHT_BOUNDS.fieldName(), fieldStats.tightBounds()); + record.setField(VALUE_COUNT.fieldName(), fieldStats.valueCount()); // this is typically called by Avro reflection code BaseContentStats stats = new BaseContentStats(rootStatsStruct); diff --git a/core/src/test/java/org/apache/iceberg/TestFieldStats.java b/core/src/test/java/org/apache/iceberg/TestFieldStats.java index c703a3044fc0..35be3ed36bcf 100644 --- a/core/src/test/java/org/apache/iceberg/TestFieldStats.java +++ b/core/src/test/java/org/apache/iceberg/TestFieldStats.java @@ -18,12 +18,11 @@ */ package org.apache.iceberg; -import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE_IN_BYTES; import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.TIGHT_BOUNDS; import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.assertj.core.api.Assertions.assertThat; @@ -47,13 +46,13 @@ public void empty() { BaseFieldStats empty = BaseFieldStats.builder().build(); assertThat(empty.fieldId()).isEqualTo(0); assertThat(empty.type()).isNull(); + assertThat(empty.lowerBound()).isNull(); + assertThat(empty.upperBound()).isNull(); + assertThat(empty.tightBounds()).isFalse(); assertThat(empty.valueCount()).isNull(); assertThat(empty.nullValueCount()).isNull(); assertThat(empty.nanValueCount()).isNull(); - assertThat(empty.avgValueSize()).isNull(); - assertThat(empty.maxValueSize()).isNull(); - assertThat(empty.lowerBound()).isNull(); - assertThat(empty.upperBound()).isNull(); + assertThat(empty.avgValueSizeInBytes()).isNull(); } @Test @@ -65,11 +64,10 @@ public void validIndividualValues() { .valueCount(10L) .nullValueCount(2L) .nanValueCount(3L) - .avgValueSize(30) - .maxValueSize(70) + .avgValueSizeInBytes(30) .lowerBound(5) .upperBound(20) - .hasExactBounds() + .tightBounds() .build(); assertThat(fieldStats.type()).isEqualTo(Types.IntegerType.get()); @@ -77,11 +75,10 @@ public void validIndividualValues() { assertThat(fieldStats.valueCount()).isEqualTo(10L); assertThat(fieldStats.nullValueCount()).isEqualTo(2L); assertThat(fieldStats.nanValueCount()).isEqualTo(3L); - assertThat(fieldStats.avgValueSize()).isEqualTo(30); - assertThat(fieldStats.maxValueSize()).isEqualTo(70); + assertThat(fieldStats.avgValueSizeInBytes()).isEqualTo(30); assertThat(fieldStats.lowerBound()).isEqualTo(5); assertThat(fieldStats.upperBound()).isEqualTo(20); - assertThat(fieldStats.hasExactBounds()).isTrue(); + assertThat(fieldStats.tightBounds()).isTrue(); } @Test @@ -94,26 +91,24 @@ public void buildFromExistingStats() { .valueCount(10L) .nullValueCount(2L) .nanValueCount(3L) - .avgValueSize(30) - .maxValueSize(70) + .avgValueSizeInBytes(30) .lowerBound(5) .upperBound(20) .build()) .lowerBound(2) .upperBound(50) - .maxValueSize(90) - .hasExactBounds() + .avgValueSizeInBytes(90) + .tightBounds() .build(); assertThat(fieldStats.type()).isEqualTo(Types.IntegerType.get()); assertThat(fieldStats.fieldId()).isEqualTo(23); assertThat(fieldStats.valueCount()).isEqualTo(10L); assertThat(fieldStats.nullValueCount()).isEqualTo(2L); assertThat(fieldStats.nanValueCount()).isEqualTo(3L); - assertThat(fieldStats.avgValueSize()).isEqualTo(30); - assertThat(fieldStats.maxValueSize()).isEqualTo(90); + assertThat(fieldStats.avgValueSizeInBytes()).isEqualTo(90); assertThat(fieldStats.lowerBound()).isEqualTo(2); assertThat(fieldStats.upperBound()).isEqualTo(50); - assertThat(fieldStats.hasExactBounds()).isTrue(); + assertThat(fieldStats.tightBounds()).isTrue(); } @Test @@ -124,8 +119,7 @@ public void validFieldStats() { assertThat(BaseFieldStats.builder().nullValueCount(3L).build()).isNotNull(); assertThat(BaseFieldStats.builder().nanValueCount(3L).build()).isNotNull(); assertThat(BaseFieldStats.builder().type(Types.IntegerType.get()).build()).isNotNull(); - assertThat(BaseFieldStats.builder().avgValueSize(3).build()).isNotNull(); - assertThat(BaseFieldStats.builder().maxValueSize(3).build()).isNotNull(); + assertThat(BaseFieldStats.builder().avgValueSizeInBytes(3).build()).isNotNull(); assertThat(BaseFieldStats.builder().type(Types.LongType.get()).lowerBound(3L).build()) .isNotNull(); @@ -193,29 +187,28 @@ public void retrievalByPosition() { .valueCount(10L) .nullValueCount(2L) .nanValueCount(3L) - .avgValueSize(30) - .maxValueSize(70) + .avgValueSizeInBytes(30) .lowerBound(5) .upperBound(20) - .hasExactBounds() + .tightBounds() .build(); + assertThat(fieldStats.get(LOWER_BOUND.position(), Integer.class)).isEqualTo(5); + assertThat(fieldStats.get(UPPER_BOUND.position(), Integer.class)).isEqualTo(20); + assertThat(fieldStats.get(TIGHT_BOUNDS.position(), Boolean.class)).isEqualTo(true); assertThat(fieldStats.get(VALUE_COUNT.position(), Long.class)).isEqualTo(10L); assertThat(fieldStats.get(NULL_VALUE_COUNT.position(), Long.class)).isEqualTo(2L); assertThat(fieldStats.get(NAN_VALUE_COUNT.position(), Long.class)).isEqualTo(3L); - assertThat(fieldStats.get(AVG_VALUE_SIZE.position(), Integer.class)).isEqualTo(30); - assertThat(fieldStats.get(MAX_VALUE_SIZE.position(), Integer.class)).isEqualTo(70); - assertThat(fieldStats.get(LOWER_BOUND.position(), Integer.class)).isEqualTo(5); - assertThat(fieldStats.get(UPPER_BOUND.position(), Integer.class)).isEqualTo(20); - assertThat(fieldStats.get(EXACT_BOUNDS.position(), Boolean.class)).isEqualTo(true); + assertThat(fieldStats.get(AVG_VALUE_SIZE_IN_BYTES.position(), Integer.class)).isEqualTo(30); assertThatThrownBy(() -> assertThat(fieldStats.get(10, Long.class))) .isInstanceOf(ArrayIndexOutOfBoundsException.class) - .hasMessage("Index 10 out of bounds for length 8"); + .hasMessage("Index 10 out of bounds for length 7"); assertThatThrownBy(() -> assertThat(fieldStats.get(VALUE_COUNT.position(), Double.class))) .isInstanceOf(ClassCastException.class) .hasMessage("Cannot cast java.lang.Long to java.lang.Double"); - assertThatThrownBy(() -> assertThat(fieldStats.get(AVG_VALUE_SIZE.position(), Long.class))) + assertThatThrownBy( + () -> assertThat(fieldStats.get(AVG_VALUE_SIZE_IN_BYTES.position(), Long.class))) .isInstanceOf(ClassCastException.class) .hasMessage("Cannot cast java.lang.Integer to java.lang.Long"); } diff --git a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java index 54db9e5d2095..7cc0b190779c 100644 --- a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java @@ -18,12 +18,11 @@ */ package org.apache.iceberg; -import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE_IN_BYTES; import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.TIGHT_BOUNDS; import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -223,27 +222,25 @@ public void conditionalFieldInclusionForInteger() { fieldStatsNames( FieldStatistic.fieldStatsFor(required(1, "x", Types.IntegerType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName()) .doesNotContain( NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName(), - AVG_VALUE_SIZE.fieldName(), - MAX_VALUE_SIZE.fieldName()); + AVG_VALUE_SIZE_IN_BYTES.fieldName()); assertThat( fieldStatsNames( FieldStatistic.fieldStatsFor(optional(1, "x", Types.IntegerType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - NULL_VALUE_COUNT.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) - .doesNotContain( - NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(), MAX_VALUE_SIZE.fieldName()); + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName()) + .doesNotContain(NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); } @Test @@ -252,24 +249,23 @@ public void conditionalFieldInclusionForFloatAndDouble() { fieldStatsNames( FieldStatistic.fieldStatsFor(required(1, "x", Types.FloatType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - NAN_VALUE_COUNT.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) - .doesNotContain( - NULL_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE.fieldName(), MAX_VALUE_SIZE.fieldName()); + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + NAN_VALUE_COUNT.fieldName()) + .doesNotContain(NULL_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); assertThat( fieldStatsNames( FieldStatistic.fieldStatsFor(optional(1, "x", Types.DoubleType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - NULL_VALUE_COUNT.fieldName(), - NAN_VALUE_COUNT.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()); + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName(), + NAN_VALUE_COUNT.fieldName()); } @Test @@ -278,25 +274,23 @@ public void conditionalFieldInclusionForString() { fieldStatsNames( FieldStatistic.fieldStatsFor(required(1, "x", Types.StringType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - AVG_VALUE_SIZE.fieldName(), - MAX_VALUE_SIZE.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()) .doesNotContain(NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName()); assertThat( fieldStatsNames( FieldStatistic.fieldStatsFor(optional(1, "x", Types.StringType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - NULL_VALUE_COUNT.fieldName(), - AVG_VALUE_SIZE.fieldName(), - MAX_VALUE_SIZE.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()); + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()); } @Test @@ -305,28 +299,103 @@ public void conditionalFieldInclusionForBinary() { fieldStatsNames( FieldStatistic.fieldStatsFor(optional(1, "x", Types.BinaryType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - NULL_VALUE_COUNT.fieldName(), - AVG_VALUE_SIZE.fieldName(), - MAX_VALUE_SIZE.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()) .doesNotContain(NAN_VALUE_COUNT.fieldName()); assertThat( fieldStatsNames( FieldStatistic.fieldStatsFor(required(1, "x", Types.BinaryType.get()), 10000))) .containsExactly( - VALUE_COUNT.fieldName(), - AVG_VALUE_SIZE.fieldName(), - MAX_VALUE_SIZE.fieldName(), LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), - EXACT_BOUNDS.fieldName()) + TIGHT_BOUNDS.fieldName(), + VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()) .doesNotContain(NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName()); } + @Test + public void conditionalFieldInclusionForGeometry() { + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(required(1, "x", Types.GeometryType.crs84()), 10000))) + .containsExactly(LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), VALUE_COUNT.fieldName()) + .doesNotContain( + TIGHT_BOUNDS.fieldName(), + NULL_VALUE_COUNT.fieldName(), + NAN_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()); + + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeometryType.crs84()), 10000))) + .containsExactly( + LOWER_BOUND.fieldName(), + UPPER_BOUND.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName()) + .doesNotContain( + TIGHT_BOUNDS.fieldName(), + NAN_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()); + } + + @Test + public void conditionalFieldInclusionForGeography() { + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(required(1, "x", Types.GeographyType.crs84()), 10000))) + .containsExactly(LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), VALUE_COUNT.fieldName()) + .doesNotContain( + TIGHT_BOUNDS.fieldName(), + NULL_VALUE_COUNT.fieldName(), + NAN_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()); + + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeographyType.crs84()), 10000))) + .containsExactly( + LOWER_BOUND.fieldName(), + UPPER_BOUND.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName()) + .doesNotContain( + TIGHT_BOUNDS.fieldName(), + NAN_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()); + } + + @Test + public void conditionalFieldInclusionForVariant() { + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(required(1, "x", Types.VariantType.get()), 10000))) + .containsExactly( + LOWER_BOUND.fieldName(), + UPPER_BOUND.fieldName(), + VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()) + .doesNotContain( + TIGHT_BOUNDS.fieldName(), NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName()); + + assertThat( + fieldStatsNames( + FieldStatistic.fieldStatsFor(optional(1, "x", Types.VariantType.get()), 10000))) + .containsExactly( + LOWER_BOUND.fieldName(), + UPPER_BOUND.fieldName(), + VALUE_COUNT.fieldName(), + NULL_VALUE_COUNT.fieldName(), + AVG_VALUE_SIZE_IN_BYTES.fieldName()) + .doesNotContain(TIGHT_BOUNDS.fieldName(), NAN_VALUE_COUNT.fieldName()); + } + private List fieldStatsNames(Types.StructType structType) { return structType.fields().stream().map(Types.NestedField::name).toList(); } diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java index 3abb36aa51ff..d91ab83bc14f 100644 --- a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java @@ -377,20 +377,20 @@ static TrackedFileStruct createTrackedFileWithStats() { 10000, "1", Types.StructType.of( - Types.NestedField.optional(10001, "value_count", Types.LongType.get()), - Types.NestedField.optional(10002, "null_value_count", Types.LongType.get()), - Types.NestedField.optional(10003, "nan_value_count", Types.LongType.get()), - Types.NestedField.optional(10006, "lower_bound", Types.IntegerType.get()), - Types.NestedField.optional(10007, "upper_bound", Types.IntegerType.get()))), + Types.NestedField.optional(10001, "lower_bound", Types.IntegerType.get()), + Types.NestedField.optional(10002, "upper_bound", Types.IntegerType.get()), + Types.NestedField.optional(10004, "value_count", Types.LongType.get()), + Types.NestedField.optional(10005, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(10006, "nan_value_count", Types.LongType.get()))), Types.NestedField.optional( 20000, "2", Types.StructType.of( - Types.NestedField.optional(20001, "value_count", Types.LongType.get()), - Types.NestedField.optional(20002, "null_value_count", Types.LongType.get()), - Types.NestedField.optional(20003, "nan_value_count", Types.LongType.get()), - Types.NestedField.optional(20006, "lower_bound", Types.FloatType.get()), - Types.NestedField.optional(20007, "upper_bound", Types.FloatType.get())))); + Types.NestedField.optional(20001, "lower_bound", Types.FloatType.get()), + Types.NestedField.optional(20002, "upper_bound", Types.FloatType.get()), + Types.NestedField.optional(20004, "value_count", Types.LongType.get()), + Types.NestedField.optional(20005, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(20006, "nan_value_count", Types.LongType.get())))); List> fieldStatsList = ImmutableList.of( From f0c18f6441408781dfe6b01cf33cb182d308b417 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Tue, 19 May 2026 15:54:31 +0200 Subject: [PATCH 2/3] Core: Handle Geo & Variant types in content stats --- .../org/apache/iceberg/FieldStatistic.java | 82 +++++++-- .../java/org/apache/iceberg/FieldStats.java | 3 +- .../java/org/apache/iceberg/StatsUtil.java | 4 +- .../org/apache/iceberg/TestStatsUtil.java | 160 ++++++++++++++---- 4 files changed, 206 insertions(+), 43 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/FieldStatistic.java b/core/src/main/java/org/apache/iceberg/FieldStatistic.java index e088f386efe0..0522ec1f0033 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStatistic.java +++ b/core/src/main/java/org/apache/iceberg/FieldStatistic.java @@ -19,6 +19,7 @@ package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -34,6 +35,17 @@ enum FieldStatistic { NAN_VALUE_COUNT(6, "nan_value_count"), AVG_VALUE_SIZE_IN_BYTES(7, "avg_value_size_in_bytes"); + // Offsets used within geo_lower struct (relative to the parent stats struct base ID). + private static final int GEO_LOWER_X_OFFSET = 10; + private static final int GEO_LOWER_Y_OFFSET = 11; + private static final int GEO_LOWER_Z_OFFSET = 12; + private static final int GEO_LOWER_M_OFFSET = 13; + // Offsets used within geo_upper struct (relative to the parent stats struct base ID). + private static final int GEO_UPPER_X_OFFSET = 14; + private static final int GEO_UPPER_Y_OFFSET = 15; + private static final int GEO_UPPER_Z_OFFSET = 16; + private static final int GEO_UPPER_M_OFFSET = 17; + private final int offset; private final String fieldName; @@ -88,27 +100,41 @@ public static FieldStatistic fromPosition(int position) { }; } + @SuppressWarnings("checkstyle:CyclomaticComplexity") public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFieldId) { List fields = Lists.newArrayListWithCapacity(7); Type type = field.type(); - Type.TypeID typeId = type.typeId(); - boolean isGeo = typeId == Type.TypeID.GEOMETRY || typeId == Type.TypeID.GEOGRAPHY; + boolean isGeo = type.typeId() == Type.TypeID.GEOMETRY || type.typeId() == Type.TypeID.GEOGRAPHY; boolean isVariant = type.isVariantType(); + // For geo types, lower/upper bounds are XYZM points stored in geo_lower / geo_upper structs. + // For variant types, bounds are unshredded variant values (same type as the field). + // For all other primitive types, bounds use the field's type. + Type lowerBoundType = isGeo ? geoLowerBoundStruct(baseFieldId) : type; + Type upperBoundType = isGeo ? geoUpperBoundStruct(baseFieldId) : type; + String lowerBoundDoc = + isGeo + ? "Lower bound XYZM point of the bounding box for the geo column" + : "Lower bound stored as the field's type"; + String upperBoundDoc = + isGeo + ? "Upper bound XYZM point of the bounding box for the geo column" + : "Upper bound stored as the field's type"; + fields.add( optional( baseFieldId + LOWER_BOUND.offset(), LOWER_BOUND.fieldName(), - type, - "Lower bound stored as the field's type")); + lowerBoundType, + lowerBoundDoc)); fields.add( optional( baseFieldId + UPPER_BOUND.offset(), UPPER_BOUND.fieldName(), - type, - "Upper bound stored as the field's type")); + upperBoundType, + upperBoundDoc)); - if (!isGeo && !isVariant) { + if (!isGeo && !type.isVariantType()) { fields.add( optional( baseFieldId + TIGHT_BOUNDS.offset(), @@ -133,7 +159,7 @@ public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFi "Number of null values in the column")); } - if (typeId == Type.TypeID.FLOAT || typeId == Type.TypeID.DOUBLE) { + if (type.typeId() == Type.TypeID.FLOAT || type.typeId() == Type.TypeID.DOUBLE) { fields.add( optional( baseFieldId + NAN_VALUE_COUNT.offset(), @@ -142,15 +168,51 @@ public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFi "Number of NaN values in the column")); } - if (typeId == Type.TypeID.STRING || typeId == Type.TypeID.BINARY || isVariant) { + if (type.typeId() == Type.TypeID.STRING || type.typeId() == Type.TypeID.BINARY || isVariant) { fields.add( optional( baseFieldId + AVG_VALUE_SIZE_IN_BYTES.offset(), AVG_VALUE_SIZE_IN_BYTES.fieldName(), Types.IntegerType.get(), - "Avg value size (uncompressed) in bytes to estimate memory consumption")); + "Avg value size in memory (uncompressed) in bytes to estimate memory consumption")); } return Types.StructType.of(fields); } + + private static Types.StructType geoLowerBoundStruct(int baseFieldId) { + return Types.StructType.of( + required( + baseFieldId + GEO_LOWER_X_OFFSET, + "x", + Types.DoubleType.get(), + "Bounding box westernmost/xmin; [-180..180]"), + required( + baseFieldId + GEO_LOWER_Y_OFFSET, + "y", + Types.DoubleType.get(), + "Bounding box southernmost/ymin; [-90..90]"), + optional( + baseFieldId + GEO_LOWER_Z_OFFSET, "z", Types.DoubleType.get(), "Bounding box zmin"), + optional( + baseFieldId + GEO_LOWER_M_OFFSET, "m", Types.DoubleType.get(), "Bounding box mmin")); + } + + private static Types.StructType geoUpperBoundStruct(int baseFieldId) { + return Types.StructType.of( + required( + baseFieldId + GEO_UPPER_X_OFFSET, + "x", + Types.DoubleType.get(), + "Bounding box easternmost/xmax; [-180..180]"), + required( + baseFieldId + GEO_UPPER_Y_OFFSET, + "y", + Types.DoubleType.get(), + "Bounding box northernmost/ymax; [-90..90]"), + optional( + baseFieldId + GEO_UPPER_Z_OFFSET, "z", Types.DoubleType.get(), "Bounding box zmax"), + optional( + baseFieldId + GEO_UPPER_M_OFFSET, "m", Types.DoubleType.get(), "Bounding box mmax")); + } } diff --git a/core/src/main/java/org/apache/iceberg/FieldStats.java b/core/src/main/java/org/apache/iceberg/FieldStats.java index 885617db3b54..ffad2ecf6f0c 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStats.java +++ b/core/src/main/java/org/apache/iceberg/FieldStats.java @@ -49,7 +49,8 @@ interface FieldStats extends StructLike { Long nanValueCount(); /** - * The avg value size (uncompressed) in bytes for variable-length types (string, binary, variant) + * The avg value size in memory (uncompressed) in bytes for variable-length types (string, binary, + * variant) */ Integer avgValueSizeInBytes(); } diff --git a/core/src/main/java/org/apache/iceberg/StatsUtil.java b/core/src/main/java/org/apache/iceberg/StatsUtil.java index 39fef3d372d3..3eb7a7e6454e 100644 --- a/core/src/main/java/org/apache/iceberg/StatsUtil.java +++ b/core/src/main/java/org/apache/iceberg/StatsUtil.java @@ -171,14 +171,14 @@ public Types.NestedField struct(Types.StructType struct, List @Override public Types.NestedField field(Types.NestedField field, Types.NestedField fieldResult) { - if (field.type().isNestedType() || field.type().isVariantType()) { + if (field.type().isNestedType()) { return null; } int fieldId = StatsUtil.statsFieldIdForField(field.fieldId()); if (fieldId >= 0) { Types.StructType structType = FieldStatistic.fieldStatsFor(field, fieldId); - return optional(fieldId, Integer.toString(field.fieldId()), structType); + return optional(fieldId, field.name(), structType); } else { skippedFieldIds.add(field.fieldId()); } diff --git a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java index 7cc0b190779c..72bad3528a80 100644 --- a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java @@ -158,13 +158,20 @@ public void contentStatsForSimpleSchema() { 146, "content_stats", Types.StructType.of( - optional(10000, "0", FieldStatistic.fieldStatsFor(intField, 10000)), - optional(10400, "2", FieldStatistic.fieldStatsFor(floatField, 10400)), - optional(10800, "4", FieldStatistic.fieldStatsFor(stringField, 10800)), - optional(11200, "6", FieldStatistic.fieldStatsFor(booleanField, 11200)), + optional(10000, intField.name(), FieldStatistic.fieldStatsFor(intField, 10000)), + optional( + 10400, floatField.name(), FieldStatistic.fieldStatsFor(floatField, 10400)), + optional( + 10800, + stringField.name(), + FieldStatistic.fieldStatsFor(stringField, 10800)), + optional( + 11200, + booleanField.name(), + FieldStatistic.fieldStatsFor(booleanField, 11200)), optional( 200010000, - "1000000", + uuidField.name(), FieldStatistic.fieldStatsFor(uuidField, 200010000))))); Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema)); assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct()); @@ -177,6 +184,7 @@ public void contentStatsForComplexSchema() { Types.NestedField structString = optional(8, "string", Types.StringType.get()); Types.NestedField mapKey = required(22, "key", Types.IntegerType.get()); Types.NestedField mapValue = optional(24, "value", Types.StringType.get()); + Types.NestedField variantField = required(30, "variant", Types.VariantType.get()); Types.NestedField uuidField = required(100_000, "u", Types.UUIDType.get()); Schema schema = new Schema( @@ -192,7 +200,7 @@ public void contentStatsForComplexSchema() { 20, "b", Types.MapType.ofOptional(22, 24, Types.IntegerType.get(), Types.StringType.get())), - required(30, "variant", Types.VariantType.get()), + variantField, uuidField); Schema expectedStatsSchema = new Schema( @@ -205,13 +213,26 @@ public void contentStatsForComplexSchema() { "0", FieldStatistic.fieldStatsFor( required(0, "i", Types.IntegerType.get()), 10000)), - optional(10600, "3", FieldStatistic.fieldStatsFor(listElement, 10600)), - optional(11400, "7", FieldStatistic.fieldStatsFor(structInt, 11400)), - optional(11600, "8", FieldStatistic.fieldStatsFor(structString, 11600)), - optional(14400, "22", FieldStatistic.fieldStatsFor(mapKey, 14400)), - optional(14800, "24", FieldStatistic.fieldStatsFor(mapValue, 14800)), optional( - 20010000, "100000", FieldStatistic.fieldStatsFor(uuidField, 20010000))))); + 10600, + listElement.name(), + FieldStatistic.fieldStatsFor(listElement, 10600)), + optional( + 11400, structInt.name(), FieldStatistic.fieldStatsFor(structInt, 11400)), + optional( + 11600, + structString.name(), + FieldStatistic.fieldStatsFor(structString, 11600)), + optional(14400, mapKey.name(), FieldStatistic.fieldStatsFor(mapKey, 14400)), + optional(14800, mapValue.name(), FieldStatistic.fieldStatsFor(mapValue, 14800)), + optional( + 16000, + variantField.name(), + FieldStatistic.fieldStatsFor(variantField, 16000)), + optional( + 20010000, + uuidField.name(), + FieldStatistic.fieldStatsFor(uuidField, 20010000))))); Schema statsSchema = new Schema(StatsUtil.contentStatsFor(schema)); assertThat(statsSchema.asStruct()).isEqualTo(expectedStatsSchema.asStruct()); } @@ -321,19 +342,20 @@ public void conditionalFieldInclusionForBinary() { @Test public void conditionalFieldInclusionForGeometry() { - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(required(1, "x", Types.GeometryType.crs84()), 10000))) + Types.StructType requiredStats = + FieldStatistic.fieldStatsFor(required(1, "x", Types.GeometryType.crs84()), 10000); + assertThat(fieldStatsNames(requiredStats)) .containsExactly(LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), VALUE_COUNT.fieldName()) .doesNotContain( TIGHT_BOUNDS.fieldName(), NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); + assertGeoBoundStructs(requiredStats, 10000); - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeometryType.crs84()), 10000))) + Types.StructType optionalStats = + FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeometryType.crs84()), 10000); + assertThat(fieldStatsNames(optionalStats)) .containsExactly( LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), @@ -343,23 +365,25 @@ public void conditionalFieldInclusionForGeometry() { TIGHT_BOUNDS.fieldName(), NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); + assertGeoBoundStructs(optionalStats, 10000); } @Test public void conditionalFieldInclusionForGeography() { - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(required(1, "x", Types.GeographyType.crs84()), 10000))) + Types.StructType requiredStats = + FieldStatistic.fieldStatsFor(required(1, "x", Types.GeographyType.crs84()), 10000); + assertThat(fieldStatsNames(requiredStats)) .containsExactly(LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), VALUE_COUNT.fieldName()) .doesNotContain( TIGHT_BOUNDS.fieldName(), NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); + assertGeoBoundStructs(requiredStats, 10000); - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeographyType.crs84()), 10000))) + Types.StructType optionalStats = + FieldStatistic.fieldStatsFor(optional(1, "x", Types.GeographyType.crs84()), 10000); + assertThat(fieldStatsNames(optionalStats)) .containsExactly( LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), @@ -369,13 +393,14 @@ public void conditionalFieldInclusionForGeography() { TIGHT_BOUNDS.fieldName(), NAN_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()); + assertGeoBoundStructs(optionalStats, 10000); } @Test public void conditionalFieldInclusionForVariant() { - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(required(1, "x", Types.VariantType.get()), 10000))) + Types.StructType requiredStats = + FieldStatistic.fieldStatsFor(required(1, "x", Types.VariantType.get()), 10000); + assertThat(fieldStatsNames(requiredStats)) .containsExactly( LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), @@ -383,10 +408,11 @@ public void conditionalFieldInclusionForVariant() { AVG_VALUE_SIZE_IN_BYTES.fieldName()) .doesNotContain( TIGHT_BOUNDS.fieldName(), NULL_VALUE_COUNT.fieldName(), NAN_VALUE_COUNT.fieldName()); + assertVariantBoundTypes(requiredStats); - assertThat( - fieldStatsNames( - FieldStatistic.fieldStatsFor(optional(1, "x", Types.VariantType.get()), 10000))) + Types.StructType optionalStats = + FieldStatistic.fieldStatsFor(optional(1, "x", Types.VariantType.get()), 10000); + assertThat(fieldStatsNames(optionalStats)) .containsExactly( LOWER_BOUND.fieldName(), UPPER_BOUND.fieldName(), @@ -394,6 +420,80 @@ public void conditionalFieldInclusionForVariant() { NULL_VALUE_COUNT.fieldName(), AVG_VALUE_SIZE_IN_BYTES.fieldName()) .doesNotContain(TIGHT_BOUNDS.fieldName(), NAN_VALUE_COUNT.fieldName()); + assertVariantBoundTypes(optionalStats); + } + + private void assertGeoBoundStructs(Types.StructType stats, int baseFieldId) { + Types.NestedField lowerBound = stats.field(LOWER_BOUND.fieldName()); + assertThat(lowerBound.type().isStructType()).isTrue(); + Types.StructType geoLower = lowerBound.type().asStructType(); + assertThat(geoLower.fields()).hasSize(4); + assertThat(geoLower.field("x")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 10); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isRequired()).isTrue(); + }); + assertThat(geoLower.field("y")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 11); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isRequired()).isTrue(); + }); + assertThat(geoLower.field("z")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 12); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isOptional()).isTrue(); + }); + assertThat(geoLower.field("m")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 13); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isOptional()).isTrue(); + }); + + Types.NestedField upperBound = stats.field(UPPER_BOUND.fieldName()); + assertThat(upperBound.type().isStructType()).isTrue(); + Types.StructType geoUpper = upperBound.type().asStructType(); + assertThat(geoUpper.fields()).hasSize(4); + assertThat(geoUpper.field("x")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 14); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isRequired()).isTrue(); + }); + assertThat(geoUpper.field("y")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 15); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isRequired()).isTrue(); + }); + assertThat(geoUpper.field("z")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 16); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isOptional()).isTrue(); + }); + assertThat(geoUpper.field("m")) + .satisfies( + f -> { + assertThat(f.fieldId()).isEqualTo(baseFieldId + 17); + assertThat(f.type()).isEqualTo(Types.DoubleType.get()); + assertThat(f.isOptional()).isTrue(); + }); + } + + private void assertVariantBoundTypes(Types.StructType stats) { + assertThat(stats.field(LOWER_BOUND.fieldName()).type()).isEqualTo(Types.VariantType.get()); + assertThat(stats.field(UPPER_BOUND.fieldName()).type()).isEqualTo(Types.VariantType.get()); } private List fieldStatsNames(Types.StructType structType) { From 51be71e69be836c8c73a6de403fd9321bc283a3d Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Wed, 20 May 2026 14:51:42 +0200 Subject: [PATCH 3/3] some additional tests for geo/variant --- .../org/apache/iceberg/FieldStatistic.java | 2 +- .../java/org/apache/iceberg/FieldStats.java | 8 +- .../org/apache/iceberg/TestContentStats.java | 122 ++++++++++++++++++ .../org/apache/iceberg/TestStatsUtil.java | 2 +- 4 files changed, 131 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/FieldStatistic.java b/core/src/main/java/org/apache/iceberg/FieldStatistic.java index 0522ec1f0033..7061517b0559 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStatistic.java +++ b/core/src/main/java/org/apache/iceberg/FieldStatistic.java @@ -57,7 +57,7 @@ enum FieldStatistic { /** * The offset from the field ID of the base stats structure * - * @return The offset from the field ID of the base strats structure + * @return The offset from the field ID of the base stats structure */ public int offset() { return offset; diff --git a/core/src/main/java/org/apache/iceberg/FieldStats.java b/core/src/main/java/org/apache/iceberg/FieldStats.java index ffad2ecf6f0c..ce74dd95b9a2 100644 --- a/core/src/main/java/org/apache/iceberg/FieldStats.java +++ b/core/src/main/java/org/apache/iceberg/FieldStats.java @@ -24,7 +24,13 @@ interface FieldStats extends StructLike { /** The field ID of the statistic */ int fieldId(); - /** The field type of the statistic */ + /** + * The field type of the statistic. + * + *

For geo types (geometry/geography), this returns the bounding box struct type (geo_lower / + * geo_upper) rather than the column's geometry or geography type, because the type is inferred + * from the lower/upper bound schema fields. + */ Type type(); /** The lower bound */ diff --git a/core/src/test/java/org/apache/iceberg/TestContentStats.java b/core/src/test/java/org/apache/iceberg/TestContentStats.java index 74237a27dfa8..ffdfee2b0550 100644 --- a/core/src/test/java/org/apache/iceberg/TestContentStats.java +++ b/core/src/test/java/org/apache/iceberg/TestContentStats.java @@ -33,6 +33,10 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.Variants; import org.junit.jupiter.api.Test; public class TestContentStats { @@ -331,6 +335,124 @@ public void setByPositionRequiredInteger() { assertThat(stats.fieldStats()).containsExactly(fieldStats); } + @Test + public void setByPositionOptionalGeometry() { + Schema tableSchema = new Schema(optional(1, "g", Types.GeometryType.crs84())); + Types.StructType rootStatsStruct = StatsUtil.contentStatsFor(tableSchema).type().asStructType(); + Types.StructType statsStructForFieldId = rootStatsStruct.fields().get(0).type().asStructType(); + // lower_bound, upper_bound, value_count, null_value_count + assertThat(statsStructForFieldId.fields()).hasSize(4); + + GenericRecord lower = + GenericRecord.create( + statsStructForFieldId.field(LOWER_BOUND.fieldName()).type().asStructType()); + lower.setField("x", -122.4); + lower.setField("y", 37.7); + lower.setField("z", null); + lower.setField("m", null); + + GenericRecord upper = + GenericRecord.create( + statsStructForFieldId.field(UPPER_BOUND.fieldName()).type().asStructType()); + upper.setField("x", -122.0); + upper.setField("y", 38.0); + upper.setField("z", null); + upper.setField("m", null); + + GenericRecord record = GenericRecord.create(statsStructForFieldId); + record.setField(LOWER_BOUND.fieldName(), lower); + record.setField(UPPER_BOUND.fieldName(), upper); + record.setField(VALUE_COUNT.fieldName(), 100L); + record.setField(NULL_VALUE_COUNT.fieldName(), 5L); + + ContentStats stats = new BaseContentStats(rootStatsStruct); + stats.set(0, record); + + FieldStats result = stats.fieldStats().get(0); + assertThat(result.valueCount()).isEqualTo(100L); + assertThat(result.nullValueCount()).isEqualTo(5L); + assertThat(result.tightBounds()).isFalse(); + assertThat(result.lowerBound()).isEqualTo(lower); + assertThat(result.upperBound()).isEqualTo(upper); + } + + @Test + public void setByPositionOptionalGeography() { + Schema tableSchema = new Schema(optional(1, "g", Types.GeographyType.crs84())); + Types.StructType rootStatsStruct = StatsUtil.contentStatsFor(tableSchema).type().asStructType(); + Types.StructType statsStructForFieldId = rootStatsStruct.fields().get(0).type().asStructType(); + // lower_bound, upper_bound, value_count, null_value_count + assertThat(statsStructForFieldId.fields()).hasSize(4); + + GenericRecord lower = + GenericRecord.create( + statsStructForFieldId.field(LOWER_BOUND.fieldName()).type().asStructType()); + lower.setField("x", 10.0); + lower.setField("y", 20.0); + lower.setField("z", 0.0); + lower.setField("m", null); + + GenericRecord upper = + GenericRecord.create( + statsStructForFieldId.field(UPPER_BOUND.fieldName()).type().asStructType()); + upper.setField("x", 30.0); + upper.setField("y", 40.0); + upper.setField("z", 100.0); + upper.setField("m", null); + + GenericRecord record = GenericRecord.create(statsStructForFieldId); + record.setField(LOWER_BOUND.fieldName(), lower); + record.setField(UPPER_BOUND.fieldName(), upper); + record.setField(VALUE_COUNT.fieldName(), 200L); + record.setField(NULL_VALUE_COUNT.fieldName(), 10L); + + ContentStats stats = new BaseContentStats(rootStatsStruct); + stats.set(0, record); + + FieldStats result = stats.fieldStats().get(0); + assertThat(result.valueCount()).isEqualTo(200L); + assertThat(result.nullValueCount()).isEqualTo(10L); + assertThat(result.tightBounds()).isFalse(); + assertThat(result.lowerBound()).isEqualTo(lower); + assertThat(result.upperBound()).isEqualTo(upper); + } + + @Test + public void setByPositionRequiredVariant() { + Schema tableSchema = new Schema(required(1, "v", Types.VariantType.get())); + Types.StructType rootStatsStruct = StatsUtil.contentStatsFor(tableSchema).type().asStructType(); + Types.StructType statsStructForFieldId = rootStatsStruct.fields().get(0).type().asStructType(); + // lower_bound, upper_bound, value_count, avg_value_size_in_bytes + assertThat(statsStructForFieldId.fields()).hasSize(4); + + VariantMetadata metadata = Variants.metadata("$['name']", "$['score']"); + ShreddedObject lower = Variants.object(metadata); + lower.put("$['name']", Variants.of("alice")); + lower.put("$['score']", Variants.of(1)); + Variant lowerVariant = Variant.of(metadata, lower); + + ShreddedObject upper = Variants.object(metadata); + upper.put("$['name']", Variants.of("zara")); + upper.put("$['score']", Variants.of(100)); + Variant upperVariant = Variant.of(metadata, upper); + + GenericRecord record = GenericRecord.create(statsStructForFieldId); + record.setField(LOWER_BOUND.fieldName(), lowerVariant); + record.setField(UPPER_BOUND.fieldName(), upperVariant); + record.setField(VALUE_COUNT.fieldName(), 50L); + record.setField(AVG_VALUE_SIZE_IN_BYTES.fieldName(), 128); + + ContentStats stats = new BaseContentStats(rootStatsStruct); + stats.set(0, record); + + FieldStats result = stats.fieldStats().get(0); + assertThat(result.valueCount()).isEqualTo(50L); + assertThat(result.avgValueSizeInBytes()).isEqualTo(128); + assertThat(result.tightBounds()).isFalse(); + assertThat(result.lowerBound()).isEqualTo(lowerVariant); + assertThat(result.upperBound()).isEqualTo(upperVariant); + } + @Test public void setByPositionWithInvalidLowerAndUpperBound() { Schema tableSchema = new Schema(required(1, "id", Types.IntegerType.get())); diff --git a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java index 72bad3528a80..f59ee8e42f75 100644 --- a/core/src/test/java/org/apache/iceberg/TestStatsUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java @@ -210,7 +210,7 @@ public void contentStatsForComplexSchema() { Types.StructType.of( optional( 10000, - "0", + "i", FieldStatistic.fieldStatsFor( required(0, "i", Types.IntegerType.get()), 10000)), optional(