apache
diff --git a/‎iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergBucket.java‎
Lines changed: 34 additions & 1 deletion b/‎iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/udf/GenericUDFIcebergBucket.java‎
Lines changed: 34 additions & 1 deletion
diff --git a/‎iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/udf/TestGenericUDFIcebergBucketStatEstimator.java‎
Lines changed: 73 additions & 0 deletions b/‎iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/udf/TestGenericUDFIcebergBucketStatEstimator.java‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q‎
Lines changed: 4 additions & 4 deletions b/‎iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q‎
Lines changed: 4 additions & 4 deletions
@@ -19,11 +19,16 @@
 package org.apache.iceberg.mr.hive.udf;
 
 import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Optional;
 import java.util.function.Function;
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.serde2.io.DateWritableV2;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
@@ -52,7 +57,7 @@
     value = "_FUNC_(value, bucketCount) - " +
         "Returns the bucket value calculated by Iceberg bucket transform function ",
     extended = "Example:\n  > SELECT _FUNC_('A bucket full of ice!', 5);\n  4")
-public class GenericUDFIcebergBucket extends GenericUDF {
+public class GenericUDFIcebergBucket extends GenericUDF implements StatEstimatorProvider {
   private final IntWritable result = new IntWritable();
   private int numBuckets = -1;
   private transient PrimitiveObjectInspector argumentOI;
@@ -209,4 +214,32 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException {
   public String getDisplayString(String[] children) {
     return getStandardDisplayString("iceberg_bucket", children);
   }
+
+  @Override
+  public StatEstimator getStatEstimator() {
+    return new BucketStatEstimator();
+  }
+
+  /**
+   * Derives output column statistics for the bucket function from the statistics of its arguments.
+   * The number of distinct values is capped at the bucket count and the value range becomes
+   * [0, bucketCount - 1]; every other statistic is copied from the value argument.
+   */
+  private static class BucketStatEstimator implements StatEstimator {
+    /**
+     * @param argStats statistics of the function arguments, expected as (value, bucketCount)
+     * @return an adjusted copy of the value statistics, or empty when the argument count is not two
+     *     or no positive bucket count can be read from the second argument's range
+     */
+    @Override
+    public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
+      if (argStats.size() != 2) {
+        return Optional.empty();
+      }
+      ColStatistics valueStats = argStats.get(0);
+      ColStatistics.Range countRange = argStats.get(1).getRange();
+      boolean lowerBoundKnown = countRange != null && countRange.minValue != null;
+      // The lower bound of the range is read as the bucket count.
+      // A missing bound maps to 0 so that it is rejected by the positivity check below.
+      long bucketCount = lowerBoundKnown ? countRange.minValue.longValue() : 0L;
+      if (bucketCount <= 0) {
+        return Optional.empty();
+      }
+      ColStatistics bucketStats = valueStats.clone();
+      bucketStats.setCountDistint(Math.min(valueStats.getCountDistint(), bucketCount));
+      bucketStats.setRange(0, bucketCount - 1);
+      return Optional.of(bucketStats);
+    }
+  }
 }
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.iceberg.mr.hive.udf;
+
+import java.util.Arrays;
+import java.util.Optional;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for the stat estimator exposed by {@link GenericUDFIcebergBucket}.
+ * Each case checks the distinct-value count (NDV) reported for a given source NDV and bucket count.
+ */
+public class TestGenericUDFIcebergBucketStatEstimator {
+
+  @Test
+  public void testNdvNarrowedByBucketCount() {
+    // 100 distinct source values, 8 buckets: the estimate is capped at 8
+    assertEstimatedNdv(8, 100, 8);
+  }
+
+  @Test
+  public void testNdvBelowBucketCount() {
+    // 3 distinct source values, 8 buckets: the source NDV is kept
+    assertEstimatedNdv(3, 3, 8);
+  }
+
+  @Test
+  public void testNdvEqualsBucketCount() {
+    // 8 distinct source values, 8 buckets: both bounds agree on 8
+    assertEstimatedNdv(8, 8, 8);
+  }
+
+  @Test
+  public void testZeroBucketsReturnsEmpty() {
+    // a bucket count of zero yields no estimate
+    Assert.assertFalse(estimateBucket(100, 0).isPresent());
+  }
+
+  /** Asserts that an estimate is produced and that it reports the expected distinct-value count. */
+  private void assertEstimatedNdv(long expectedNdv, long sourceNdv, long numBuckets) {
+    Optional<ColStatistics> estimate = estimateBucket(sourceNdv, numBuckets);
+    Assert.assertTrue(estimate.isPresent());
+    Assert.assertEquals(expectedNdv, estimate.get().getCountDistint());
+  }
+
+  /**
+   * Runs the estimator on a value column with the given NDV and a bucket-count argument whose
+   * range is the single point {@code numBuckets}.
+   */
+  private Optional<ColStatistics> estimateBucket(long sourceNdv, long numBuckets) {
+    ColStatistics valueStats = new ColStatistics("col", "int");
+    valueStats.setCountDistint(sourceNdv);
+
+    ColStatistics bucketCountStats = new ColStatistics("numBuckets", "int");
+    bucketCountStats.setRange(numBuckets, numBuckets);
+
+    StatEstimator estimator = new GenericUDFIcebergBucket().getStatEstimator();
+    return estimator.estimate(Arrays.asList(valueStats, bucketCountStats));
+  }
+}
@@ -31,8 +31,8 @@ insert overwrite table tbl_target_identity select a, b from tbl_src;
 select * from tbl_target_identity order by a, ccy;
 
 --bucketed case - should invoke GenericUDFIcebergBucket to calculate buckets before sorting
-create external table tbl_target_bucket (a int, ccy string) partitioned by spec (bucket (2, ccy)) stored by iceberg stored as orc;
--- threshold = 0 (default, cost-based): NDV of b (~5) < MAX_WRITERS -> no sort (FanoutWriter)
+create external table tbl_target_bucket (a int, ccy string) partitioned by spec (bucket (3, ccy)) stored by iceberg stored as orc;
+-- threshold = 0 (default, cost-based): bucket NDV = min(NDV(b), 3) = 3 < MAX_WRITERS -> no sort (FanoutWriter)
 explain insert into table tbl_target_bucket select a, b from tbl_src;
 insert into table tbl_target_bucket select a, b from tbl_src;
 select * from tbl_target_bucket order by a, ccy;
@@ -165,12 +165,12 @@ set hive.optimize.sort.dynamic.partition.threshold=1;
 explain insert into tbl_target_identity select a, b from tbl_src;
 explain insert into tbl_target_bucket select a, b from tbl_src;
 
--- threshold = 2: NDV of b (~5) > 2 -> sort (ClusteredWriter)
+-- threshold = 2: bucket NDV = min(NDV(b), 3) = 3 > 2 -> sort (ClusteredWriter)
 set hive.optimize.sort.dynamic.partition.threshold=2;
 explain insert into tbl_target_identity select a, b from tbl_src;
 explain insert into tbl_target_bucket select a, b from tbl_src;
 
--- threshold = 100: NDV of b (~5) <= 100 -> no sort (FanoutWriter)
+-- threshold = 100: bucket NDV = min(NDV(b), 3) = 3 <= 100 -> no sort (FanoutWriter)
 set hive.optimize.sort.dynamic.partition.threshold=100;
 explain insert into tbl_target_identity select a, b from tbl_src;
 explain insert into tbl_target_bucket select a, b from tbl_src;