Skip to content

Commit 732913f

Browse files
committed
HIVE-29516: Fix NPE in StatsUtils.updateStats when column statistics unavailable
Check column stats availability before passing useColStats=true in TezCompiler.removeSemijoinOptimizationByBenefit() to avoid an NPE when column statistics are not present.

Made-with: Cursor
1 parent d55885e commit 732913f

3 files changed

Lines changed: 181 additions & 1 deletion

File tree

ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
19771977
LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
19781978
LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
19791979
}
1980+
boolean useColStats = roi.filterStats.getColumnStats() != null;
19801981
StatsUtils.updateStats(roi.filterStats, newNumRows,
1981-
true, roi.filterOperator, roi.colNames);
1982+
useColStats, roi.filterOperator, roi.colNames);
19821983
if (LOG.isDebugEnabled()) {
19831984
LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
19841985
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- HIVE-29516: Verify that query compilation succeeds when column statistics
2+
-- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit.
3+
4+
set hive.tez.dynamic.partition.pruning=true;
5+
set hive.tez.dynamic.semijoin.reduction=true;
6+
set hive.tez.bigtable.minsize.semijoin.reduction=1;
7+
set hive.tez.min.bloom.filter.entries=1;
8+
set hive.tez.bloom.filter.factor=1.0f;
9+
set hive.auto.convert.join=false;
10+
11+
create table t1_nocolstats (id int, val string);
12+
create table t2_nocolstats (id int, val string);
13+
14+
alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000');
15+
alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000');
16+
17+
explain
18+
select t1.id, t1.val, t2.val
19+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id;
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
PREHOOK: query: create table t1_nocolstats (id int, val string)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@t1_nocolstats
5+
POSTHOOK: query: create table t1_nocolstats (id int, val string)
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@t1_nocolstats
9+
PREHOOK: query: create table t2_nocolstats (id int, val string)
10+
PREHOOK: type: CREATETABLE
11+
PREHOOK: Output: database:default
12+
PREHOOK: Output: default@t2_nocolstats
13+
POSTHOOK: query: create table t2_nocolstats (id int, val string)
14+
POSTHOOK: type: CREATETABLE
15+
POSTHOOK: Output: database:default
16+
POSTHOOK: Output: default@t2_nocolstats
17+
PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
18+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
19+
PREHOOK: Input: default@t1_nocolstats
20+
PREHOOK: Output: default@t1_nocolstats
21+
POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
22+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
23+
POSTHOOK: Input: default@t1_nocolstats
24+
POSTHOOK: Output: default@t1_nocolstats
25+
PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
26+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
27+
PREHOOK: Input: default@t2_nocolstats
28+
PREHOOK: Output: default@t2_nocolstats
29+
POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
30+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
31+
POSTHOOK: Input: default@t2_nocolstats
32+
POSTHOOK: Output: default@t2_nocolstats
33+
PREHOOK: query: explain
34+
select t1.id, t1.val, t2.val
35+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
36+
PREHOOK: type: QUERY
37+
PREHOOK: Input: default@t1_nocolstats
38+
PREHOOK: Input: default@t2_nocolstats
39+
#### A masked pattern was here ####
40+
POSTHOOK: query: explain
41+
select t1.id, t1.val, t2.val
42+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
43+
POSTHOOK: type: QUERY
44+
POSTHOOK: Input: default@t1_nocolstats
45+
POSTHOOK: Input: default@t2_nocolstats
46+
#### A masked pattern was here ####
47+
STAGE DEPENDENCIES:
48+
Stage-1 is a root stage
49+
Stage-0 depends on stages: Stage-1
50+
51+
STAGE PLANS:
52+
Stage: Stage-1
53+
Tez
54+
#### A masked pattern was here ####
55+
Edges:
56+
Map 1 <- Reducer 4 (BROADCAST_EDGE)
57+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
58+
Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
59+
#### A masked pattern was here ####
60+
Vertices:
61+
Map 1
62+
Map Operator Tree:
63+
TableScan
64+
alias: t1
65+
filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
66+
Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE
67+
Filter Operator
68+
predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
69+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
70+
Select Operator
71+
expressions: id (type: int), val (type: string)
72+
outputColumnNames: _col0, _col1
73+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
74+
Reduce Output Operator
75+
key expressions: _col0 (type: int)
76+
null sort order: z
77+
sort order: +
78+
Map-reduce partition columns: _col0 (type: int)
79+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
80+
value expressions: _col1 (type: string)
81+
Execution mode: vectorized, llap
82+
LLAP IO: all inputs
83+
Map 3
84+
Map Operator Tree:
85+
TableScan
86+
alias: t2
87+
filterExpr: id is not null (type: boolean)
88+
Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE
89+
Filter Operator
90+
predicate: id is not null (type: boolean)
91+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
92+
Select Operator
93+
expressions: id (type: int), val (type: string)
94+
outputColumnNames: _col0, _col1
95+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
96+
Reduce Output Operator
97+
key expressions: _col0 (type: int)
98+
null sort order: z
99+
sort order: +
100+
Map-reduce partition columns: _col0 (type: int)
101+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
102+
value expressions: _col1 (type: string)
103+
Select Operator
104+
expressions: _col0 (type: int)
105+
outputColumnNames: _col0
106+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
107+
Group By Operator
108+
aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950)
109+
minReductionHashAggr: 0.99
110+
mode: hash
111+
outputColumnNames: _col0, _col1, _col2
112+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
113+
Reduce Output Operator
114+
null sort order:
115+
sort order:
116+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
117+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
118+
Execution mode: vectorized, llap
119+
LLAP IO: all inputs
120+
Reducer 2
121+
Execution mode: llap
122+
Reduce Operator Tree:
123+
Merge Join Operator
124+
condition map:
125+
Inner Join 0 to 1
126+
keys:
127+
0 _col0 (type: int)
128+
1 _col0 (type: int)
129+
outputColumnNames: _col0, _col1, _col3
130+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
131+
Select Operator
132+
expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
133+
outputColumnNames: _col0, _col1, _col2
134+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
135+
File Output Operator
136+
compressed: false
137+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
138+
table:
139+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
140+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
141+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
142+
Reducer 4
143+
Execution mode: vectorized, llap
144+
Reduce Operator Tree:
145+
Group By Operator
146+
aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950)
147+
mode: final
148+
outputColumnNames: _col0, _col1, _col2
149+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
150+
Reduce Output Operator
151+
null sort order:
152+
sort order:
153+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
154+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
155+
156+
Stage: Stage-0
157+
Fetch Operator
158+
limit: -1
159+
Processor Tree:
160+
ListSink

0 commit comments

Comments
 (0)