Skip to content

Commit 732913f

Browse files
committed
HIVE-29516: Fix NPE in StatsUtils.updateStats when column statistics unavailable
Check column stats availability before passing useColStats=true in TezCompiler.removeSemijoinOptimizationByBenefit() to avoid an NPE when column statistics are not present.

Made-with: Cursor
1 parent d55885e commit 732913f

3 files changed

Lines changed: 181 additions & 1 deletion

File tree

ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx)
19771977
LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats);
19781978
LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows());
19791979
}
1980+
boolean useColStats = roi.filterStats.getColumnStats() != null;
19801981
StatsUtils.updateStats(roi.filterStats, newNumRows,
1981-
true, roi.filterOperator, roi.colNames);
1982+
useColStats, roi.filterOperator, roi.colNames);
19821983
if (LOG.isDebugEnabled()) {
19831984
LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats);
19841985
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- HIVE-29516: Verify that query compilation succeeds when column statistics
2+
-- are missing during semijoin optimization in removeSemijoinOptimizationByBenefit.
3+
4+
set hive.tez.dynamic.partition.pruning=true;
5+
set hive.tez.dynamic.semijoin.reduction=true;
6+
set hive.tez.bigtable.minsize.semijoin.reduction=1;
7+
set hive.tez.min.bloom.filter.entries=1;
8+
set hive.tez.bloom.filter.factor=1.0f;
9+
set hive.auto.convert.join=false;
10+
11+
create table t1_nocolstats (id int, val string);
12+
create table t2_nocolstats (id int, val string);
13+
14+
alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000');
15+
alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000');
16+
17+
explain
18+
select t1.id, t1.val, t2.val
19+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id;
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
PREHOOK: query: create table t1_nocolstats (id int, val string)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@t1_nocolstats
5+
POSTHOOK: query: create table t1_nocolstats (id int, val string)
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@t1_nocolstats
9+
PREHOOK: query: create table t2_nocolstats (id int, val string)
10+
PREHOOK: type: CREATETABLE
11+
PREHOOK: Output: database:default
12+
PREHOOK: Output: default@t2_nocolstats
13+
POSTHOOK: query: create table t2_nocolstats (id int, val string)
14+
POSTHOOK: type: CREATETABLE
15+
POSTHOOK: Output: database:default
16+
POSTHOOK: Output: default@t2_nocolstats
17+
PREHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
18+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
19+
PREHOOK: Input: default@t1_nocolstats
20+
PREHOOK: Output: default@t1_nocolstats
21+
POSTHOOK: query: alter table t1_nocolstats update statistics set('numRows'='100000000', 'rawDataSize'='2000000000')
22+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
23+
POSTHOOK: Input: default@t1_nocolstats
24+
POSTHOOK: Output: default@t1_nocolstats
25+
PREHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
26+
PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
27+
PREHOOK: Input: default@t2_nocolstats
28+
PREHOOK: Output: default@t2_nocolstats
29+
POSTHOOK: query: alter table t2_nocolstats update statistics set('numRows'='1000', 'rawDataSize'='20000')
30+
POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
31+
POSTHOOK: Input: default@t2_nocolstats
32+
POSTHOOK: Output: default@t2_nocolstats
33+
PREHOOK: query: explain
34+
select t1.id, t1.val, t2.val
35+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
36+
PREHOOK: type: QUERY
37+
PREHOOK: Input: default@t1_nocolstats
38+
PREHOOK: Input: default@t2_nocolstats
39+
#### A masked pattern was here ####
40+
POSTHOOK: query: explain
41+
select t1.id, t1.val, t2.val
42+
from t1_nocolstats t1 join t2_nocolstats t2 on t1.id = t2.id
43+
POSTHOOK: type: QUERY
44+
POSTHOOK: Input: default@t1_nocolstats
45+
POSTHOOK: Input: default@t2_nocolstats
46+
#### A masked pattern was here ####
47+
STAGE DEPENDENCIES:
48+
Stage-1 is a root stage
49+
Stage-0 depends on stages: Stage-1
50+
51+
STAGE PLANS:
52+
Stage: Stage-1
53+
Tez
54+
#### A masked pattern was here ####
55+
Edges:
56+
Map 1 <- Reducer 4 (BROADCAST_EDGE)
57+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
58+
Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE)
59+
#### A masked pattern was here ####
60+
Vertices:
61+
Map 1
62+
Map Operator Tree:
63+
TableScan
64+
alias: t1
65+
filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
66+
Statistics: Num rows: 100000000 Data size: 17860000188 Basic stats: COMPLETE Column stats: NONE
67+
Filter Operator
68+
predicate: (id is not null and id BETWEEN DynamicValue(RS_7_t2_id_min) AND DynamicValue(RS_7_t2_id_max) and in_bloom_filter(id, DynamicValue(RS_7_t2_id_bloom_filter))) (type: boolean)
69+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
70+
Select Operator
71+
expressions: id (type: int), val (type: string)
72+
outputColumnNames: _col0, _col1
73+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
74+
Reduce Output Operator
75+
key expressions: _col0 (type: int)
76+
null sort order: z
77+
sort order: +
78+
Map-reduce partition columns: _col0 (type: int)
79+
Statistics: Num rows: 95000000 Data size: 16967000178 Basic stats: COMPLETE Column stats: NONE
80+
value expressions: _col1 (type: string)
81+
Execution mode: vectorized, llap
82+
LLAP IO: all inputs
83+
Map 3
84+
Map Operator Tree:
85+
TableScan
86+
alias: t2
87+
filterExpr: id is not null (type: boolean)
88+
Statistics: Num rows: 1000 Data size: 178788 Basic stats: COMPLETE Column stats: NONE
89+
Filter Operator
90+
predicate: id is not null (type: boolean)
91+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
92+
Select Operator
93+
expressions: id (type: int), val (type: string)
94+
outputColumnNames: _col0, _col1
95+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
96+
Reduce Output Operator
97+
key expressions: _col0 (type: int)
98+
null sort order: z
99+
sort order: +
100+
Map-reduce partition columns: _col0 (type: int)
101+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
102+
value expressions: _col1 (type: string)
103+
Select Operator
104+
expressions: _col0 (type: int)
105+
outputColumnNames: _col0
106+
Statistics: Num rows: 950 Data size: 169848 Basic stats: COMPLETE Column stats: NONE
107+
Group By Operator
108+
aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=950)
109+
minReductionHashAggr: 0.99
110+
mode: hash
111+
outputColumnNames: _col0, _col1, _col2
112+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
113+
Reduce Output Operator
114+
null sort order:
115+
sort order:
116+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
117+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
118+
Execution mode: vectorized, llap
119+
LLAP IO: all inputs
120+
Reducer 2
121+
Execution mode: llap
122+
Reduce Operator Tree:
123+
Merge Join Operator
124+
condition map:
125+
Inner Join 0 to 1
126+
keys:
127+
0 _col0 (type: int)
128+
1 _col0 (type: int)
129+
outputColumnNames: _col0, _col1, _col3
130+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
131+
Select Operator
132+
expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
133+
outputColumnNames: _col0, _col1, _col2
134+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
135+
File Output Operator
136+
compressed: false
137+
Statistics: Num rows: 104500002 Data size: 18663700600 Basic stats: COMPLETE Column stats: NONE
138+
table:
139+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
140+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
141+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
142+
Reducer 4
143+
Execution mode: vectorized, llap
144+
Reduce Operator Tree:
145+
Group By Operator
146+
aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=950)
147+
mode: final
148+
outputColumnNames: _col0, _col1, _col2
149+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
150+
Reduce Output Operator
151+
null sort order:
152+
sort order:
153+
Statistics: Num rows: 1 Data size: 340 Basic stats: COMPLETE Column stats: NONE
154+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
155+
156+
Stage: Stage-0
157+
Fetch Operator
158+
limit: -1
159+
Processor Tree:
160+
ListSink

0 commit comments

Comments
 (0)