Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public class ColStatistics {
private boolean isPrimaryKey;
private boolean isEstimated;
private boolean isFilteredColumn;
private boolean isConst;
private byte[] bitVectors;
private byte[] histogram;

Expand Down Expand Up @@ -155,6 +156,8 @@ public String toString() {

sb.append(" isEstimated: ");
sb.append(isEstimated);
sb.append(" isConst: ");
sb.append(isConst);
return sb.toString();
}

Expand All @@ -171,6 +174,7 @@ public ColStatistics clone() {
clone.setPrimaryKey(isPrimaryKey);
clone.setIsEstimated(isEstimated);
clone.setIsFilteredColumn(isFilteredColumn);
clone.setConst(isConst);
if (range != null ) {
clone.setRange(range.clone());
}
Expand Down Expand Up @@ -232,4 +236,12 @@ public boolean isFilteredColumn() {
return isFilteredColumn;
}

public boolean isConst() {
return isConst;
}

public void setConst(boolean isConst) {
this.isConst = isConst;
}

}
5 changes: 4 additions & 1 deletion ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -823,9 +823,11 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
if (numTrues == 0 && numFalses == 0) {
// All NULL column - no non-null distinct values
cs.setCountDistint(0);
cs.setConst(true);
} else if (numTrues == 0 || numFalses == 0) {
// One value type confirmed absent (=0), other is present (>0) or unknown (<0)
cs.setCountDistint(1);
cs.setConst(csd.getBooleanStats().getNumNulls() == 0 && (numTrues > 0 || numFalses > 0));
} else {
// Both != 0: either both present (>0), both unknown (<0), or one present + one unknown
cs.setCountDistint(2);
Expand Down Expand Up @@ -1646,6 +1648,7 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows
colStats.setAvgColLen(avgColSize);
colStats.setCountDistint(countDistincts);
colStats.setNumNulls(numNulls);
colStats.setConst(true);

Optional<Number> value = getConstValue(encd);
value.ifPresent(number -> colStats.setRange(number, number));
Expand Down Expand Up @@ -2112,7 +2115,7 @@ private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats
for (ColStatistics cs : colStats) {
if (cs != null) {
long ndv = cs.getCountDistint();
if (cs.getNumNulls() > 0) {
if (cs.getNumNulls() > 0 && (ndv > 0 || cs.isConst())) {
ndv = StatsUtils.safeAdd(ndv, 1);
}
ndvValues.add(ndv);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public void add(ColStatistics stat) {
if (stat.isFilteredColumn()) {
result.setFilterColumn();
}
result.setConst(result.isConst() && stat.isConst());
}
public Optional<ColStatistics> getResult() {
return Optional.of(result);
Expand Down
60 changes: 60 additions & 0 deletions ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,46 @@ void testGetColStatisticsBooleanAllNull() {

assertNotNull(cs);
assertEquals(0, cs.getCountDistint(), "Boolean NDV should be 0 for all-NULL column");
assertEquals(true, cs.isConst(), "all-NULL boolean column must be marked isConst");
}

@ParameterizedTest(name = "isConst=true for verified single-value boolean (numTrues={0}, numFalses={1}, numNulls=0)")
@org.junit.jupiter.params.provider.CsvSource({"100, 0", "0, 100"})
void testGetColStatisticsBooleanIsConstForVerifiedSingleValue(long numTrues, long numFalses) {
ColumnStatisticsObj cso = new ColumnStatisticsObj();
cso.setColName("bool_col");
cso.setColType(serdeConstants.BOOLEAN_TYPE_NAME);
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumTrues(numTrues);
boolStats.setNumFalses(numFalses);
boolStats.setNumNulls(0);
ColumnStatisticsData data = new ColumnStatisticsData();
data.setBooleanStats(boolStats);
cso.setStatsData(data);

ColStatistics cs = StatsUtils.getColStatistics(cso, "bool_col");

assertEquals(true, cs.isConst());
}

@ParameterizedTest(name = "isConst stays false for boolean (numTrues={0}, numFalses={1})")
@org.junit.jupiter.params.provider.CsvSource({
"100, 0", "0, 100", "-1, 100", "100, -1", "50, 50"})
void testGetColStatisticsBooleanIsConstNotSetForNonAllNullCases(long numTrues, long numFalses) {
ColumnStatisticsObj cso = new ColumnStatisticsObj();
cso.setColName("bool_col");
cso.setColType(serdeConstants.BOOLEAN_TYPE_NAME);
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumTrues(numTrues);
boolStats.setNumFalses(numFalses);
boolStats.setNumNulls(10);
ColumnStatisticsData data = new ColumnStatisticsData();
data.setBooleanStats(boolStats);
cso.setStatsData(data);

ColStatistics cs = StatsUtils.getColStatistics(cso, "bool_col");

assertEquals(false, cs.isConst());
}

@Test
Expand Down Expand Up @@ -565,4 +605,24 @@ void testGetColStatisticsTimestampType() {
assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP");
}

@Test
void testColStatisticsIsConstDefaultsFalse() {
ColStatistics cs = new ColStatistics("c", "int");
assertEquals(false, cs.isConst(), "isConst should default to false");
}

@Test
void testColStatisticsIsConstSetterAndClone() {
ColStatistics cs = new ColStatistics("c", "int");
cs.setConst(true);
assertEquals(true, cs.isConst(), "setConst(true) should be observable via isConst()");

ColStatistics clone = cs.clone();
assertEquals(true, clone.isConst(), "clone() must propagate isConst");

cs.setConst(false);
assertEquals(false, cs.isConst(), "setConst(false) should clear the flag");
assertEquals(true, clone.isConst(), "Clone should be independent of source after mutation");
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;

class TestPessimisticStatCombiner {

Expand Down Expand Up @@ -155,6 +157,21 @@ void testCombineBothUnknownNumTruesAndNumFalses() {
assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)");
}

@ParameterizedTest(name = "combine isConst({0}, {1}) = {2}")
@CsvSource({"true, true, true", "true, false, false", "false, true, false", "false, false, false"})
void testCombineIsConstAndSemantics(boolean stat1Const, boolean stat2Const, boolean expected) {
ColStatistics stat1 = createStat("col1", "int", 0, 100, 4.0);
stat1.setConst(stat1Const);
ColStatistics stat2 = createStat("col2", "int", 0, 100, 4.0);
stat2.setConst(stat2Const);

PessimisticStatCombiner combiner = new PessimisticStatCombiner();
combiner.add(stat1);
combiner.add(stat2);

assertEquals(expected, combiner.getResult().get().isConst());
}

private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) {
ColStatistics stat = new ColStatistics(name, type);
stat.setCountDistint(ndv);
Expand Down
30 changes: 30 additions & 0 deletions ql/src/test/queries/clientpositive/groupby_unknown_ndv.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- HIVE-29556: GROUP BY of a column with unknown NDV (NDV=0 sentinel + numNulls>0)
-- must not collapse to "1 row" estimate. Both probes feed a join sized so that
-- master's bogus 1-row estimate triggers Map Join while the heuristic fallback
-- forces Merge Join.

SET hive.auto.convert.join=true;

CREATE TABLE big (k BIGINT);
CREATE TABLE small (id BIGINT, name STRING);

ALTER TABLE big UPDATE STATISTICS SET('numRows'='100000000');
ALTER TABLE big UPDATE STATISTICS for column k SET ('numDVs'='0','numNulls'='100000000');

ALTER TABLE small UPDATE STATISTICS SET('numRows'='1000000');
ALTER TABLE small UPDATE STATISTICS for column id SET ('numDVs'='1000000','numNulls'='0');
ALTER TABLE small UPDATE STATISTICS for column name SET ('numDVs'='1000000','numNulls'='0','avgColLen'='100','maxColLen'='100');

-- U1: direct column with unknown NDV.
EXPLAIN
SELECT s.name, g.cnt
FROM (SELECT k, COUNT(*) AS cnt FROM big GROUP BY k) g
JOIN small s ON g.cnt = s.id;

-- U2: PessimisticStatCombiner output for CASE WHEN with one NULL branch.
EXPLAIN
SELECT s.name, g.cnt
FROM (SELECT x, COUNT(*) AS cnt
FROM (SELECT CASE WHEN k > 0 THEN k ELSE cast(NULL AS BIGINT) END AS x FROM big) t
GROUP BY x) g
JOIN small s ON g.cnt = s.id;
Loading
Loading