From 9e055130022a822fe15decfaa82f9495623f4cf4 Mon Sep 17 00:00:00 2001 From: Jiawei Zhao Date: Thu, 18 Jun 2026 22:27:39 +0800 Subject: [PATCH 1/2] refactor: name build-row and matchable-map presence checks in hash join `state_after_build_ready` distinguishes two facts about the collected build side: whether it physically has rows, and whether its hash map has any matchable entries. These differ under `NullEquality::NullEqualsNothing`, where build rows with a NULL join key are omitted from the map -- the distinction that was the source of the bug fixed in #22893. Expose it as named methods on `JoinLeftData` (`has_build_rows` / `has_matchable_build_rows`) instead of repeating raw `batch().num_rows()` / `map().is_empty()` checks at the call site, so the invariant is visible at the API surface and future uses are harder to get wrong. No behavior change. Closes #23008 Signed-off-by: Jiawei Zhao --- .../physical-plan/src/joins/hash_join/exec.rs | 19 +++++++++++++++++++ .../src/joins/hash_join/stream.rs | 4 ++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index bb9ebcd4e619..f47e247db668 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -229,6 +229,25 @@ impl JoinLeftData { &self.batch } + /// Returns `true` if the build side physically contains rows. + /// + /// This is distinct from [`Self::has_matchable_build_rows`]: a build side + /// can hold rows while its hash map is empty (see that method). + pub(super) fn has_build_rows(&self) -> bool { + self.batch().num_rows() > 0 + } + + /// Returns `true` if the build-side hash map has any matchable entries. + /// + /// Under [`NullEquality::NullEqualsNothing`] build rows whose join key is + /// NULL are omitted from the map, so this can be `false` even when + /// [`Self::has_build_rows`] is `true`. + /// + /// [`NullEquality::NullEqualsNothing`]: datafusion_common::NullEquality::NullEqualsNothing + pub(super) fn has_matchable_build_rows(&self) -> bool { + !self.map().is_empty() + } + /// returns a reference to the build side expressions values pub(super) fn values(&self) -> &[ArrayRef] { &self.values diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index ed605301ad4a..82f9508d872d 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -523,12 +523,12 @@ impl HashJoinStream { join_type: JoinType, left_data: &JoinLeftData, ) -> HashJoinStreamState { - let build_empty = left_data.batch().num_rows() == 0; + let build_empty = !left_data.has_build_rows(); // The map can be empty even when the build side has rows: under // `NullEqualsNothing`, build rows with a NULL join key are omitted. For // join types whose every output row requires a build match, that still // guarantees an empty result, so we can skip scanning the probe side. - let map_empty = left_data.map().is_empty(); + let map_empty = !left_data.has_matchable_build_rows(); if (build_empty && join_type.empty_build_side_produces_empty_result()) || (map_empty && join_type.empty_map_produces_empty_result()) From b39f8f1f9ace5899c5ca1814660f0717ee04aa62 Mon Sep 17 00:00:00 2001 From: Jiawei Zhao Date: Sat, 20 Jun 2026 08:04:04 +0800 Subject: [PATCH 2/2] refactor: simplify presence-check doc and reuse the helper Drop the explicit intra-doc link reference on `has_matchable_build_rows`: the inline `[`NullEquality::NullEqualsNothing`]` link resolves on its own since the type is in scope, so spelling out the path again is redundant. Also use `has_matchable_build_rows()` for the empty-build-side check in `HashJoinStream` instead of a raw `map().is_empty()`, keeping the presence checks consistent. No behavior change. Signed-off-by: Jiawei Zhao --- datafusion/physical-plan/src/joins/hash_join/exec.rs | 2 -- datafusion/physical-plan/src/joins/hash_join/stream.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index f47e247db668..b1d387ea7455 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -242,8 +242,6 @@ impl JoinLeftData { /// Under [`NullEquality::NullEqualsNothing`] build rows whose join key is /// NULL are omitted from the map, so this can be `false` even when /// [`Self::has_build_rows`] is `true`. - /// - /// [`NullEquality::NullEqualsNothing`]: datafusion_common::NullEquality::NullEqualsNothing pub(super) fn has_matchable_build_rows(&self) -> bool { !self.map().is_empty() } diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index 82f9508d872d..2aa6e69dff80 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -779,7 +779,7 @@ impl HashJoinStream { } } - let is_empty = build_side.left_data.map().is_empty(); + let is_empty = !build_side.left_data.has_matchable_build_rows(); if is_empty { let result = build_batch_empty_build_side(