From 168dc6d05e689624bcb435b0470291981c814918 Mon Sep 17 00:00:00 2001
From: wilbyang <yagn.wiby@gmail.com>
Date: Fri, 6 Mar 2026 09:15:26 +0100
Subject: [PATCH] feat: emit warning diagnostic when comparing with NULL using
 = or <>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #14434

Comparing a column against NULL with `=` or `<>` always produces NULL
(never TRUE or FALSE), which is almost certainly a mistake. The user
almost certainly meant `IS NULL` or `IS NOT NULL` instead.

This commit:

- Adds `warnings: Arc<Mutex<Vec<Diagnostic>>>` to `PlannerContext` so
  non-fatal diagnostics can be collected during planning without halting
  execution. The `Arc` ensures all clones (e.g. subquery contexts) share
  the same backing store, so warnings from nested queries surface to the
  top-level caller.
- Detects `expr = NULL` / `NULL = expr` / `expr <> NULL` in the binary
  expression stack machine and pushes a `Diagnostic::new_warning` with
  a help message suggesting `IS NULL` / `IS NOT NULL`.
- Adds `PlannerContext::add_warning` and `PlannerContext::take_warnings`
  as the public API for producing and consuming warnings.
- Adds 5 tests in `datafusion/sql/tests/cases/diagnostic.rs` covering
  the new warning, the negative case (`IS NULL` → no warning), and
  multiple warnings in one query.
---
 datafusion/sql/src/expr/mod.rs           |  42 ++++++++-
 datafusion/sql/src/planner.rs            |  35 +++++++-
 datafusion/sql/tests/cases/diagnostic.rs | 109 ++++++++++++++++++++++-
 3 files changed, 181 insertions(+), 5 deletions(-)
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 7902eed1e6922..eb8ab9fcab8d5 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -27,8 +27,8 @@ use sqlparser::ast::{
 };
 
 use datafusion_common::{
-    DFSchema, Result, ScalarValue, internal_datafusion_err, internal_err, not_impl_err,
-    plan_err,
+    DFSchema, Diagnostic, Result, ScalarValue, internal_datafusion_err, internal_err,
+    not_impl_err, plan_err,
 };
 
 use datafusion_expr::expr::ScalarFunction;
@@ -108,6 +108,44 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
                 StackEntry::Operator(op) => {
                     let right = eval_stack.pop().unwrap();
                     let left = eval_stack.pop().unwrap();
+                    // Warn when `expr = NULL` or `NULL = expr` is used.
+                    // Such comparisons always evaluate to NULL (never TRUE or
+                    // FALSE), so the user almost certainly meant `IS NULL` /
+                    // `IS NOT NULL` instead.
+                    if matches!(op, BinaryOperator::Eq | BinaryOperator::NotEq) {
+                        let right_is_null =
+                            matches!(right, Expr::Literal(ScalarValue::Null, _));
+                        let left_is_null =
+                            matches!(left, Expr::Literal(ScalarValue::Null, _));
+                        if right_is_null || left_is_null {
+                            // Point the span at the non-null operand (the
+                            // column or expression being compared), because
+                            // the NULL literal itself carries no span.
+                            let span = if right_is_null {
+                                left.spans().and_then(|s| s.first())
+                            } else {
+                                right.spans().and_then(|s| s.first())
+                            };
+                            let (op_str, suggestion) =
+                                if matches!(op, BinaryOperator::Eq) {
+                                    ("=", "IS NULL")
+                                } else {
+                                    ("<>", "IS NOT NULL")
+                                };
+                            let mut warning = Diagnostic::new_warning(
+                                format!(
+                                    "'{op_str} NULL' will always be NULL \
+                                     (null comparisons never return true or false)"
+                                ),
+                                span,
+                            );
+                            warning.add_help(
+                                format!("use '{suggestion}' instead"),
+                                None,
+                            );
+                            planner_context.add_warning(warning);
+                        }
+                    }
                     let expr = self.build_logical_expr(op, left, right, schema)?;
                     eval_stack.push(expr);
                 }
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index 307f28e8ff9ad..9d4add198d65c 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -18,7 +18,7 @@
 //! [`SqlToRel`]: SQL Query Planner (produces [`LogicalPlan`] from SQL AST)
 use std::collections::HashMap;
 use std::str::FromStr;
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 use std::vec;
 
 use crate::utils::make_decimal_type;
@@ -253,6 +253,11 @@ impl IdentNormalizer {
 /// This helps resolve scoping issues of CTEs.
 /// By using cloning, a subquery can inherit CTEs from the outer query
 /// and can also define its own private CTEs without affecting the outer query.
+///
+/// Warnings (non-fatal [`Diagnostic`]s) are shared across all clones via
+/// [`Arc`], so warnings collected in subquery contexts are visible from the
+/// top-level context. Use [`PlannerContext::take_warnings`] after planning to
+/// retrieve them.
 #[derive(Debug, Clone)]
 pub struct PlannerContext {
     /// Data types for numbered parameters ($1, $2, etc), if supplied
@@ -270,6 +275,9 @@ pub struct PlannerContext {
     outer_from_schema: Option<DFSchemaRef>,
     /// The query schema defined by the table
     create_table_schema: Option<DFSchemaRef>,
+    /// Non-fatal warnings collected during planning. Shared across clones (e.g.
+    /// for subqueries) so that all warnings surface to the top-level caller.
+    warnings: Arc<Mutex<Vec<Diagnostic>>>,
 }
 
 impl Default for PlannerContext {
@@ -287,9 +295,34 @@ impl PlannerContext {
             outer_queries_schemas_stack: vec![],
             outer_from_schema: None,
             create_table_schema: None,
+            warnings: Arc::new(Mutex::new(vec![])),
         }
     }
 
+    /// Add a non-fatal warning [`Diagnostic`] collected during planning.
+    ///
+    /// Warnings do not halt query execution. Call [`Self::take_warnings`] after
+    /// planning to retrieve them.
+    pub fn add_warning(&self, warning: Diagnostic) {
+        self.warnings
+            .lock()
+            .expect("warnings lock poisoned")
+            .push(warning);
+    }
+
+    /// Drain and return all non-fatal [`Diagnostic`] warnings collected during
+    /// planning. This is shared across cloned contexts (e.g. subqueries), so
+    /// calling this on the top-level context returns warnings from the entire
+    /// query.
+    pub fn take_warnings(&self) -> Vec<Diagnostic> {
+        std::mem::take(
+            &mut self
+                .warnings
+                .lock()
+                .expect("warnings lock poisoned"),
+        )
+    }
+
     /// Update the PlannerContext with provided prepare_param_data_types
     pub fn with_prepare_param_data_types(
         mut self,
diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs
index 7a729739469d3..22561e09e324a 100644
--- a/datafusion/sql/tests/cases/diagnostic.rs
+++ b/datafusion/sql/tests/cases/diagnostic.rs
@@ -21,8 +21,8 @@ use std::{collections::HashMap, sync::Arc};
 
 use datafusion_common::{Diagnostic, Location, Result, Span};
 use datafusion_sql::{
-    parser::{DFParser, DFParserBuilder},
-    planner::{ParserOptions, SqlToRel},
+    parser::{DFParser, DFParserBuilder, Statement as DFStatement},
+    planner::{ParserOptions, PlannerContext, SqlToRel},
 };
 use regex::Regex;
 
@@ -51,6 +51,33 @@ fn do_query(sql: &'static str) -> Diagnostic {
     }
 }
 
+/// Plan a query that is expected to **succeed** and return any non-fatal
+/// [`Diagnostic`] warnings that were collected during planning.
+fn do_query_warnings(sql: &'static str) -> Vec<Diagnostic> {
+    let statement = DFParserBuilder::new(sql)
+        .build()
+        .expect("unable to create parser")
+        .parse_statement()
+        .expect("unable to parse query");
+    let options = ParserOptions {
+        collect_spans: true,
+        ..ParserOptions::default()
+    };
+    let state = MockSessionState::default();
+    let context = MockContextProvider { state };
+    let sql_to_rel = SqlToRel::new_with_options(&context, options);
+    let mut planner_context = PlannerContext::new();
+    match statement {
+        DFStatement::Statement(s) => {
+            sql_to_rel
+                .sql_statement_to_plan_with_context(*s, &mut planner_context)
+                .expect("expected planning to succeed");
+        }
+        _ => panic!("expected a SQL statement"),
+    }
+    planner_context.take_warnings()
+}
+
 /// Given a query that contains tag delimited spans, returns a mapping from the
 /// span name to the [`Span`]. Tags are comments of the form `/*tag*/`. In case
 /// you want the same location to open two spans, or close open and open
@@ -390,3 +417,81 @@ fn test_syntax_error() -> Result<()> {
         },
     }
 }
+
+// ── = NULL warning tests ──────────────────────────────────────────────────────
+
+#[test]
+fn test_eq_null_warning_in_where() -> Result<()> {
+    let query = "SELECT * FROM person WHERE /*col*/first_name/*col*/ = NULL";
+    let spans = get_spans(query);
+    let warnings = do_query_warnings(query);
+    assert_eq!(warnings.len(), 1);
+    let w = &warnings[0];
+    assert_snapshot!(
+        w.message,
+        @"'= NULL' will always be NULL (null comparisons never return true or false)"
+    );
+    assert_eq!(w.span, Some(spans["col"]));
+    assert_snapshot!(w.helps[0].message, @"use 'IS NULL' instead");
+    Ok(())
+}
+
+#[test]
+fn test_null_eq_warning_in_where() -> Result<()> {
+    // NULL on the left side
+    let query = "SELECT * FROM person WHERE NULL = /*col*/first_name/*col*/";
+    let spans = get_spans(query);
+    let warnings = do_query_warnings(query);
+    assert_eq!(warnings.len(), 1);
+    let w = &warnings[0];
+    assert_snapshot!(
+        w.message,
+        @"'= NULL' will always be NULL (null comparisons never return true or false)"
+    );
+    assert_eq!(w.span, Some(spans["col"]));
+    assert_snapshot!(w.helps[0].message, @"use 'IS NULL' instead");
+    Ok(())
+}
+
+#[test]
+fn test_neq_null_warning_in_where() -> Result<()> {
+    // <> NULL also warrants a warning
+    let query = "SELECT * FROM person WHERE /*col*/first_name/*col*/ <> NULL";
+    let spans = get_spans(query);
+    let warnings = do_query_warnings(query);
+    assert_eq!(warnings.len(), 1);
+    let w = &warnings[0];
+    assert_snapshot!(
+        w.message,
+        @"'<> NULL' will always be NULL (null comparisons never return true or false)"
+    );
+    assert_eq!(w.span, Some(spans["col"]));
+    assert_snapshot!(w.helps[0].message, @"use 'IS NOT NULL' instead");
+    Ok(())
+}
+
+#[test]
+fn test_is_null_no_warning() -> Result<()> {
+    // IS NULL is the correct form – no warning expected
+    let query = "SELECT * FROM person WHERE first_name IS NULL";
+    let warnings = do_query_warnings(query);
+    assert!(
+        warnings.is_empty(),
+        "expected no warnings for IS NULL, got: {warnings:?}"
+    );
+    Ok(())
+}
+
+#[test]
+fn test_multiple_eq_null_warnings() -> Result<()> {
+    // Multiple = NULL comparisons in the same query each produce a warning
+    let query =
+        "SELECT * FROM person WHERE first_name = NULL AND last_name = NULL";
+    let warnings = do_query_warnings(query);
+    assert_eq!(warnings.len(), 2);
+    let expected =
+        "'= NULL' will always be NULL (null comparisons never return true or false)";
+    assert_eq!(warnings[0].message, expected);
+    assert_eq!(warnings[1].message, expected);
+    Ok(())
+}