Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.

use datafusion_common::tree_node::Transformed;
use datafusion_common::{DataFusionError, Result};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};

Expand All @@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
/// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND`
/// - `EQ .*` to NotNull
/// - `NE .*` means IS EMPTY
/// - `NE .*` to col IS NOT NULL AND Boolean(NULL) (false for any string, or NULL if col is NULL)
///
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
pub fn simplify_regex_expr(
Expand Down Expand Up @@ -68,12 +68,11 @@ pub fn simplify_regex_expr(
// Handle the special case for ".*" pattern
if pattern == ANY_CHAR_REGEX_PATTERN {
let new_expr = if mode.not {
// not empty
let empty_lit = Box::new(string_scalar.to_expr(""));
let null_bool = lit(ScalarValue::Boolean(None));
Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Eq,
right: empty_lit,
left: Box::new(left.is_not_null()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this case should be is_null rather than is_not_null

I think this because when left is null, The expression null !~ '.*' should also evaluate to null

However, as written (col IS NOT NULL AND NULL) will evaluate

col IS NOT NULL AND NULL
-->  FALSE AND NULL
-->  FALSE 

If the transformation is col IS NULL AND NULL 🤯 then:

col IS NULL AND NULL
-->  true AND NULL
-->  NULL 

As expected

op: Operator::And,
right: Box::new(null_bool),
})
} else {
// not null
Expand Down
51 changes: 43 additions & 8 deletions datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ mod tests {
use arrow::datatypes::{DataType, Field, Schema};
use chrono::{DateTime, Utc};

use datafusion_common::ScalarValue;
use datafusion_expr::logical_plan::builder::table_scan_with_filters;
use datafusion_expr::logical_plan::table_scan;
use datafusion_expr::*;
Expand Down Expand Up @@ -883,17 +884,17 @@ mod tests {
"
)?;

// Test `!= ".*"` transforms to checking if the column is empty
// Test `!~ ".*"` transforms to CASE WHEN col IS NOT NULL THEN FALSE ELSE NULL END
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r#"
Filter: test.a = Utf8("")
@ r"
Filter: test.a IS NOT NULL AND Boolean(NULL)
TableScan: test
"#
"
)?;

// Test case-insensitive versions
Expand All @@ -911,17 +912,51 @@ mod tests {
"
)?;

// Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty
// Test NULL `!~ ".*"` transforms to Boolean(NULL)
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(
lit(ScalarValue::Utf8(None)),
Operator::RegexNotMatch,
lit(".*"),
))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r"
Filter: Boolean(NULL)
TableScan: test
"
)?;

// Test `!~* ".*"` (case-insensitive) transforms to false
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r#"
Filter: test.a = Utf8("")
@ r"
Filter: test.a IS NOT NULL AND Boolean(NULL)
TableScan: test
"#
"
)?;

// Test NULL `!~* ".*"` transforms to Boolean(NULL)
let plan = LogicalPlanBuilder::from(table_scan.clone())
.filter(binary_expr(
lit(ScalarValue::Utf8(None)),
Operator::RegexNotIMatch,
lit(".*"),
))?
.build()?;

assert_optimized_plan_equal!(
plan,
@ r"
Filter: Boolean(NULL)
TableScan: test
"
)
}

Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/simplify_expr.slt
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ query TT
explain select b from t where b !~ '.*'
----
logical_plan
01)Filter: t.b = Utf8View("")
01)Filter: t.b IS NOT NULL AND Boolean(NULL)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add a test that actually executes col !~ .* for a column that has 'foo', '', and null

Specifically like this (the ::text is needed for postgres):

-- Reproducer for regex simplification of `col !~ '.*'`
-- Input values: 'foo', '', NULL

WITH t(col) AS (
    VALUES
      ('foo'::text),
      (''::text),
      (NULL::text)
  )
  SELECT
    col,
    col !~ '.*' AS not_match_dot_star
  FROM t;

Should result in

+------+-----------------+
| a    | baseline_result |
+------+-----------------+
| foo  | false           |
|      | false           |
| NULL | NULL            |
+------+-----------------+

Here is what current datafusion does

andrewlamb@Andrews-MacBook-Pro-3:~/Software/datafusion2$ datafusion-cli
DataFusion CLI v52.1.0
> WITH t(col) AS (
    VALUES
      ('foo'::text),
      (''::text),
      (NULL::text)
  )
  SELECT
    col,
    col !~ '.*' AS not_match_dot_star
  FROM t;
+------+--------------------+
| col  | not_match_dot_star |
+------+--------------------+
| foo  | false              |
|      | true               |
| NULL | NULL               |
+------+--------------------+
3 row(s) fetched.
Elapsed 0.038 seconds.

Here is what postgres says

postgres=#   WITH t(col) AS (
    VALUES
      ('foo'::text),
      (''::text),
      (NULL::text)
  )
  SELECT
    col,
    col !~ '.*' AS not_match_dot_star
  FROM t;
 col | not_match_dot_star
-----+--------------------
 foo | f
     | f
     |
(3 rows)

I think this branch will not get the right value for col IS NULL

02)--TableScan: t projection=[b]
physical_plan
01)FilterExec: b@0 =
01)FilterExec: b@0 IS NOT NULL AND NULL
02)--DataSourceExec: partitions=1, partition_sizes=[1]

query T
Expand Down
Loading