-
Notifications
You must be signed in to change notification settings - Fork 13.9k
[FLINK-39064][Table SQL / API] Add built-in REGEXP_SPLIT function to split string by regular expression pattern #27577
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1362,6 +1362,18 @@ def regexp_substr(self, regex) -> 'Expression': | |
| """ | ||
| return _binary_op("regexpSubstr")(self, regex) | ||
|
|
||
| def regexp_split(self, regex) -> 'Expression': | ||
| """ | ||
| Splits the string by the regular expression regex and returns an array of strings. | ||
| Returns null if any of the arguments are null or regex is invalid. | ||
|
|
||
| E.g., regexp_split('Hello123World456', '[0-9]+') returns ['Hello', 'World', '']. | ||
|
|
||
| :param regex: A STRING expression with a matching pattern. | ||
| :return: An ARRAY<STRING> of split substrings. | ||
| """ | ||
|
Comment on lines
+1366
to
+1374
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ideally this should mirror the JavaDocs in BaseExpressions. |
||
| return _binary_op("regexpSplit")(self, regex) | ||
|
|
||
| @property | ||
| def from_base64(self) -> 'Expression[str]': | ||
| """ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -445,6 +445,20 @@ ANY, and(logical(LogicalTypeRoot.BOOLEAN), LITERAL) | |
| .runtimeClass("org.apache.flink.table.runtime.functions.scalar.SplitFunction") | ||
| .build(); | ||
|
|
||
| public static final BuiltInFunctionDefinition REGEXP_SPLIT = | ||
| BuiltInFunctionDefinition.newBuilder() | ||
| .name("REGEXP_SPLIT") | ||
| .sqlName("REGEXP_SPLIT") | ||
| .kind(SCALAR) | ||
| .inputTypeStrategy( | ||
| sequence( | ||
| logical(LogicalTypeFamily.CHARACTER_STRING), | ||
| logical(LogicalTypeFamily.CHARACTER_STRING))) | ||
|
Comment on lines
+453
to
+456
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| .outputTypeStrategy(nullableIfArgs(explicit(DataTypes.ARRAY(STRING())))) | ||
| .runtimeClass( | ||
| "org.apache.flink.table.runtime.functions.scalar.RegexpSplitFunction") | ||
| .build(); | ||
|
|
||
| public static final BuiltInFunctionDefinition URL_DECODE = | ||
| BuiltInFunctionDefinition.newBuilder() | ||
| .name("URL_DECODE") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,7 +37,8 @@ Stream<TestSetSpec> getTestSetSpecs() { | |
| regexpExtractTestCases(), | ||
| regexpExtractAllTestCases(), | ||
| regexpInstrTestCases(), | ||
| regexpSubstrTestCases()) | ||
| regexpSubstrTestCases(), | ||
| regexpSplitTestCases()) | ||
| .flatMap(s -> s); | ||
| } | ||
|
|
||
|
|
@@ -387,4 +388,94 @@ private Stream<TestSetSpec> regexpSubstrTestCases() { | |
| "Invalid input arguments. Expected signatures are:\n" | ||
| + "REGEXP_SUBSTR(str <CHARACTER_STRING>, regex <CHARACTER_STRING>)")); | ||
| } | ||
|
|
||
| private Stream<TestSetSpec> regexpSplitTestCases() { | ||
| return Stream.of( | ||
| TestSetSpec.forFunction(BuiltInFunctionDefinitions.REGEXP_SPLIT) | ||
| .onFieldsWithData( | ||
| "Hello123World456", | ||
| null, | ||
| "a,b;c|d", | ||
| "one two three", | ||
| 123, | ||
| "12345", | ||
| ",123,,,123,") | ||
| .andDataTypes( | ||
| DataTypes.STRING().notNull(), | ||
| DataTypes.STRING(), | ||
| DataTypes.STRING().notNull(), | ||
| DataTypes.STRING().notNull(), | ||
| DataTypes.INT().notNull(), | ||
| DataTypes.STRING().notNull(), | ||
| DataTypes.STRING()) | ||
| // Basic regex split | ||
| .testResult( | ||
| $("f0").regexpSplit("[0-9]+"), | ||
| "REGEXP_SPLIT(f0, '[0-9]+')", | ||
| new String[] {"Hello", "World", ""}, | ||
| DataTypes.ARRAY(DataTypes.STRING()).notNull()) | ||
| // null input test | ||
| .testResult( | ||
| $("f0").regexpSplit(null), | ||
| "REGEXP_SPLIT(f0, NULL)", | ||
| null, | ||
| DataTypes.ARRAY(DataTypes.STRING())) | ||
| // Empty regex - split by character | ||
| .testResult( | ||
| $("f5").regexpSplit(""), | ||
| "REGEXP_SPLIT(f5, '')", | ||
| new String[] {"1", "2", "3", "4", "5"}, | ||
| DataTypes.ARRAY(DataTypes.STRING()).notNull()) | ||
| // null string input | ||
| .testResult( | ||
| $("f1").regexpSplit("[0-9]+"), | ||
| "REGEXP_SPLIT(f1, '[0-9]+')", | ||
| null, | ||
| DataTypes.ARRAY(DataTypes.STRING())) | ||
| // null string and null pattern | ||
| .testResult( | ||
| $("f1").regexpSplit(null), | ||
| "REGEXP_SPLIT(f1, null)", | ||
| null, | ||
| DataTypes.ARRAY(DataTypes.STRING())) | ||
| // Multi-character delimiter regex | ||
| .testResult( | ||
| $("f2").regexpSplit("[,;|]"), | ||
| "REGEXP_SPLIT(f2, '[,;|]')", | ||
| new String[] {"a", "b", "c", "d"}, | ||
| DataTypes.ARRAY(DataTypes.STRING()).notNull()) | ||
| // Whitespace regex | ||
| .testResult( | ||
| $("f3").regexpSplit("\\s+"), | ||
| "REGEXP_SPLIT(f3, '\\s+')", | ||
| new String[] {"one", "two", "three"}, | ||
| DataTypes.ARRAY(DataTypes.STRING()).notNull()) | ||
| // No match - return original string | ||
| .testResult( | ||
| $("f5").regexpSplit("[a-z]+"), | ||
| "REGEXP_SPLIT(f5, '[a-z]+')", | ||
| new String[] {"12345"}, | ||
| DataTypes.ARRAY(DataTypes.STRING()).notNull()) | ||
| // Invalid regex - return null | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add literal/non-literal invalid input tests. |
||
| .testResult( | ||
| $("f6").regexpSplit("("), | ||
| "REGEXP_SPLIT(f6, '(')", | ||
| null, | ||
| DataTypes.ARRAY(DataTypes.STRING())) | ||
| // Validation error for non-string type input | ||
| .testTableApiValidationError( | ||
| $("f4").regexpSplit("[0-9]+"), | ||
| "Invalid input arguments. Expected signatures are:\n" | ||
| + "REGEXP_SPLIT(<CHARACTER_STRING>, <CHARACTER_STRING>)") | ||
| .testSqlValidationError( | ||
| "REGEXP_SPLIT(f4, '[0-9]+')", | ||
| "Invalid input arguments. Expected signatures are:\n" | ||
| + "REGEXP_SPLIT(<CHARACTER_STRING>, <CHARACTER_STRING>)") | ||
| .testSqlValidationError( | ||
| "REGEXP_SPLIT()", | ||
| "No match found for function signature REGEXP_SPLIT()") | ||
| .testSqlValidationError( | ||
| "REGEXP_SPLIT(f1, '1', '2')", | ||
| "No match found for function signature REGEXP_SPLIT(<CHARACTER>, <CHARACTER>, <CHARACTER>)")); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -491,6 +491,24 @@ public static Matcher getRegexpMatcher(@Nullable StringData str, @Nullable Strin | |||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * Returns a compiled Pattern object for the given regular expression string, using a shared | ||||||||||||
| * cache for performance optimization. | ||||||||||||
| * | ||||||||||||
| * @param regex the regular expression pattern string | ||||||||||||
| * @return the compiled Pattern, or null if regex is null or invalid | ||||||||||||
| */ | ||||||||||||
| public static @Nullable Pattern getRegexpPattern(@Nullable String regex) { | ||||||||||||
| if (regex == null) { | ||||||||||||
| return null; | ||||||||||||
| } | ||||||||||||
|
Comment on lines
+501
to
+504
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can also make this non-null
Suggested change
|
||||||||||||
| try { | ||||||||||||
| return REGEXP_PATTERN_CACHE.get(regex); | ||||||||||||
| } catch (PatternSyntaxException e) { | ||||||||||||
| return null; | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /** | ||||||||||||
| * Parse string as key-value string and return the value matches key name. example: | ||||||||||||
| * keyvalue('k1=v1;k2=v2', ';', '=', 'k2') = 'v2' keyvalue('k1:v1,k2:v2', ',', ':', 'k3') = NULL | ||||||||||||
|
|
||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.flink.table.runtime.functions.scalar; | ||
|
|
||
| import org.apache.flink.annotation.Internal; | ||
| import org.apache.flink.table.data.ArrayData; | ||
| import org.apache.flink.table.data.GenericArrayData; | ||
| import org.apache.flink.table.data.StringData; | ||
| import org.apache.flink.table.functions.BuiltInFunctionDefinitions; | ||
| import org.apache.flink.table.functions.SpecializedFunction; | ||
|
|
||
| import javax.annotation.Nullable; | ||
|
|
||
| import java.util.regex.Pattern; | ||
|
|
||
| import static org.apache.flink.table.runtime.functions.SqlFunctionUtils.getRegexpPattern; | ||
|
|
||
| /** | ||
| * Implementation of {@link BuiltInFunctionDefinitions#REGEXP_SPLIT}. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also see https://issues.apache.org/jira/browse/FLINK-6810 for general instructions on what else you need to add in order to contribute built-in functions, for example which docs to add and what other considerations to make. |
||
| * | ||
| * <p>Splits a string by a regular expression pattern and returns an array of substrings. | ||
| * | ||
| * <p>Examples: | ||
| * | ||
| * <pre>{@code | ||
| * REGEXP_SPLIT('Hello123World456', '[0-9]+') = ['Hello', 'World', ''] | ||
| * REGEXP_SPLIT('a,b;c', '[,;]') = ['a', 'b', 'c'] | ||
| * REGEXP_SPLIT('one two three', '\\s+') = ['one', 'two', 'three'] | ||
| * }</pre> | ||
| */ | ||
| @Internal | ||
| public class RegexpSplitFunction extends BuiltInScalarFunction { | ||
|
|
||
| public RegexpSplitFunction(SpecializedFunction.SpecializedContext context) { | ||
| super(BuiltInFunctionDefinitions.REGEXP_SPLIT, context); | ||
| } | ||
|
|
||
| public @Nullable ArrayData eval(@Nullable StringData str, @Nullable StringData regex) { | ||
| if (str == null || regex == null) { | ||
| return null; | ||
| } | ||
|
|
||
| String regexStr = regex.toString(); | ||
| if (regexStr.isEmpty()) { | ||
| // If regex is empty, split by each character | ||
| String strValue = str.toString(); | ||
| StringData[] result = new StringData[strValue.length()]; | ||
| for (int i = 0; i < strValue.length(); i++) { | ||
| result[i] = StringData.fromString(String.valueOf(strValue.charAt(i))); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please have a look at this PR: #28264, so that you split SMP characters correctly. Maybe we can even extract this logic into a util. |
||
| } | ||
| return new GenericArrayData(result); | ||
| } | ||
|
|
||
| Pattern pattern = getRegexpPattern(regexStr); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice thanks for using the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review! The reason I added getRegexpPattern() instead of using getRegexpMatcher() is that REGEXP_SPLIT needs to call Pattern.split(str, -1), and the split() method is on the Pattern class, not the Matcher class. The existing getRegexpMatcher() returns a Matcher object which is designed for matching operations like find(), group(), etc. - this works perfectly for other REGEXP_* functions like REGEXP_SUBSTR, REGEXP_COUNT, REGEXP_INSTR that need to iterate through matches. However, REGEXP_SPLIT doesn't need to iterate through matches - it needs to split the input string by the pattern, which requires direct access to the Pattern object. Please let me know which approach you'd prefer:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the thorough explanation! I agree that keeping it as a reusable utility could be helpful, so I'm good with that approach. |
||
| if (pattern == null) { | ||
| // Return null for invalid regex pattern (consistent with other REGEXP_* functions) | ||
| return null; | ||
| } | ||
|
|
||
| // Use -1 as limit to keep all trailing empty strings | ||
| String[] splitResult = pattern.split(str.toString(), -1); | ||
| StringData[] result = new StringData[splitResult.length]; | ||
| for (int i = 0; i < splitResult.length; i++) { | ||
| result[i] = StringData.fromString(splitResult[i]); | ||
| } | ||
| return new GenericArrayData(result); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please fix the docs? It seems the merge conflicts are not resolved correctly