Skip to content

Commit 3ad74c0

Browse files
authored
[feature](variant) schema template auto cast (#60362)
Problem Summary: This PR implements Variant Schema Template Auto Cast end-to-end. It applies schema-template-based casts during analysis so that behavior is consistent across all clauses (`SELECT`/`WHERE`/`ORDER`/`GROUP`/`HAVING`/`JOIN`/window), supports chained paths (`a.b` / `a['b']`) with correct path resolution, and makes alias-based `ORDER BY`/`GROUP BY`/`JOIN ON` work by restoring the original expressions via alias mapping. A single global switch, `enable_variant_schema_auto_cast`, controls the feature. Regression tests are expanded to cover leaf vs. non-leaf paths, alias/subquery scenarios, and ordering/aggregation/join behavior. doc: apache/doris-website#3339
1 parent afb1da4 commit 3ad74c0

16 files changed

Lines changed: 2072 additions & 18 deletions

File tree

be/src/vec/common/variant_util.cpp

Lines changed: 159 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
#include <assert.h>
2121
#include <fmt/format.h>
22-
#include <fnmatch.h>
2322
#include <gen_cpp/FrontendService.h>
2423
#include <gen_cpp/FrontendService_types.h>
2524
#include <gen_cpp/HeartbeatService_types.h>
@@ -38,6 +37,7 @@
3837
#include <cstddef>
3938
#include <cstdint>
4039
#include <cstring>
40+
#include <list>
4141
#include <memory>
4242
#include <mutex>
4343
#include <optional>
@@ -63,6 +63,7 @@
6363
#include "olap/tablet.h"
6464
#include "olap/tablet_fwd.h"
6565
#include "olap/tablet_schema.h"
66+
#include "re2/re2.h"
6667
#include "runtime/client_cache.h"
6768
#include "runtime/define_primitive_type.h"
6869
#include "runtime/exec_env.h"
@@ -102,6 +103,162 @@
102103
namespace doris::vectorized::variant_util {
103104
#include "common/compile_check_begin.h"
104105

106+
// Appends `ch` to `*regex_output` so that it is matched literally: characters that
// carry a meaning in regex syntax are preceded by a backslash, everything else is
// copied through unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    // Characters that must be backslash-escaped. The view is built from a literal,
    // so its length excludes the terminating NUL and '\0' is never treated as special.
    constexpr std::string_view kRegexMetaChars = ".^$+*?()|{}[]\\";
    const bool needs_escape = kRegexMetaChars.find(ch) != std::string_view::npos;
    if (needs_escape) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
130+
131+
// Small LRU to cap compiled glob patterns
132+
constexpr size_t kGlobRegexCacheCapacity = 256;
133+
134+
struct GlobRegexCacheEntry {
135+
std::shared_ptr<RE2> re2;
136+
std::list<std::string>::iterator lru_it;
137+
};
138+
139+
std::mutex g_glob_regex_cache_mutex;
140+
std::list<std::string> g_glob_regex_cache_lru;
141+
std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
142+
143+
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
144+
{
145+
std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
146+
auto it = g_glob_regex_cache.find(glob_pattern);
147+
if (it != g_glob_regex_cache.end()) {
148+
g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
149+
it->second.lru_it);
150+
return it->second.re2;
151+
}
152+
}
153+
std::string regex_pattern;
154+
Status st = glob_to_regex(glob_pattern, &regex_pattern);
155+
if (!st.ok()) {
156+
return nullptr;
157+
}
158+
auto compiled = std::make_shared<RE2>(regex_pattern);
159+
if (!compiled->ok()) {
160+
return nullptr;
161+
}
162+
{
163+
std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
164+
auto it = g_glob_regex_cache.find(glob_pattern);
165+
if (it != g_glob_regex_cache.end()) {
166+
g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
167+
it->second.lru_it);
168+
return it->second.re2;
169+
}
170+
g_glob_regex_cache_lru.push_front(glob_pattern);
171+
g_glob_regex_cache.emplace(glob_pattern,
172+
GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
173+
if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
174+
const std::string& evict_key = g_glob_regex_cache_lru.back();
175+
g_glob_regex_cache.erase(evict_key);
176+
g_glob_regex_cache_lru.pop_back();
177+
}
178+
}
179+
return compiled;
180+
}
181+
182+
// Convert a restricted glob pattern into a regex.
183+
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
184+
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
185+
regex_pattern->clear();
186+
regex_pattern->append("^");
187+
bool is_escaped = false;
188+
size_t pattern_length = glob_pattern.size();
189+
for (size_t index = 0; index < pattern_length; ++index) {
190+
char current_char = glob_pattern[index];
191+
if (is_escaped) {
192+
append_escaped_regex_char(regex_pattern, current_char);
193+
is_escaped = false;
194+
continue;
195+
}
196+
if (current_char == '\\') {
197+
is_escaped = true;
198+
continue;
199+
}
200+
if (current_char == '*') {
201+
regex_pattern->append(".*");
202+
continue;
203+
}
204+
if (current_char == '?') {
205+
regex_pattern->append(".");
206+
continue;
207+
}
208+
if (current_char == '[') {
209+
size_t class_index = index + 1;
210+
bool class_closed = false;
211+
bool is_class_escaped = false;
212+
std::string class_buffer;
213+
if (class_index < pattern_length &&
214+
(glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
215+
class_buffer.push_back('^');
216+
++class_index;
217+
}
218+
for (; class_index < pattern_length; ++class_index) {
219+
char class_char = glob_pattern[class_index];
220+
if (is_class_escaped) {
221+
class_buffer.push_back(class_char);
222+
is_class_escaped = false;
223+
continue;
224+
}
225+
if (class_char == '\\') {
226+
is_class_escaped = true;
227+
continue;
228+
}
229+
if (class_char == ']') {
230+
class_closed = true;
231+
break;
232+
}
233+
class_buffer.push_back(class_char);
234+
}
235+
if (!class_closed) {
236+
return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
237+
glob_pattern);
238+
}
239+
regex_pattern->append("[");
240+
regex_pattern->append(class_buffer);
241+
regex_pattern->append("]");
242+
index = class_index;
243+
continue;
244+
}
245+
append_escaped_regex_char(regex_pattern, current_char);
246+
}
247+
if (is_escaped) {
248+
append_escaped_regex_char(regex_pattern, '\\');
249+
}
250+
regex_pattern->append("$");
251+
return Status::OK();
252+
}
253+
254+
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
255+
auto compiled = get_or_build_re2(glob_pattern);
256+
if (compiled == nullptr) {
257+
return false;
258+
}
259+
return RE2::FullMatch(candidate_path, *compiled);
260+
}
261+
105262
size_t get_number_of_dimensions(const IDataType& type) {
106263
if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
107264
return type_array->get_number_of_dimensions();
@@ -1307,8 +1464,7 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
13071464
break;
13081465
}
13091466
case PatternTypePB::MATCH_NAME_GLOB: {
1310-
int result = fnmatch(pattern, path.c_str(), FNM_PATHNAME);
1311-
if (result == 0) {
1467+
if (glob_match_re2(pattern, path)) {
13121468
generate_result_column(*sub_column, &sub_column_info->column);
13131469
generate_index(sub_column->name());
13141470
return true;
@@ -1788,8 +1944,6 @@ std::unordered_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_
17881944
return subcolumns;
17891945
}
17901946

1791-
namespace {
1792-
17931947
Status _parse_and_materialize_variant_columns(Block& block,
17941948
const std::vector<uint32_t>& variant_pos,
17951949
const std::vector<ParseConfig>& configs) {
@@ -1864,8 +2018,6 @@ Status _parse_and_materialize_variant_columns(Block& block,
18642018
return Status::OK();
18652019
}
18662020

1867-
} // namespace
1868-
18692021
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
18702022
const std::vector<ParseConfig>& configs) {
18712023
RETURN_IF_CATCH_EXCEPTION(

be/src/vec/common/variant_util.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ using JsonParser = JSONDataParser<SimdJSONParser>;
6464
const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
6565
const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__";
6666
namespace doris::vectorized::variant_util {
67+
68+
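// Convert a restricted glob pattern ('*', '?', '[...]', '\' escape) into a regex anchored with '^' and '$'. Returns InvalidArgument for an unclosed character class. Exposed for tests/internal use.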
69+
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern);
70+
71+
// Match a glob pattern against a whole path using RE2 (full match). Returns false if the pattern cannot be converted or compiled.
72+
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path);
73+
6774
using PathToNoneNullValues = std::unordered_map<std::string, int64_t>;
6875
using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>;
6976

be/test/olap/rowset/segment_v2/variant_util_test.cpp

Lines changed: 133 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,4 +209,136 @@ TEST(VariantUtilTest, ParseVariantColumns_DocModeRejectOnlySubcolumnsConfig) {
209209
EXPECT_TRUE(st.ok()) << st.to_string();
210210
}
211211

212-
} // namespace doris::vectorized::variant_util
212+
// Checks the exact regex text produced by glob_to_regex() for a table of globs, and
// that globs with an unclosed character class are rejected.
TEST(VariantUtilTest, GlobToRegex) {
    struct Expectation {
        std::string glob;
        std::string expected_regex;
    };
    const std::vector<Expectation> expectations = {
            // wildcards
            {"*", "^.*$"},
            {"?", "^.$"},
            {"a?b", "^a.b$"},
            {"a*b", "^a.*b$"},
            {"a**b", "^a.*.*b$"},
            {"a??b", "^a..b$"},
            {"?*", "^..*$"},
            {"*?", "^.*.$"},
            // regex metacharacters and escapes outside a class
            {"a.b", "^a\\.b$"},
            {"a+b", "^a\\+b$"},
            {"a{b}", "^a\\{b\\}$"},
            {R"(a\*b)", R"(^a\*b$)"},
            {"a\\?b", "^a\\?b$"},
            {"a\\[b", "^a\\[b$"},
            {"abc\\", "^abc\\\\$"},
            {"a|b", "^a\\|b$"},
            {"a(b)c", "^a\\(b\\)c$"},
            {"a^b", "^a\\^b$"},
            {"a$b", "^a\\$b$"},
            // character classes
            {"int_[0-9]", "^int_[0-9]$"},
            {"int_[!0-9]", "^int_[^0-9]$"},
            {"int_[^0-9]", "^int_[^0-9]$"},
            {"a[\\-]b", "^a[-]b$"},
            {"a[b-d]e", "^a[b-d]e$"},
            {"a[\\]]b", "^a[]]b$"},
            {"a[\\!]b", "^a[!]b$"},
            // degenerate inputs
            {"", "^$"},
            {"a[[]b", "^a[[]b$"},
            {"a[]b", "^a[]b$"},
            {"[]", "^[]$"},
            {"[!]", "^[^]$"},
            {"[^]", "^[^]$"},
            {"\\", "^\\\\$"},
            {"\\*", "^\\*$"},
            {"a\\*b", "^a\\*b$"},
            {"a[!\\]]b", "^a[^]]b$"},
    };

    for (const auto& [glob, expected_regex] : expectations) {
        std::string actual_regex;
        Status status = glob_to_regex(glob, &actual_regex);
        EXPECT_TRUE(status.ok()) << status.to_string() << " pattern=" << glob;
        EXPECT_EQ(actual_regex, expected_regex) << "pattern=" << glob;
    }

    // A '[' that is never closed (the "\\]" below is an escaped ']') must be rejected.
    std::string scratch;
    for (const char* unclosed_glob : {"int_[0-9", "a[\\]b"}) {
        Status status = glob_to_regex(unclosed_glob, &scratch);
        EXPECT_FALSE(status.ok());
    }
}
270+
271+
// Checks glob_match_re2() end to end: each row is a glob, a candidate string and
// whether the candidate is expected to match. Invalid globs must never match.
TEST(VariantUtilTest, GlobMatchRe2) {
    struct Expectation {
        std::string glob;
        std::string candidate;
        bool expected;
    };
    const std::vector<Expectation> expectations = {
            // wildcards
            {"*", "", true},
            {"*", "abc", true},
            {"?", "a", true},
            {"?", "", false},
            {"a?b", "acb", true},
            {"a?b", "ab", false},
            {"a*b", "ab", true},
            {"a*b", "axxxb", true},
            {"a**b", "ab", true},
            {"a**b", "axxxb", true},
            {"?*", "", false},
            {"?*", "a", true},
            {"*?", "", false},
            {"*?", "a", true},
            {"a*b", "a/b", true},
            // regex metacharacters are literals in a glob
            {"a.b", "a.b", true},
            {"a.b", "acb", false},
            {"a+b", "a+b", true},
            {"a{b}", "a{b}", true},
            {"a|b", "a|b", true},
            {"a|b", "ab", false},
            {"a(b)c", "a(b)c", true},
            {"a(b)c", "abc", false},
            {"a^b", "a^b", true},
            {"a^b", "ab", false},
            {"a$b", "a$b", true},
            {"a$b", "ab", false},
            // character classes
            {"a[b-d]e", "ace", true},
            {"a[b-d]e", "aee", false},
            {"a[\\]]b", "a]b", true},
            {"a[\\]]b", "a[b", false},
            {"a[\\!]b", "a!b", true},
            {"a[\\!]b", "a]b", false},
            {"[]", "a", false},
            {"[!]", "]", false},
            {"\\", "\\", true},
            {"\\*", "\\abc", false},
            {"a[!\\]]b", "aXb", true},
            {"a[!\\]]b", "a]b", false},
            {"a[]b", "aXb", false},
            {"a[[]b", "a[b", true},
            // escapes
            {R"(a\*b)", "a*b", true},
            {R"(a\?b)", "a?b", true},
            {R"(a\[b)", "a[b", true},
            {R"(abc\)", R"(abc\)", true},
            {"int_[0-9]", "int_1", true},
            {"int_[0-9]", "int_a", false},
            {"int_[!0-9]", "int_a", true},
            {"int_[!0-9]", "int_1", false},
            {"int_[^0-9]", "int_b", true},
            {"int_[^0-9]", "int_2", false},
            {R"(a[\-]b)", "a-b", true},
            // empty pattern
            {"", "", true},
            {"", "a", false},
    };

    for (const auto& [glob, candidate, expected] : expectations) {
        const bool matched = glob_match_re2(glob, candidate);
        EXPECT_EQ(matched, expected) << "pattern=" << glob << " candidate=" << candidate;
    }

    // Globs with an unclosed character class are invalid and match nothing.
    EXPECT_FALSE(glob_match_re2("int_[0-9", "int_1"));
    EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b"));
}
343+
344+
} // namespace doris::vectorized::variant_util

fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.apache.doris.common.ErrorCode;
4242
import org.apache.doris.common.ErrorReport;
4343
import org.apache.doris.common.FeConstants;
44+
import org.apache.doris.common.GlobRegexUtil;
4445
import org.apache.doris.common.Pair;
4546
import org.apache.doris.common.UserException;
4647
import org.apache.doris.common.io.DeepCopy;
@@ -3750,12 +3751,11 @@ public Index getInvertedIndex(Column column, List<String> subPath, String analyz
37503751
String childName = child.getName();
37513752
if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) {
37523753
try {
3753-
java.nio.file.PathMatcher matcher = java.nio.file.FileSystems.getDefault()
3754-
.getPathMatcher("glob:" + childName);
3755-
if (matcher.matches(java.nio.file.Paths.get(subPathString))) {
3754+
com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName);
3755+
if (compiled.matcher(subPathString).matches()) {
37563756
fieldPattern = childName;
37573757
}
3758-
} catch (Exception e) {
3758+
} catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) {
37593759
continue;
37603760
}
37613761
} else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) {

0 commit comments

Comments
 (0)