|
11 | 11 |
|
12 | 12 | namespace libsqlglot { |
13 | 13 |
|
/// Tokenizer configuration for dialect-specific behavior.
///
/// - hash_comments_enabled: when true, `#` begins a line comment (MySQL
///   style); when false, `#` is an identifier prefix (SQL Server temp
///   tables such as #local / ##global).
/// - bracket_quoted_identifiers: when true, `[ident]` is a quoted
///   identifier (SQL Server); when false, `[` is left to the operator
///   path (e.g. Snowflake, where `[]` is array indexing).
struct TokenizerConfig {
    // In-class defaults intentionally match default_config(), so a
    // value-initialized TokenizerConfig{} behaves identically to
    // TokenizerConfig::default_config(). (Previously the in-class default
    // for bracket_quoted_identifiers was true while default_config()
    // returned false — a silent inconsistency.)
    bool hash_comments_enabled = true;        // MySQL uses # comments; SQL Server reserves # for temp tables
    bool bracket_quoted_identifiers = false;  // SQL Server quotes with []; Snowflake uses [] for array indexing

    /// Generic/default dialect: # comments on, bracket quoting off.
    [[nodiscard]] static constexpr TokenizerConfig default_config() noexcept {
        return TokenizerConfig{};
    }

    /// MySQL: # comments enabled, no bracket-quoted identifiers.
    [[nodiscard]] static constexpr TokenizerConfig mysql() noexcept {
        return make(/*hash_comments=*/true, /*bracket_quotes=*/false);
    }

    /// SQL Server: # is a temp-table prefix (not a comment), [] quotes identifiers.
    [[nodiscard]] static constexpr TokenizerConfig sqlserver() noexcept {
        return make(/*hash_comments=*/false, /*bracket_quotes=*/true);
    }

    /// PostgreSQL: same tokenizer-level settings as the default dialect.
    [[nodiscard]] static constexpr TokenizerConfig postgresql() noexcept {
        return make(/*hash_comments=*/true, /*bracket_quotes=*/false);
    }

    /// Snowflake: # comments on; [] is array indexing, not identifier quoting.
    [[nodiscard]] static constexpr TokenizerConfig snowflake() noexcept {
        return make(/*hash_comments=*/true, /*bracket_quotes=*/false);
    }

private:
    /// Shared factory helper. Avoids C++20 designated initializers
    /// (`.x = ...`) so the header stays C++17-compatible, and removes the
    /// duplicated construction code from each dialect factory.
    /// (Private member *functions* do not break aggregate status.)
    static constexpr TokenizerConfig make(bool hash_comments, bool bracket_quotes) noexcept {
        TokenizerConfig c{};
        c.hash_comments_enabled = hash_comments;
        c.bracket_quoted_identifiers = bracket_quotes;
        return c;
    }
};
| 40 | + |
14 | 41 | /// Tokenizer - converts SQL source text into tokens |
15 | 42 | /// Fast scalar implementation with branchless optimizations and perfect hash keyword lookup |
16 | 43 | /// Thread-safe (stateless), uses LocalStringPool for interning |
17 | 44 | class Tokenizer { |
18 | 45 | public: |
19 | | - explicit Tokenizer(std::string_view source, LocalStringPool* pool = nullptr) |
| 46 | + explicit Tokenizer(std::string_view source, LocalStringPool* pool = nullptr, TokenizerConfig config = TokenizerConfig::default_config()) |
20 | 47 | : source_(source) |
21 | 48 | , pos_(0) |
22 | 49 | , line_(1) |
23 | 50 | , col_(1) |
24 | 51 | , pool_(pool) |
25 | 52 | , default_pool_() |
| 53 | + , config_(config) |
26 | 54 | { |
27 | 55 | if (!pool_) { |
28 | 56 | pool_ = &default_pool_; |
@@ -54,7 +82,10 @@ class Tokenizer { |
54 | 82 | char c = peek(); |
55 | 83 |
|
56 | 84 | // Identifiers and keywords (including quoted identifiers) |
57 | | - if (is_identifier_start(c) || c == '"' || c == '`' || c == '[') { |
| 85 | + // In SQL Server mode (hash comments disabled), # is an identifier prefix for temp tables |
| 86 | + // Bracket quotes [] are only for SQL Server, not Snowflake (which uses [] for array indexing) |
| 87 | + bool bracket_quote = config_.bracket_quoted_identifiers && c == '['; |
| 88 | + if (is_identifier_start(c) || c == '"' || c == '`' || bracket_quote || (!config_.hash_comments_enabled && c == '#')) { |
58 | 89 | return tokenize_identifier(); |
59 | 90 | } |
60 | 91 |
|
@@ -93,11 +124,27 @@ class Tokenizer { |
93 | 124 | } |
94 | 125 | } |
95 | 126 |
|
96 | | - // Parameters: @name (T-SQL), :name (Oracle), $1 (Postgres), ? |
97 | | - if (c == '@' || c == ':' || c == '$' || c == '?') { |
| 127 | + // JSON operators that start with # must be checked BEFORE parameters |
| 128 | + // because @ can be @> operator or @parameter, and # can be #> operator or # comment |
| 129 | + if (c == '@' && peek(1) == '>') { |
| 130 | + return tokenize_operator(); |
| 131 | + } |
| 132 | + if (c == '#' && (peek(1) == '>' || peek(1) == '>')) { |
| 133 | + return tokenize_operator(); |
| 134 | + } |
| 135 | + |
| 136 | + // Parameters: @name (T-SQL), $1 (Postgres), ? |
| 137 | + // Note: : is NOT handled here - see below |
| 138 | + if (c == '@' || c == '$' || c == '?') { |
98 | 139 | return tokenize_parameter(); |
99 | 140 | } |
100 | 141 |
|
| 142 | + // Colon - could be :name parameter, := assignment, :: cast, or Snowflake :field |
| 143 | + // Let tokenize_operator handle it to distinguish these cases |
| 144 | + if (c == ':') { |
| 145 | + return tokenize_operator(); |
| 146 | + } |
| 147 | + |
101 | 148 | // Operators and delimiters |
102 | 149 | return tokenize_operator(); |
103 | 150 | } |
@@ -149,8 +196,12 @@ class Tokenizer { |
149 | 196 | continue; |
150 | 197 | } |
151 | 198 |
|
152 | | - // Line comment: -- or # |
153 | | - if ((c == '-' && peek(1) == '-') || c == '#') { |
| 199 | + // Line comment: -- or # (dialect-dependent) |
| 200 | + // SQL Server doesn't use # for comments (it's for temp tables), but MySQL does |
| 201 | + // But # followed by > is #> or #>> operator, not a comment |
| 202 | + bool is_line_comment = (c == '-' && peek(1) == '-') || |
| 203 | + (config_.hash_comments_enabled && c == '#' && peek(1) != '>'); |
| 204 | + if (is_line_comment) { |
154 | 205 | while (!is_eof() && peek() != '\n') { |
155 | 206 | advance(); |
156 | 207 | } |
@@ -213,15 +264,24 @@ class Tokenizer { |
213 | 264 | return make_token(TokenType::IDENTIFIER, start_pos, pos_, start_line, start_col, interned); |
214 | 265 | } |
215 | 266 |
|
216 | | - // Regular identifier |
| 267 | + // Handle # prefix for SQL Server temp tables (#temp, ##global) |
| 268 | + // When hash comments are disabled, # is an identifier prefix |
| 269 | + if (peek() == '#') { |
| 270 | + advance(); // Consume first # |
| 271 | + if (peek() == '#') { |
| 272 | + advance(); // Consume second # for global temp tables |
| 273 | + } |
| 274 | + } |
| 275 | + |
| 276 | + // Regular identifier (including after # prefix) |
217 | 277 | while (!is_eof() && is_identifier_continue(peek())) { |
218 | 278 | advance(); |
219 | 279 | } |
220 | 280 |
|
221 | 281 | std::string_view text = source_.substr(start_pos, pos_ - start_pos); |
222 | 282 | const char* interned = pool_->intern(text); |
223 | 283 |
|
224 | | - // Check if it's a keyword |
| 284 | + // Check if it's a keyword (but # prefixed identifiers are never keywords) |
225 | 285 | TokenType type = keyword_type(text); |
226 | 286 |
|
227 | 287 | return make_token(type, start_pos, pos_, start_line, start_col, interned); |
@@ -386,10 +446,13 @@ class Tokenizer { |
386 | 446 |
|
387 | 447 | char prefix = advance(); // @ or : or $ or ? |
388 | 448 |
|
389 | | - // For standalone ? parameter, return immediately |
| 449 | + // For standalone ? it could be either: |
| 450 | + // 1. A parameter: WHERE x = ? |
| 451 | + // 2. JSON exists operator: WHERE data ? 'key' |
| 452 | + // We treat it as QUESTION token (which can be used for both) |
390 | 453 | if (prefix == '?') { |
391 | 454 | std::string_view text = source_.substr(start_pos, pos_ - start_pos); |
392 | | - return make_token(TokenType::PARAMETER, start_pos, pos_, start_line, start_col, pool_->intern(text)); |
| 455 | + return make_token(TokenType::QUESTION, start_pos, pos_, start_line, start_col, pool_->intern(text)); |
393 | 456 | } |
394 | 457 |
|
395 | 458 | // For :=, don't treat as parameter (it's assignment operator) |
@@ -443,16 +506,23 @@ class Tokenizer { |
443 | 506 |
|
444 | 507 | // Three-character operators |
445 | 508 | if (c == '<' && next == '=' && peek(1) == '>') { |
446 | | - advance(); advance(); // <= |
| 509 | + advance(); advance(); // <=> |
447 | 510 | return make_token(TokenType::NULL_SAFE_EQ, start_pos, pos_, start_line, start_col); |
448 | 511 | } |
| 512 | + if (c == '#' && next == '>' && peek(1) == '>') { |
| 513 | + advance(); advance(); // #>> |
| 514 | + return make_token(TokenType::HASH_LONG_ARROW, start_pos, pos_, start_line, start_col); |
| 515 | + } |
449 | 516 |
|
450 | 517 | // Two-character operators |
451 | 518 | if (c == '|' && next == '|') { advance(); return make_token(TokenType::CONCAT, start_pos, pos_, start_line, start_col); } |
452 | 519 | if (c == '<' && next == '>') { advance(); return make_token(TokenType::NEQ, start_pos, pos_, start_line, start_col); } |
453 | 520 | if (c == '<' && next == '=') { advance(); return make_token(TokenType::LTE, start_pos, pos_, start_line, start_col); } |
| 521 | + if (c == '<' && next == '@') { advance(); return make_token(TokenType::LT_AT, start_pos, pos_, start_line, start_col); } |
454 | 522 | if (c == '>' && next == '=') { advance(); return make_token(TokenType::GTE, start_pos, pos_, start_line, start_col); } |
455 | 523 | if (c == '!' && next == '=') { advance(); return make_token(TokenType::NEQ, start_pos, pos_, start_line, start_col); } |
| 524 | + if (c == '@' && next == '>') { advance(); return make_token(TokenType::AT_GT, start_pos, pos_, start_line, start_col); } |
| 525 | + if (c == '#' && next == '>') { advance(); return make_token(TokenType::HASH_ARROW, start_pos, pos_, start_line, start_col); } |
456 | 526 | if (c == ':' && next == '=') { advance(); return make_token(TokenType::COLON_EQUALS, start_pos, pos_, start_line, start_col); } |
457 | 527 | if (c == ':' && next == ':') { advance(); return make_token(TokenType::DOUBLE_COLON, start_pos, pos_, start_line, start_col); } |
458 | 528 | if (c == '.' && next == '.') { advance(); return make_token(TokenType::DOUBLE_DOT, start_pos, pos_, start_line, start_col); } |
@@ -502,6 +572,7 @@ class Tokenizer { |
502 | 572 | uint16_t col_; |
503 | 573 | LocalStringPool* pool_; |
504 | 574 | LocalStringPool default_pool_; |
| 575 | + TokenizerConfig config_; |
505 | 576 | }; |
506 | 577 |
|
507 | 578 | } // namespace libsqlglot |
0 commit comments