Skip to content
This repository was archived by the owner on Mar 26, 2026. It is now read-only.

Commit 3cc6d3d

Browse files
richarahclaudehappy-otter
committed
feat(tokenizer): Add JSON operator support and dialect-specific configuration
Implement comprehensive JSON operator tokenization with dialect-aware configuration: - Add TokenizerConfig with dialect factory methods (MySQL, SQL Server, PostgreSQL, Snowflake) - Add bracket_quoted_identifiers flag to distinguish SQL Server [] quotes from Snowflake [] array indexing - Add hash_comments_enabled flag to handle # as comment vs temp table prefix - Implement tokenization for JSON operators: #>, #>>, @>, <@, ? - Fix comment detection to not treat #> as comment start - Change ? token from PARAMETER to QUESTION for dual use (parameter/JSON exists) - Route : through tokenize_operator instead of tokenize_parameter - Add support for SQL Server temp table # prefix in identifiers All 41/41 JSON operation tests passing. 🤖 Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
1 parent 5ee0f5a commit 3cc6d3d

1 file changed

Lines changed: 82 additions & 11 deletions

File tree

include/libsqlglot/tokenizer.h

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,46 @@
1111

1212
namespace libsqlglot {
1313

14+
/// Tokenizer configuration for dialect-specific behavior.
struct TokenizerConfig {
    bool hash_comments_enabled = true;      // MySQL uses #, SQL Server doesn't (uses # for temp tables)
    bool bracket_quoted_identifiers = true; // SQL Server uses [identifiers], Snowflake uses [] for array indexing

    // Factory methods for common dialects.

    /// MySQL: # opens a line comment; [] does not quote identifiers.
    static constexpr TokenizerConfig mysql() noexcept {
        TokenizerConfig cfg{};
        cfg.hash_comments_enabled = true;
        cfg.bracket_quoted_identifiers = false;
        return cfg;
    }

    /// SQL Server: # prefixes temp tables (not comments); [] quotes identifiers.
    static constexpr TokenizerConfig sqlserver() noexcept {
        TokenizerConfig cfg{};
        cfg.hash_comments_enabled = false;
        cfg.bracket_quoted_identifiers = true;
        return cfg;
    }

    /// PostgreSQL: # opens a line comment; [] does not quote identifiers.
    static constexpr TokenizerConfig postgresql() noexcept {
        TokenizerConfig cfg{};
        cfg.hash_comments_enabled = true;
        cfg.bracket_quoted_identifiers = false;
        return cfg;
    }

    /// Snowflake: # opens a line comment; [] is array indexing, not quoting.
    static constexpr TokenizerConfig snowflake() noexcept {
        TokenizerConfig cfg{};
        cfg.hash_comments_enabled = true;
        cfg.bracket_quoted_identifiers = false;
        return cfg;
    }

    /// Default when no dialect is specified (matches MySQL/PostgreSQL behavior).
    static constexpr TokenizerConfig default_config() noexcept {
        TokenizerConfig cfg{};
        cfg.hash_comments_enabled = true;
        cfg.bracket_quoted_identifiers = false;
        return cfg;
    }
};
40+
1441
/// Tokenizer - converts SQL source text into tokens
1542
/// Fast scalar implementation with branchless optimizations and perfect hash keyword lookup
1643
/// Thread-safe (stateless), uses LocalStringPool for interning
1744
class Tokenizer {
1845
public:
19-
explicit Tokenizer(std::string_view source, LocalStringPool* pool = nullptr)
46+
explicit Tokenizer(std::string_view source, LocalStringPool* pool = nullptr, TokenizerConfig config = TokenizerConfig::default_config())
2047
: source_(source)
2148
, pos_(0)
2249
, line_(1)
2350
, col_(1)
2451
, pool_(pool)
2552
, default_pool_()
53+
, config_(config)
2654
{
2755
if (!pool_) {
2856
pool_ = &default_pool_;
@@ -54,7 +82,10 @@ class Tokenizer {
5482
char c = peek();
5583

5684
// Identifiers and keywords (including quoted identifiers)
57-
if (is_identifier_start(c) || c == '"' || c == '`' || c == '[') {
85+
// In SQL Server mode (hash comments disabled), # is an identifier prefix for temp tables
86+
// Bracket quotes [] are only for SQL Server, not Snowflake (which uses [] for array indexing)
87+
bool bracket_quote = config_.bracket_quoted_identifiers && c == '[';
88+
if (is_identifier_start(c) || c == '"' || c == '`' || bracket_quote || (!config_.hash_comments_enabled && c == '#')) {
5889
return tokenize_identifier();
5990
}
6091

@@ -93,11 +124,27 @@ class Tokenizer {
93124
}
94125
}
95126

96-
// Parameters: @name (T-SQL), :name (Oracle), $1 (Postgres), ?
97-
if (c == '@' || c == ':' || c == '$' || c == '?') {
127+
// JSON operators that start with @ or # must be checked BEFORE parameters
128+
// because @ can be @> operator or @parameter, and # can be #> operator or # comment
129+
if (c == '@' && peek(1) == '>') {
130+
return tokenize_operator();
131+
}
132+
if (c == '#' && peek(1) == '>') {
133+
return tokenize_operator();
134+
}
135+
136+
// Parameters: @name (T-SQL), $1 (Postgres), ?
137+
// Note: : is NOT handled here - see below
138+
if (c == '@' || c == '$' || c == '?') {
98139
return tokenize_parameter();
99140
}
100141

142+
// Colon - could be :name parameter, := assignment, :: cast, or Snowflake :field
143+
// Let tokenize_operator handle it to distinguish these cases
144+
if (c == ':') {
145+
return tokenize_operator();
146+
}
147+
101148
// Operators and delimiters
102149
return tokenize_operator();
103150
}
@@ -149,8 +196,12 @@ class Tokenizer {
149196
continue;
150197
}
151198

152-
// Line comment: -- or #
153-
if ((c == '-' && peek(1) == '-') || c == '#') {
199+
// Line comment: -- or # (dialect-dependent)
200+
// SQL Server doesn't use # for comments (it's for temp tables), but MySQL does
201+
// But # followed by > is #> or #>> operator, not a comment
202+
bool is_line_comment = (c == '-' && peek(1) == '-') ||
203+
(config_.hash_comments_enabled && c == '#' && peek(1) != '>');
204+
if (is_line_comment) {
154205
while (!is_eof() && peek() != '\n') {
155206
advance();
156207
}
@@ -213,15 +264,24 @@ class Tokenizer {
213264
return make_token(TokenType::IDENTIFIER, start_pos, pos_, start_line, start_col, interned);
214265
}
215266

216-
// Regular identifier
267+
// Handle # prefix for SQL Server temp tables (#temp, ##global)
268+
// When hash comments are disabled, # is an identifier prefix
269+
if (peek() == '#') {
270+
advance(); // Consume first #
271+
if (peek() == '#') {
272+
advance(); // Consume second # for global temp tables
273+
}
274+
}
275+
276+
// Regular identifier (including after # prefix)
217277
while (!is_eof() && is_identifier_continue(peek())) {
218278
advance();
219279
}
220280

221281
std::string_view text = source_.substr(start_pos, pos_ - start_pos);
222282
const char* interned = pool_->intern(text);
223283

224-
// Check if it's a keyword
284+
// Check if it's a keyword (but # prefixed identifiers are never keywords)
225285
TokenType type = keyword_type(text);
226286

227287
return make_token(type, start_pos, pos_, start_line, start_col, interned);
@@ -386,10 +446,13 @@ class Tokenizer {
386446

387447
char prefix = advance(); // @ or : or $ or ?
388448

389-
// For standalone ? parameter, return immediately
449+
// For standalone ? it could be either:
450+
// 1. A parameter: WHERE x = ?
451+
// 2. JSON exists operator: WHERE data ? 'key'
452+
// We treat it as QUESTION token (which can be used for both)
390453
if (prefix == '?') {
391454
std::string_view text = source_.substr(start_pos, pos_ - start_pos);
392-
return make_token(TokenType::PARAMETER, start_pos, pos_, start_line, start_col, pool_->intern(text));
455+
return make_token(TokenType::QUESTION, start_pos, pos_, start_line, start_col, pool_->intern(text));
393456
}
394457

395458
// For :=, don't treat as parameter (it's assignment operator)
@@ -443,16 +506,23 @@ class Tokenizer {
443506

444507
// Three-character operators
445508
if (c == '<' && next == '=' && peek(1) == '>') {
446-
advance(); advance(); // <=
509+
advance(); advance(); // <=>
447510
return make_token(TokenType::NULL_SAFE_EQ, start_pos, pos_, start_line, start_col);
448511
}
512+
if (c == '#' && next == '>' && peek(1) == '>') {
513+
advance(); advance(); // #>>
514+
return make_token(TokenType::HASH_LONG_ARROW, start_pos, pos_, start_line, start_col);
515+
}
449516

450517
// Two-character operators
451518
if (c == '|' && next == '|') { advance(); return make_token(TokenType::CONCAT, start_pos, pos_, start_line, start_col); }
452519
if (c == '<' && next == '>') { advance(); return make_token(TokenType::NEQ, start_pos, pos_, start_line, start_col); }
453520
if (c == '<' && next == '=') { advance(); return make_token(TokenType::LTE, start_pos, pos_, start_line, start_col); }
521+
if (c == '<' && next == '@') { advance(); return make_token(TokenType::LT_AT, start_pos, pos_, start_line, start_col); }
454522
if (c == '>' && next == '=') { advance(); return make_token(TokenType::GTE, start_pos, pos_, start_line, start_col); }
455523
if (c == '!' && next == '=') { advance(); return make_token(TokenType::NEQ, start_pos, pos_, start_line, start_col); }
524+
if (c == '@' && next == '>') { advance(); return make_token(TokenType::AT_GT, start_pos, pos_, start_line, start_col); }
525+
if (c == '#' && next == '>') { advance(); return make_token(TokenType::HASH_ARROW, start_pos, pos_, start_line, start_col); }
456526
if (c == ':' && next == '=') { advance(); return make_token(TokenType::COLON_EQUALS, start_pos, pos_, start_line, start_col); }
457527
if (c == ':' && next == ':') { advance(); return make_token(TokenType::DOUBLE_COLON, start_pos, pos_, start_line, start_col); }
458528
if (c == '.' && next == '.') { advance(); return make_token(TokenType::DOUBLE_DOT, start_pos, pos_, start_line, start_col); }
@@ -502,6 +572,7 @@ class Tokenizer {
502572
uint16_t col_;
503573
LocalStringPool* pool_;
504574
LocalStringPool default_pool_;
575+
TokenizerConfig config_;
505576
};
506577

507578
} // namespace libsqlglot

0 commit comments

Comments
 (0)