From cc581e959578e605a49ef15e70b990a308c7305d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9rgio=20Silveira?=
Date: Thu, 23 Apr 2026 15:36:31 +0200
Subject: [PATCH] Fix tokenizer over-consuming character after ->> operator (#70)

TokenizeLongArrow called _state.Next() after confirming the second '>'
and then delegated to ConsumeForBinOp, which calls _state.Next() again.
The result was that the character immediately following `->>` was
silently eaten. When that character was the opening single quote of a
string literal, subsequent tokenization walked past the closing quote
and raised "Unterminated string literal".

This only surfaced when `->>` was written without whitespace before the
following token (e.g. `meta->>'x'`). The existing tests all used
`meta ->> 'x'`, where the swallowed character happened to be the
harmless space.

The fix drops the redundant _state.Next(), aligning `->>` with the
already-correct pattern used by `#>>` in TokenizeHash.

Regression tests are added against the dialects that genuinely support
`->>` (PostgreSQL, DuckDB, MySQL, SQLite, Redshift, Generic). `->>` is a
PostgreSQL-originated extension and is not part of ANSI/ISO SQL, so
dialects that use other JSON extraction mechanisms (Snowflake, BigQuery,
MS SQL Server, Hive, Databricks, ClickHouse, Oracle, ANSI) are
intentionally excluded.
---
 .../LongArrowJsonExtractionTests.cs           | 106 ++++++++++++++++++
 src/SqlParser/Tokenizer.cs                    |   1 -
 2 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 src/SqlParser.Tests/LongArrowJsonExtractionTests.cs

diff --git a/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs b/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs
new file mode 100644
index 0000000..a91f06f
--- /dev/null
+++ b/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs
@@ -0,0 +1,106 @@
+using SqlParser.Ast;
+using SqlParser.Dialects;
+using SqlParser.Tokens;
+using static SqlParser.Ast.Expression;
+
+namespace SqlParser.Tests;
+
+// Regression tests for https://github.com/TylerBrinks/SqlParser-cs/issues/70
+//
+// The `->>` (LongArrow) JSON-extraction operator was incorrectly consuming the
+// character immediately following `->>` when there was no whitespace between
+// the operator and the next token. When that character was a single quote
+// starting a string literal, the tokenizer would then run past the closing
+// quote looking for a terminator that no longer existed, raising
+// "Unterminated string literal". Example from the bug report:
+//
+//     meta->>'description'     -- failed to tokenize
+//     meta ->> 'description'   -- worked because the extra Next() ate the space
+//
+// Scope of these tests
+// --------------------
+// `->>` is NOT part of the ANSI/ISO SQL standard. It originated as a
+// PostgreSQL extension and has since been adopted by a handful of other
+// engines. It is NOT supported by Snowflake, BigQuery, MS SQL Server, Hive,
+// Databricks, ClickHouse, Oracle, or ANSI SQL, which use other mechanisms for
+// JSON extraction (`JSON_VALUE`, `get_json_object`, `col:path`, etc.).
+//
+// We therefore only exercise these tests against dialects that genuinely
+// support `->>`:
+//   - PostgreSQL (native, originated here)
+//   - DuckDB (the dialect from the bug report)
+//   - MySQL (since 5.7.13, as shorthand for JSON_UNQUOTE(JSON_EXTRACT(...)))
+//   - SQLite (since 3.38.0, 2022)
+//   - Redshift (Postgres-derived)
+//   - Generic (the repo's permissive catch-all dialect)
+public class LongArrowJsonExtractionTests : ParserTestBase
+{
+    // Dialects every [Theory] below is run against (see the scope note above).
+    private static readonly Dialect[] DialectsSupportingLongArrow =
+    [
+        new PostgreSqlDialect(),
+        new DuckDbDialect(),
+        new MySqlDialect(),
+        new SQLiteDialect(),
+        new RedshiftDialect(),
+        new GenericDialect()
+    ];
+
+    // [MemberData] source: one single-argument row per dialect.
+    public static readonly IEnumerable<object[]> LongArrowDialects =
+        DialectsSupportingLongArrow.Select(d => new object[] { d });
+
+    // The opening quote right after `->>` must survive as its own string token.
+    [Theory]
+    [MemberData(nameof(LongArrowDialects))]
+    public void Tokenize_LongArrow_Followed_By_String_Literal(Dialect dialect)
+    {
+        var tokens = new Tokenizer().Tokenize("a->>'x'", dialect);
+
+        var expected = new Token[]
+        {
+            new Word("a"),
+            new LongArrow(),
+            new SingleQuotedString("x")
+        };
+
+        TokenizerTestBase.Compare(expected, tokens);
+    }
+
+    // End to end: the projection must parse to `meta` LongArrow 'description'.
+    [Theory]
+    [MemberData(nameof(LongArrowDialects))]
+    public void Parse_LongArrow_With_No_Whitespace_Before_String(Dialect dialect)
+    {
+        var parsed = new Parser().ParseSql("SELECT meta->>'description' FROM events", dialect);
+
+        Assert.Single(parsed);
+        var select = ((SetExpression.SelectExpression)parsed[0]!.AsQuery()!.Body).Select;
+
+        var expected = new SelectItem.UnnamedExpression(new BinaryOp(
+            new Identifier("meta"),
+            BinaryOperator.LongArrow,
+            new LiteralValue(new Value.SingleQuotedString("description"))
+        ));
+
+        Assert.Equal(expected, select.Projection.Single());
+    }
+
+    // Smoke test: the reported query must parse to a single statement on DuckDB.
+    [Fact]
+    public void Parse_Issue_70_Repro()
+    {
+        // Exact SQL from the bug report.
+        const string sql = """
+            select
+                category_seq as seq,
+                data.name as name,
+                meta->>'description' as description
+            from category
+            order by seq
+            """;
+
+        var parsed = new Parser().ParseSql(sql, new DuckDbDialect());
+        Assert.Single(parsed);
+    }
+}
diff --git a/src/SqlParser/Tokenizer.cs b/src/SqlParser/Tokenizer.cs
index 82ae9d1..f7674fb 100644
--- a/src/SqlParser/Tokenizer.cs
+++ b/src/SqlParser/Tokenizer.cs
@@ -660,7 +660,6 @@ private Token TokenizeLongArrow()
             return StartBinOp("->", new Arrow());
         }
 
-        _state.Next();
         return ConsumeForBinOp("->>", new LongArrow());
     }
 