From cc581e959578e605a49ef15e70b990a308c7305d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9rgio=20Silveira?= <sdsilveira@gmail.com>
Date: Thu, 23 Apr 2026 15:36:31 +0200
Subject: [PATCH] Fix tokenizer over-consuming character after ->> operator
 (#70)

TokenizeLongArrow called _state.Next() after confirming the second '>'
and then delegated to ConsumeForBinOp, which calls _state.Next() again.
The result was that the character immediately following `->>` was
silently eaten. When that character was the opening single quote of a
string literal, subsequent tokenization walked past the closing quote
and raised "Unterminated string literal".

This only surfaced when `->>` was written without whitespace before the
following token (e.g. `meta->>'x'`). The existing tests all used
`meta ->> 'x'`, where the swallowed character happened to be the
harmless space.

The fix drops the redundant _state.Next(), aligning `->>` with the
already-correct pattern used by `#>>` in TokenizeHash.

Regression tests are added against the dialects whose engines support
`->>` (PostgreSQL, DuckDB, MySQL, SQLite, Redshift) plus the parser's
permissive Generic dialect. `->>` is a PostgreSQL-originated extension
and is not part of ANSI/ISO SQL, so dialects that use other JSON
extraction mechanisms (Snowflake, BigQuery, MS SQL Server, Hive,
Databricks, ClickHouse, Oracle, ANSI) are intentionally excluded.
---
 .../LongArrowJsonExtractionTests.cs           | 101 ++++++++++++++++++
 src/SqlParser/Tokenizer.cs                    |   1 -
 2 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 src/SqlParser.Tests/LongArrowJsonExtractionTests.cs

diff --git a/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs b/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs
new file mode 100644
index 0000000..a91f06f
--- /dev/null
+++ b/src/SqlParser.Tests/LongArrowJsonExtractionTests.cs
@@ -0,0 +1,101 @@
+using SqlParser.Ast;
+using SqlParser.Dialects;
+using SqlParser.Tokens;
+using static SqlParser.Ast.Expression;
+
+namespace SqlParser.Tests;
+
+// Regression tests for https://github.com/TylerBrinks/SqlParser-cs/issues/70
+//
+// The `->>` (LongArrow) JSON-extraction operator was incorrectly consuming the
+// character immediately following `->>` when there was no whitespace between
+// the operator and the next token. When that character was a single quote
+// starting a string literal, the tokenizer would then run past the closing
+// quote looking for a terminator that no longer existed, raising
+// "Unterminated string literal". Example from the bug report:
+//
+//     meta->>'description'       -- failed to tokenize
+//     meta ->> 'description'     -- worked because the extra Next() ate the space
+//
+// Scope of these tests
+// --------------------
+// `->>` is NOT part of the ANSI/ISO SQL standard. It originated as a
+// PostgreSQL extension and has since been adopted by a handful of other
+// engines. It is NOT supported by Snowflake, BigQuery, MS SQL Server, Hive,
+// Databricks, ClickHouse, Oracle, or ANSI SQL, which use other mechanisms for
+// JSON extraction (`JSON_VALUE`, `get_json_object`, `col:path`, etc.).
+//
+// We therefore only exercise these tests against dialects whose engines
+// support `->>`, plus the parser's permissive Generic dialect:
+//   - PostgreSQL  (native, originated here)
+//   - DuckDB      (the dialect from the bug report)
+//   - MySQL       (since 5.7.13, as shorthand for JSON_UNQUOTE(JSON_EXTRACT(...)))
+//   - SQLite      (since 3.38.0, 2022)
+//   - Redshift    (Postgres-derived)
+//   - Generic     (the repo's permissive catch-all dialect)
+public class LongArrowJsonExtractionTests : ParserTestBase
+{
+    private static readonly Dialect[] DialectsSupportingLongArrow =
+    [
+        new PostgreSqlDialect(),
+        new DuckDbDialect(),
+        new MySqlDialect(),
+        new SQLiteDialect(),
+        new RedshiftDialect(),
+        new GenericDialect()
+    ];
+
+    public static readonly IEnumerable<object[]> LongArrowDialects =
+        DialectsSupportingLongArrow.Select(d => new object[] { d });
+
+    [Theory]
+    [MemberData(nameof(LongArrowDialects))]
+    public void Tokenize_LongArrow_Followed_By_String_Literal(Dialect dialect)
+    {
+        var tokens = new Tokenizer().Tokenize("a->>'x'", dialect);
+
+        var expected = new Token[]
+        {
+            new Word("a"),
+            new LongArrow(),
+            new SingleQuotedString("x")
+        };
+
+        TokenizerTestBase.Compare(expected, tokens);
+    }
+
+    [Theory]
+    [MemberData(nameof(LongArrowDialects))]
+    public void Parse_LongArrow_With_No_Whitespace_Before_String(Dialect dialect)
+    {
+        var parsed = new Parser().ParseSql("SELECT meta->>'description' FROM events", dialect);
+
+        Assert.Single(parsed);
+        var select = ((SetExpression.SelectExpression)parsed[0]!.AsQuery()!.Body).Select;
+
+        var expected = new SelectItem.UnnamedExpression(new BinaryOp(
+            new Identifier("meta"),
+            BinaryOperator.LongArrow,
+            new LiteralValue(new Value.SingleQuotedString("description"))
+        ));
+
+        Assert.Equal(expected, select.Projection.Single());
+    }
+
+    [Fact]
+    public void Parse_Issue_70_Repro()
+    {
+        // Exact SQL from the bug report.
+        const string sql = """
+                           select
+                           category_seq as seq,
+                           data.name as name,
+                           meta->>'description' as description
+                           from category
+                           order by seq
+                           """;
+
+        var parsed = new Parser().ParseSql(sql, new DuckDbDialect());
+        Assert.Single(parsed);
+    }
+}
diff --git a/src/SqlParser/Tokenizer.cs b/src/SqlParser/Tokenizer.cs
index 82ae9d1..f7674fb 100644
--- a/src/SqlParser/Tokenizer.cs
+++ b/src/SqlParser/Tokenizer.cs
@@ -660,7 +660,6 @@ private Token TokenizeLongArrow()
             return StartBinOp("->", new Arrow());
         }
 
-        _state.Next();
         return ConsumeForBinOp("->>", new LongArrow());
     }