From 3eb3a801403b6f18d62dba227826578ffd96cd6b Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Mon, 11 May 2026 21:40:16 +0800 Subject: [PATCH] [fix](be) Correct Arrow UTF8/String size limit (#63137) Issue Number: None Related PR: None Problem Summary: Fix the Arrow UTF8 max size constant so it matches the documented 2G limit (1 << 31) instead of the 2M value (1 << 21) it was mistakenly set to. None - Test: Regression test (arrow_flight_sql_p0/test_select adds JSONB cases, including a ~2.1 MB value that is larger than the old 2M constant) - Behavior changed: Yes (Arrow UTF8 size limit now matches the intended 2G threshold) - Does this need documentation: No ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. 
### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/util/arrow/row_batch.h | 2 +- .../data/arrow_flight_sql_p0/test_select.out | 8 ++++- .../arrow_flight_sql_p0/test_select.groovy | 36 ++++++++++++++++++- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/be/src/util/arrow/row_batch.h b/be/src/util/arrow/row_batch.h index 6dafbe8cab6d22..54da5f522b49dd 100644 --- a/be/src/util/arrow/row_batch.h +++ b/be/src/util/arrow/row_batch.h @@ -39,7 +39,7 @@ class Schema; namespace doris { -constexpr size_t MAX_ARROW_UTF8 = (1ULL << 21); // 2G +constexpr size_t MAX_ARROW_UTF8 = (1ULL << 31); // 2G class RowDescriptor; diff --git a/regression-test/data/arrow_flight_sql_p0/test_select.out b/regression-test/data/arrow_flight_sql_p0/test_select.out index f2f4b86bbf5ceb..62888cd3dfcabd 100644 --- a/regression-test/data/arrow_flight_sql_p0/test_select.out +++ b/regression-test/data/arrow_flight_sql_p0/test_select.out @@ -5,4 +5,10 @@ -- !arrow_flight_sql_datetime -- 333 plsql333 2024-07-21 12:00:00.123456 2024-07-21 12:00:00.0 222 plsql222 2024-07-20 12:00:00.123456 2024-07-20 12:00:00.0 -111 plsql111 2024-07-19 12:00:00.123456 2024-07-19 12:00:00.0 \ No newline at end of file +111 plsql111 2024-07-19 12:00:00.123456 2024-07-19 12:00:00.0 + +-- !arrow_flight_sql_jsonb -- +1 {"k1":1,"k2":"v2"} +2 [1,2,{"nested":true}] +3 \N + diff --git a/regression-test/suites/arrow_flight_sql_p0/test_select.groovy b/regression-test/suites/arrow_flight_sql_p0/test_select.groovy index 950fb4af7e9034..85f119fc2c3a9b 100644 --- a/regression-test/suites/arrow_flight_sql_p0/test_select.groovy +++ b/regression-test/suites/arrow_flight_sql_p0/test_select.groovy @@ -26,7 +26,7 @@ suite("test_select", "arrow_flight_sql") { sql """INSERT INTO ${tableName} VALUES(222, "plsql222")""" sql """INSERT INTO ${tableName} VALUES(333, "plsql333")""" sql """INSERT INTO ${tableName} VALUES(111, 
"plsql333")""" - + qt_arrow_flight_sql "select sum(id) as a, count(1) as b from ${tableName}" tableName = "test_select_datetime" @@ -40,4 +40,38 @@ suite("test_select", "arrow_flight_sql") { sql """INSERT INTO ${tableName} VALUES(333, "plsql333","2024-07-21 12:00:00.123456","2024-07-21 12:00:00")""" qt_arrow_flight_sql_datetime "select * from ${tableName} order by id desc" + + tableName = "test_select_jsonb" + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + create table ${tableName} (id int, payload jsonb) DUPLICATE key(`id`) distributed by hash (`id`) buckets 4 + properties ("replication_num"="1"); + """ + sql """ + INSERT INTO ${tableName} VALUES + (1, '{"k1": 1, "k2": "v2"}'), + (2, '[1, 2, {"nested": true}]'), + (3, NULL) + """ + + qt_arrow_flight_sql_jsonb "select id, payload from ${tableName} order by id" + + def largeJsonValueSize = 2100000 + sql """ + INSERT INTO ${tableName} + SELECT 4, CAST(CONCAT('{"large":"', REPEAT('x', ${largeJsonValueSize}), '"}') AS JSONB) + """ + + // This ~2.1 MB row is larger than the old, incorrect MAX_ARROW_UTF8 (1 << 21) but far below the corrected 2G limit (1 << 31); with this fix it no longer crosses the limit, and the assertions below check that the value round-trips intact. + def largeJsonbResult = arrow_flight_sql """ + select payload, length(cast(payload as string)) from ${tableName} where id = 4 + """ + assertEquals(1, largeJsonbResult.size()) + assertEquals(2, largeJsonbResult[0].size()) + def expectedLargeJsonbSize = largeJsonValueSize + '{"large":""}'.length() + def largeJsonb = largeJsonbResult[0][0].toString() + assertEquals(expectedLargeJsonbSize, largeJsonb.length()) + assertEquals(expectedLargeJsonbSize, (largeJsonbResult[0][1] as Number).intValue()) + assertTrue(largeJsonb.startsWith('{"large":"')) + assertTrue(largeJsonb.endsWith('"}')) }