From 06215042bc6ecf55744f847e7a4a068cfb57234c Mon Sep 17 00:00:00 2001 From: Varun Chawla Date: Sun, 22 Feb 2026 19:52:58 -0800 Subject: [PATCH 1/2] GH-47279: [C++] Support BinaryView/StringView in ReferencedBufferSize The `ReferencedBufferSize` function (used by pyarrow's `nbytes` property) was missing a visitor for `BinaryViewType`, causing an `ArrowTypeError` when calling `table.nbytes` on tables containing string_view or binary_view columns. Add a `Visit(const BinaryViewType&)` handler to `GetByteRangesArray` that correctly accounts for the views buffer (fixed-width, 16 bytes per element) and any out-of-line data buffers referenced by non-inline views. Since `StringViewType` inherits from `BinaryViewType`, this handles both types. --- cpp/src/arrow/util/byte_size.cc | 43 ++++++++++++++++++++++++++++ cpp/src/arrow/util/byte_size_test.cc | 37 ++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/cpp/src/arrow/util/byte_size.cc b/cpp/src/arrow/util/byte_size.cc index 53382a10775..08789fa2446 100644 --- a/cpp/src/arrow/util/byte_size.cc +++ b/cpp/src/arrow/util/byte_size.cc @@ -18,6 +18,7 @@ #include "arrow/util/byte_size.h" #include +#include #include #include "arrow/array.h" @@ -192,6 +193,48 @@ struct GetByteRangesArray { Status Visit(const LargeBinaryType& type) const { return VisitBaseBinary(type); } + Status Visit(const BinaryViewType& type) const { + using c_type = BinaryViewType::c_type; + RETURN_NOT_OK(VisitBitmap(input.buffers[0])); + + // Views buffer (buffer[1]) is fixed-width: 16 bytes per view + const Buffer& views_buffer = *input.buffers[1]; + RETURN_NOT_OK(range_starts->Append(reinterpret_cast(views_buffer.data()))); + RETURN_NOT_OK(range_offsets->Append(static_cast(offset) * sizeof(c_type))); + RETURN_NOT_OK(range_lengths->Append(static_cast(length) * sizeof(c_type))); + + // For out-of-line views, track the referenced ranges in data buffers. + // We track [min_offset, max_end) per data buffer to report a single range + // per buffer. + const c_type* views = input.GetValues(1, offset); + // Map from buffer_index to (min_offset, max_end) + std::unordered_map> buffer_ranges; + for (int64_t i = 0; i < length; i++) { + const c_type& view = views[i]; + if (!view.is_inline() && view.size() > 0) { + int32_t buf_index = view.ref.buffer_index; + int32_t buf_offset = view.ref.offset; + int32_t buf_end = buf_offset + view.size(); + auto it = buffer_ranges.find(buf_index); + if (it == buffer_ranges.end()) { + buffer_ranges[buf_index] = {buf_offset, buf_end}; + } else { + it->second.first = std::min(it->second.first, buf_offset); + it->second.second = std::max(it->second.second, buf_end); + } + } + } + for (const auto& [buf_index, range] : buffer_ranges) { + const Buffer& data_buffer = *input.buffers[2 + buf_index]; + RETURN_NOT_OK( + range_starts->Append(reinterpret_cast(data_buffer.data()))); + RETURN_NOT_OK(range_offsets->Append(static_cast(range.first))); + RETURN_NOT_OK( + range_lengths->Append(static_cast(range.second - range.first))); + } + return Status::OK(); + } + template Status VisitBaseList(const BaseListType& type) const { using offset_type = typename BaseListType::offset_type; diff --git a/cpp/src/arrow/util/byte_size_test.cc b/cpp/src/arrow/util/byte_size_test.cc index 0aaf0a76a2a..321bda4d8be 100644 --- a/cpp/src/arrow/util/byte_size_test.cc +++ b/cpp/src/arrow/util/byte_size_test.cc @@ -474,3 +474,40 @@ TEST(ByteRanges, TableNoOverlap) { } // namespace util } // namespace arrow + +TEST(ByteRanges, BinaryViewInline) { + // All inline strings (<=12 bytes each) - only bitmap + views buffers + std::shared_ptr sv_arr = + ArrayFromJSON(binary_view(), R"(["a", "bb", "ccc"])"); + // 3 views * 16 bytes = 48 bytes for views buffer + // No data buffers since everything is inline + ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); + ASSERT_EQ(48, size); + + // With nulls + std::shared_ptr sv_arr_null = + ArrayFromJSON(binary_view(), R"(["a", null, "ccc"])"); + ASSERT_OK_AND_ASSIGN(int64_t size_null, ReferencedBufferSize(*sv_arr_null)); + // 1 byte bitmap + 48 bytes views + ASSERT_EQ(49, size_null); +} + +TEST(ByteRanges, StringViewInline) { + // string_view should work exactly the same as binary_view + std::shared_ptr sv_arr = + ArrayFromJSON(utf8_view(), R"(["hello", "world"])"); + ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); + // 2 views * 16 bytes = 32 bytes + ASSERT_EQ(32, size); +} + +TEST(ByteRanges, BinaryViewOutOfLine) { + // Strings > 12 bytes are stored out-of-line in data buffers + std::shared_ptr sv_arr = + ArrayFromJSON(binary_view(), R"(["this string is longer than twelve bytes"])"); + ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); + // 1 view * 16 bytes = 16 bytes for views + // + the out-of-line data length (40 bytes for "this string is longer than twelve bytes") + ASSERT_EQ(16 + 40, size); +} + From 21e4da29ca3e8ac775cca008a2c0f16008ac4865 Mon Sep 17 00:00:00 2001 From: Varun Chawla Date: Mon, 23 Feb 2026 23:40:07 -0800 Subject: [PATCH 2/2] Fix BinaryView tests: move inside arrow::util namespace and fix formatting The new BinaryView/StringView test cases were placed outside the arrow::util namespace closing braces, causing compilation errors due to unresolved symbols (Array, ArrayFromJSON, binary_view, utf8_view, ReferencedBufferSize). Moved the tests back inside the namespace. Also fixed lint formatting: collapsed short RETURN_NOT_OK calls to single lines and wrapped a long comment to stay within line limits. --- cpp/src/arrow/util/byte_size.cc | 3 +-- cpp/src/arrow/util/byte_size_test.cc | 14 ++++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/byte_size.cc b/cpp/src/arrow/util/byte_size.cc index 08789fa2446..0b0f1b7be08 100644 --- a/cpp/src/arrow/util/byte_size.cc +++ b/cpp/src/arrow/util/byte_size.cc @@ -226,8 +226,7 @@ struct GetByteRangesArray { } for (const auto& [buf_index, range] : buffer_ranges) { const Buffer& data_buffer = *input.buffers[2 + buf_index]; - RETURN_NOT_OK( - range_starts->Append(reinterpret_cast(data_buffer.data()))); + RETURN_NOT_OK(range_starts->Append(reinterpret_cast(data_buffer.data()))); RETURN_NOT_OK(range_offsets->Append(static_cast(range.first))); RETURN_NOT_OK( range_lengths->Append(static_cast(range.second - range.first))); diff --git a/cpp/src/arrow/util/byte_size_test.cc b/cpp/src/arrow/util/byte_size_test.cc index 321bda4d8be..b4711b4e219 100644 --- a/cpp/src/arrow/util/byte_size_test.cc +++ b/cpp/src/arrow/util/byte_size_test.cc @@ -472,13 +472,9 @@ TEST(ByteRanges, TableNoOverlap) { ASSERT_OK_AND_EQ(13, ReferencedBufferSize(*table)); } -} // namespace util -} // namespace arrow - TEST(ByteRanges, BinaryViewInline) { // All inline strings (<=12 bytes each) - only bitmap + views buffers - std::shared_ptr sv_arr = - ArrayFromJSON(binary_view(), R"(["a", "bb", "ccc"])"); + std::shared_ptr sv_arr = ArrayFromJSON(binary_view(), R"(["a", "bb", "ccc"])"); // 3 views * 16 bytes = 48 bytes for views buffer // No data buffers since everything is inline ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); @@ -494,8 +490,7 @@ TEST(ByteRanges, BinaryViewInline) { TEST(ByteRanges, StringViewInline) { // string_view should work exactly the same as binary_view - std::shared_ptr sv_arr = - ArrayFromJSON(utf8_view(), R"(["hello", "world"])"); + std::shared_ptr sv_arr = ArrayFromJSON(utf8_view(), R"(["hello", "world"])"); ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); // 2 views * 16 bytes = 32 bytes ASSERT_EQ(32, size); @@ -507,7 +502,10 @@ TEST(ByteRanges, BinaryViewOutOfLine) { ArrayFromJSON(binary_view(), R"(["this string is longer than twelve bytes"])"); ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr)); // 1 view * 16 bytes = 16 bytes for views - // + the out-of-line data length (40 bytes for "this string is longer than twelve bytes") + // + the out-of-line data length (40 bytes for "this string is longer than twelve + // bytes") ASSERT_EQ(16 + 40, size); } +} // namespace util +} // namespace arrow