diff --git a/cpp/src/arrow/util/byte_size.cc b/cpp/src/arrow/util/byte_size.cc
index 53382a10775..0b0f1b7be08 100644
--- a/cpp/src/arrow/util/byte_size.cc
+++ b/cpp/src/arrow/util/byte_size.cc
@@ -18,6 +18,7 @@
 #include "arrow/util/byte_size.h"
 
 #include <cstdint>
 #include <unordered_set>
+#include <vector>
 
 #include "arrow/array.h"
@@ -192,6 +193,77 @@ struct GetByteRangesArray {
 
   Status Visit(const LargeBinaryType& type) const { return VisitBaseBinary(type); }
 
+  // Reports the ranges referenced by a binary-view / string-view array: the
+  // validity bitmap, the views buffer, and one range per data buffer that a
+  // valid out-of-line view points into.
+  Status Visit(const BinaryViewType& type) const {
+    using c_type = BinaryViewType::c_type;
+    RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
+
+    // Views buffer (buffer[1]) is fixed-width: 16 bytes per view
+    const Buffer& views_buffer = *input.buffers[1];
+    RETURN_NOT_OK(range_starts->Append(reinterpret_cast<uint64_t>(views_buffer.data())));
+    RETURN_NOT_OK(range_offsets->Append(static_cast<uint64_t>(offset) * sizeof(c_type)));
+    RETURN_NOT_OK(range_lengths->Append(static_cast<uint64_t>(length) * sizeof(c_type)));
+
+    // Out-of-line views reference bytes in the data buffers (buffer[2..]). All
+    // references into one data buffer are collapsed into a single
+    // [begin, end) span. Spans are stored per buffer index so they are emitted
+    // in a deterministic order, and use 64-bit bounds so that offset + size
+    // cannot overflow.
+    struct Span {
+      int64_t begin = 0;
+      int64_t end = 0;
+      bool used = false;
+    };
+    const int64_t num_data_buffers = static_cast<int64_t>(input.buffers.size()) - 2;
+    std::vector<Span> spans(static_cast<size_t>(num_data_buffers));
+
+    // Null slots are not guaranteed to hold a meaningful view, so only valid
+    // slots are read. `offset` is an absolute element index, as in GetValues().
+    const uint8_t* validity = input.buffers[0] ? input.buffers[0]->data() : nullptr;
+    const c_type* views = input.GetValues<c_type>(1, offset);
+    for (int64_t i = 0; i < length; ++i) {
+      const int64_t slot = offset + i;
+      if (validity != nullptr && ((validity[slot / 8] >> (slot % 8)) & 1) == 0) {
+        continue;
+      }
+      const c_type& view = views[i];
+      if (view.is_inline() || view.size() <= 0) {
+        continue;
+      }
+      const int64_t buf_index = view.ref.buffer_index;
+      if (buf_index < 0 || buf_index >= num_data_buffers) {
+        return Status::Invalid("Binary view references data buffer ", buf_index,
+                               " but only ", num_data_buffers, " are present");
+      }
+      const int64_t begin = view.ref.offset;
+      const int64_t end = begin + view.size();
+      Span& span = spans[static_cast<size_t>(buf_index)];
+      if (!span.used) {
+        span.begin = begin;
+        span.end = end;
+        span.used = true;
+      } else {
+        if (begin < span.begin) span.begin = begin;
+        if (end > span.end) span.end = end;
+      }
+    }
+    for (int64_t b = 0; b < num_data_buffers; ++b) {
+      const Span& span = spans[static_cast<size_t>(b)];
+      if (!span.used) {
+        continue;
+      }
+      const Buffer& data_buffer = *input.buffers[static_cast<size_t>(b) + 2];
+      RETURN_NOT_OK(
+          range_starts->Append(reinterpret_cast<uint64_t>(data_buffer.data())));
+      RETURN_NOT_OK(range_offsets->Append(static_cast<uint64_t>(span.begin)));
+      RETURN_NOT_OK(
+          range_lengths->Append(static_cast<uint64_t>(span.end - span.begin)));
+    }
+    return Status::OK();
+  }
+
   template <typename BaseListType>
   Status VisitBaseList(const BaseListType& type) const {
     using offset_type = typename BaseListType::offset_type;
diff --git a/cpp/src/arrow/util/byte_size_test.cc b/cpp/src/arrow/util/byte_size_test.cc
index 0aaf0a76a2a..b4711b4e219 100644
--- a/cpp/src/arrow/util/byte_size_test.cc
+++ b/cpp/src/arrow/util/byte_size_test.cc
@@ -472,5 +472,40 @@ TEST(ByteRanges, TableNoOverlap) {
   ASSERT_OK_AND_EQ(13, ReferencedBufferSize(*table));
 }
 
+TEST(ByteRanges, BinaryViewInline) {
+  // All inline strings (<=12 bytes each) - only bitmap + views buffers
+  std::shared_ptr<Array> sv_arr = ArrayFromJSON(binary_view(), R"(["a", "bb", "ccc"])");
+  // 3 views * 16 bytes = 48 bytes for views buffer
+  // No data buffers since everything is inline
+  ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr));
+  ASSERT_EQ(48, size);
+
+  // With nulls
+  std::shared_ptr<Array> sv_arr_null =
+      ArrayFromJSON(binary_view(), R"(["a", null, "ccc"])");
+  ASSERT_OK_AND_ASSIGN(int64_t size_null, ReferencedBufferSize(*sv_arr_null));
+  // 1 byte bitmap + 48 bytes views
+  ASSERT_EQ(49, size_null);
+}
+
+TEST(ByteRanges, StringViewInline) {
+  // string_view should work exactly the same as binary_view
+  std::shared_ptr<Array> sv_arr = ArrayFromJSON(utf8_view(), R"(["hello", "world"])");
+  ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr));
+  // 2 views * 16 bytes = 32 bytes
+  ASSERT_EQ(32, size);
+}
+
+TEST(ByteRanges, BinaryViewOutOfLine) {
+  // Strings > 12 bytes are stored out-of-line in data buffers
+  std::shared_ptr<Array> sv_arr =
+      ArrayFromJSON(binary_view(), R"(["this string is longer than twelve bytes"])");
+  ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr));
+  // 1 view * 16 bytes = 16 bytes for views
+  // + the out-of-line data length (39 bytes for "this string is longer than twelve
+  // bytes")
+  ASSERT_EQ(16 + 39, size);
+}
+
 }  // namespace util
 }  // namespace arrow