Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions cpp/src/arrow/util/byte_size.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "arrow/util/byte_size.h"

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "arrow/array.h"
Expand Down Expand Up @@ -192,6 +193,47 @@ struct GetByteRangesArray {

// Large binary arrays: delegate to VisitBaseBinary.
Status Visit(const LargeBinaryType& type) const {
  return VisitBaseBinary(type);
}

// Appends the byte ranges referenced by a binary-view array to
// range_starts / range_offsets / range_lengths:
//   1. the validity bitmap (via VisitBitmap),
//   2. the slice [offset, offset + length) of the views buffer,
//   3. for every data buffer referenced by at least one out-of-line view, a
//      single range covering [min_offset, max_end) of the referenced bytes.
// Data-buffer ranges are emitted in ascending buffer index so the output
// order is deterministic.
//
// Returns Status::Invalid if a view refers to a data buffer that is not
// present in `input`; otherwise propagates any error from the Append calls.
//
// NOTE(review): like the previous implementation, every slot in the slice is
// inspected, including null slots. Confirm whether null slots are guaranteed
// to hold well-formed (e.g. zeroed) views; if not, they should be skipped
// using the validity bitmap.
Status Visit(const BinaryViewType& type) const {
  using c_type = BinaryViewType::c_type;
  RETURN_NOT_OK(VisitBitmap(input.buffers[0]));

  // Views buffer (buffer[1]) is fixed-width: 16 bytes per view
  const Buffer& views_buffer = *input.buffers[1];
  RETURN_NOT_OK(range_starts->Append(reinterpret_cast<uint64_t>(views_buffer.data())));
  RETURN_NOT_OK(range_offsets->Append(static_cast<uint64_t>(offset) * sizeof(c_type)));
  RETURN_NOT_OK(range_lengths->Append(static_cast<uint64_t>(length) * sizeof(c_type)));

  // Referenced extent of one data buffer. Kept in 64 bits so that
  // `offset + size` cannot overflow for two int32 operands.
  struct DataSpan {
    int64_t begin;
    int64_t end;
    bool used;
  };

  // Data buffers follow the validity bitmap and the views buffer, i.e. data
  // buffer k lives at input.buffers[2 + k]. One (value-initialized, unused)
  // span per data buffer.
  const auto num_buffers = input.buffers.size();
  std::vector<DataSpan> spans(num_buffers > 2 ? num_buffers - 2 : 0);

  const c_type* views = input.GetValues<c_type>(1, offset);
  for (int64_t i = 0; i < length; i++) {
    const c_type& view = views[i];
    // Inline and empty views do not reference any data buffer.
    if (view.is_inline() || view.size() <= 0) {
      continue;
    }
    const int32_t buf_index = view.ref.buffer_index;
    if (buf_index < 0 || static_cast<uint64_t>(buf_index) >= spans.size()) {
      return Status::Invalid("Binary view at position ", i, " references data buffer ",
                             buf_index, " but the array has only ", spans.size(),
                             " data buffer(s)");
    }
    const int64_t begin = view.ref.offset;
    const int64_t end = begin + view.size();
    DataSpan& span = spans[buf_index];
    if (!span.used) {
      span = DataSpan{begin, end, true};
    } else {
      span.begin = std::min(span.begin, begin);
      span.end = std::max(span.end, end);
    }
  }

  for (uint64_t k = 0; k < spans.size(); ++k) {
    const DataSpan& span = spans[k];
    if (!span.used) {
      continue;
    }
    const auto& data_buffer = input.buffers[2 + k];
    if (data_buffer == nullptr) {
      return Status::Invalid("Binary view array references data buffer ", k,
                             " which is null");
    }
    RETURN_NOT_OK(range_starts->Append(reinterpret_cast<uint64_t>(data_buffer->data())));
    RETURN_NOT_OK(range_offsets->Append(static_cast<uint64_t>(span.begin)));
    RETURN_NOT_OK(range_lengths->Append(static_cast<uint64_t>(span.end - span.begin)));
  }
  return Status::OK();
}

template <typename BaseListType>
Status VisitBaseList(const BaseListType& type) const {
using offset_type = typename BaseListType::offset_type;
Expand Down
35 changes: 35 additions & 0 deletions cpp/src/arrow/util/byte_size_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -472,5 +472,40 @@ TEST(ByteRanges, TableNoOverlap) {
ASSERT_OK_AND_EQ(13, ReferencedBufferSize(*table));
}

TEST(ByteRanges, BinaryViewInline) {
  // Every value is short enough (<= 12 bytes) to be stored inline, so there
  // are no data buffers: only the views buffer counts.
  // 3 views * 16 bytes = 48 bytes.
  const std::shared_ptr<Array> inline_only =
      ArrayFromJSON(binary_view(), R"(["a", "bb", "ccc"])");
  ASSERT_OK_AND_EQ(48, ReferencedBufferSize(*inline_only));

  // A null entry brings in the validity bitmap as well:
  // 1 byte of bitmap + 48 bytes of views.
  const std::shared_ptr<Array> with_null =
      ArrayFromJSON(binary_view(), R"(["a", null, "ccc"])");
  ASSERT_OK_AND_EQ(49, ReferencedBufferSize(*with_null));
}

TEST(ByteRanges, StringViewInline) {
  // utf8_view is expected to be measured exactly like binary_view:
  // 2 inline views * 16 bytes = 32 bytes.
  const std::shared_ptr<Array> inline_strings =
      ArrayFromJSON(utf8_view(), R"(["hello", "world"])");
  ASSERT_OK_AND_EQ(32, ReferencedBufferSize(*inline_strings));
}

TEST(ByteRanges, BinaryViewOutOfLine) {
  // Strings > 12 bytes are stored out-of-line in data buffers
  std::shared_ptr<Array> sv_arr =
      ArrayFromJSON(binary_view(), R"(["this string is longer than twelve bytes"])");
  ASSERT_OK_AND_ASSIGN(int64_t size, ReferencedBufferSize(*sv_arr));
  // 1 view * 16 bytes = 16 bytes for views
  // + the out-of-line data length: "this string is longer than twelve bytes"
  // is 33 letters + 6 spaces = 39 bytes.
  ASSERT_EQ(16 + 39, size);
}

} // namespace util
} // namespace arrow
Loading