diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 7b6db1636234aa..384132206c77cb 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1276,6 +1276,9 @@ DEFINE_Int32(ann_index_result_cache_stale_sweep_time_sec, "1800"); // inverted index DEFINE_mDouble(inverted_index_ram_buffer_size, "512"); +// Cap the CLucene buffered postings for analyzed inverted indexes when RAM directory is disabled. +// Values <= 0 keep inverted_index_ram_buffer_size unchanged. +DEFINE_mDouble(inverted_index_ram_buffer_size_when_ram_dir_disabled, "64"); // -1 indicates not working. // Normally we should not change this, it's useful for testing. DEFINE_mInt32(inverted_index_max_buffered_docs, "-1"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 427282a4452bc4..02354ad89747c4 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1320,6 +1320,7 @@ DECLARE_Int32(ann_index_result_cache_stale_sweep_time_sec); // inverted index DECLARE_mDouble(inverted_index_ram_buffer_size); +DECLARE_mDouble(inverted_index_ram_buffer_size_when_ram_dir_disabled); DECLARE_mInt32(inverted_index_max_buffered_docs); // dict path for chinese analyzer DECLARE_String(inverted_index_dict_path); diff --git a/be/src/storage/index/inverted/inverted_index_writer.cpp b/be/src/storage/index/inverted/inverted_index_writer.cpp index 0f82b5225e666e..9ef1d37618a028 100644 --- a/be/src/storage/index/inverted/inverted_index_writer.cpp +++ b/be/src/storage/index/inverted/inverted_index_writer.cpp @@ -17,6 +17,10 @@ #include "storage/index/inverted/inverted_index_writer.h" +#include +#include + +#include "common/cast_set.h" #include "storage/index/inverted/analyzer/analyzer.h" #include "storage/index/inverted/inverted_index_common.h" #include "storage/index/inverted/inverted_index_fs_directory.h" @@ -32,6 +36,46 @@ const int32_t MAX_LEAF_COUNT = 1024; const float MAXMBSortInHeap = 512.0 * 8; const int DIMS = 1; +namespace { + +int64_t index_writer_memory_size(const std::unique_ptr& index_writer) { + if (index_writer == nullptr) { + return 0; + } + return index_writer->ramSizeInBytes(); +} + +int64_t ram_directory_memory_size(const std::shared_ptr& dir) { + if (dir == nullptr || std::strcmp(dir->getObjectName(), "DorisRAMFSDirectory") != 0) { + return 0; + } + + int64_t size = 0; + std::vector files; + dir->list(&files); + for (const auto& file : files) { + size += dir->fileLength(file.c_str()); + } + return size; +} + +bool is_fs_directory(const std::shared_ptr& dir) { + return dir != nullptr && std::strcmp(dir->getObjectName(), "DorisFSDirectory") == 0; +} + +float index_writer_ram_buffer_size(const std::shared_ptr& dir, + bool should_analyzer) { + auto ram_buffer_size = config::inverted_index_ram_buffer_size; + if (should_analyzer && is_fs_directory(dir) && ram_buffer_size > 0 && + config::inverted_index_ram_buffer_size_when_ram_dir_disabled > 0) { + ram_buffer_size = std::min(ram_buffer_size, + config::inverted_index_ram_buffer_size_when_ram_dir_disabled); + } + return static_cast(ram_buffer_size); +} + +} // namespace + template InvertedIndexColumnWriter::InvertedIndexColumnWriter(const std::string& field_name, IndexFileWriter* index_file_writer, @@ -141,7 +185,7 @@ InvertedIndexColumnWriter::create_index_writer() { { index_writer->setMaxBufferedDocs(1); }) DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMergeFactor_error", { index_writer->setMergeFactor(1); }) - index_writer->setRAMBufferSizeMB(static_cast(config::inverted_index_ram_buffer_size)); + index_writer->setRAMBufferSizeMB(index_writer_ram_buffer_size(_dir, _should_analyzer)); index_writer->setMaxBufferedDocs(config::inverted_index_max_buffered_docs); index_writer->setMaxFieldLength(MAX_FIELD_LEN); index_writer->setMergeFactor(MERGE_FACTOR); @@ -566,8 +610,14 @@ Status InvertedIndexColumnWriter::add_value(const CppType& value) { template int64_t InvertedIndexColumnWriter::size() const { - //TODO: get memory size of inverted index - return 0; + int64_t size = cast_set(_null_bitmap.getSizeInBytes(false)); + if constexpr (field_is_slice_type(field_type)) { + if (_should_analyzer) { + size += index_writer_memory_size(_index_writer); + } + } + size += ram_directory_memory_size(_dir); + return size; } template @@ -683,4 +733,4 @@ template class InvertedIndexColumnWriter; template class InvertedIndexColumnWriter; template class InvertedIndexColumnWriter; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/src/storage/segment/column_writer.cpp b/be/src/storage/segment/column_writer.cpp index 15d4f558f9b054..542d0b597c0892 100644 --- a/be/src/storage/segment/column_writer.cpp +++ b/be/src/storage/segment/column_writer.cpp @@ -721,6 +721,12 @@ uint64_t ScalarColumnWriter::estimate_buffer_size() { if (_opts.need_bloom_filter) { size += _bloom_filter_index_builder->size(); } + if (_opts.need_inverted_index) { + for (const auto& builder : _inverted_index_builders) { + DORIS_CHECK(builder != nullptr); + size += builder->size(); + } + } return size; } @@ -1121,9 +1127,18 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { } uint64_t ArrayColumnWriter::estimate_buffer_size() { - return _offset_writer->estimate_buffer_size() + - (is_nullable() ? _null_writer->estimate_buffer_size() : 0) + - _item_writer->estimate_buffer_size(); + uint64_t size = _offset_writer->estimate_buffer_size() + + (is_nullable() ? _null_writer->estimate_buffer_size() : 0) + + _item_writer->estimate_buffer_size(); + if (_opts.need_inverted_index) { + DORIS_CHECK(_inverted_index_writer != nullptr); + size += _inverted_index_writer->size(); + } + if (_opts.need_ann_index) { + DORIS_CHECK(_ann_index_writer != nullptr); + size += _ann_index_writer->size(); + } + return size; } Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t** ptr, diff --git a/be/test/storage/segment/inverted_index_writer_test.cpp b/be/test/storage/segment/inverted_index_writer_test.cpp index 8b7f56221cf8cf..7af0000d448b35 100644 --- a/be/test/storage/segment/inverted_index_writer_test.cpp +++ b/be/test/storage/segment/inverted_index_writer_test.cpp @@ -25,9 +25,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -775,6 +777,45 @@ class InvertedIndexWriterTest : public testing::Test { std::unique_ptr _inverted_index_query_cache; }; +namespace { + +TabletIndex create_standard_fulltext_index_meta(); +std::string fulltext_memory_token(size_t token_id); +std::vector fulltext_memory_strings(size_t value_count, size_t tokens_per_value); +std::vector fulltext_large_memory_strings(size_t value_count, + size_t min_bytes_per_value); +std::vector slices_from_strings(const std::vector& strings); +int64_t add_fulltext_values_and_get_peak_delta(IndexColumnWriter* column_writer, + const std::vector& values, + size_t batch_size = 32); +void check_fulltext_match(const std::shared_ptr& fulltext_reader, + const IndexQueryContextPtr& context, const std::string& field_name, + const std::string& query, uint64_t cardinality, + std::optional doc_id); + +class FullTextIndexConfigGuard { +public: + FullTextIndexConfigGuard() + : _inverted_index_ram_dir_enable(config::inverted_index_ram_dir_enable), + _inverted_index_ram_buffer_size(config::inverted_index_ram_buffer_size), + _inverted_index_ram_buffer_size_when_ram_dir_disabled( + config::inverted_index_ram_buffer_size_when_ram_dir_disabled) {} + + ~FullTextIndexConfigGuard() { + config::inverted_index_ram_dir_enable = _inverted_index_ram_dir_enable; + config::inverted_index_ram_buffer_size = _inverted_index_ram_buffer_size; + config::inverted_index_ram_buffer_size_when_ram_dir_disabled = + _inverted_index_ram_buffer_size_when_ram_dir_disabled; + } + +private: + bool _inverted_index_ram_dir_enable; + double _inverted_index_ram_buffer_size; + double _inverted_index_ram_buffer_size_when_ram_dir_disabled; +}; + +} // namespace + // Test case for writing string values TEST_F(InvertedIndexWriterTest, StringWrite) { test_string_write("test_rowset_1", 0); @@ -790,6 +831,146 @@ TEST_F(InvertedIndexWriterTest, NumericWrite) { test_numeric_write("test_rowset_3", 0); } +TEST_F(InvertedIndexWriterTest, FullTextStringMemoryEstimateIncludesBufferedPostings) { + auto tablet_schema = create_schema(); + TabletIndex idx_meta = create_standard_fulltext_index_meta(); + std::string rowset_id = "test_rowset_fulltext_memory"; + int seg_id = 0; + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + + io::FileWriterPtr file_writer; + io::FileWriterOptions opts; + auto fs = io::global_local_filesystem(); + Status sts = fs->create_file(index_path, &file_writer, &opts); + ASSERT_TRUE(sts.ok()) << sts; + auto index_file_writer = std::make_unique( + fs, index_path_prefix, rowset_id, seg_id, InvertedIndexStorageFormatPB::V2, + std::move(file_writer)); + + const TabletColumn& column = tablet_schema->column(1); + ASSERT_NE(&column, nullptr); + std::unique_ptr field(StorageFieldFactory::create(column)); + ASSERT_NE(field.get(), nullptr); + + std::unique_ptr column_writer; + auto status = IndexColumnWriter::create(field.get(), &column_writer, index_file_writer.get(), + &idx_meta); + ASSERT_TRUE(status.ok()) << status; + + constexpr size_t value_count = 256; + constexpr size_t tokens_per_value = 48; + const auto strings = fulltext_memory_strings(value_count, tokens_per_value); + const auto values = slices_from_strings(strings); + const int64_t peak_delta = add_fulltext_values_and_get_peak_delta(column_writer.get(), values); + + RecordProperty("legacy_peak_delta_bytes", 0); + RecordProperty("peak_delta_bytes", peak_delta); + EXPECT_GT(peak_delta, 256 * 1024); + EXPECT_LT(peak_delta, 8 * 1024 * 1024); + + status = column_writer->finish(); + ASSERT_TRUE(status.ok()) << status; + status = index_file_writer->begin_close(); + ASSERT_TRUE(status.ok()) << status; + status = index_file_writer->finish_close(); + ASSERT_TRUE(status.ok()) << status; + + auto reader = std::make_shared( + io::global_local_filesystem(), index_path_prefix, InvertedIndexStorageFormatPB::V2); + status = reader->init(); + ASSERT_EQ(status, Status::OK()); + auto result = reader->open(&idx_meta); + ASSERT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + + auto fulltext_reader = FullTextIndexReader::create_shared(&idx_meta, reader); + ASSERT_NE(fulltext_reader, nullptr); + + OlapReaderStatistics stats; + RuntimeState runtime_state; + TQueryOptions query_options; + query_options.enable_inverted_index_searcher_cache = false; + runtime_state.set_query_options(query_options); + io::IOContext io_ctx; + auto context = std::make_shared(); + context->io_ctx = &io_ctx; + context->stats = &stats; + context->runtime_state = &runtime_state; + std::string field_name = std::to_string(field->unique_id()); + + check_fulltext_match(fulltext_reader, context, field_name, "sharedterm", value_count, + std::nullopt); + constexpr size_t selected_row = 17; + check_fulltext_match(fulltext_reader, context, field_name, + fulltext_memory_token(selected_row * tokens_per_value + 5), 1, + selected_row); +} + +TEST_F(InvertedIndexWriterTest, FullTextLargeStringRamDirDisabledCapsBufferedPostings) { + FullTextIndexConfigGuard config_guard; + config::inverted_index_ram_dir_enable = false; + config::inverted_index_ram_buffer_size = 512; + + auto tablet_schema = create_schema(); + TabletIndex idx_meta = create_standard_fulltext_index_meta(); + constexpr size_t value_count = 2; + constexpr size_t min_bytes_per_value = 1024 * 1024; + const auto strings = fulltext_large_memory_strings(value_count, min_bytes_per_value); + ASSERT_GE(strings.front().size(), min_bytes_per_value); + const auto values = slices_from_strings(strings); + + auto write_and_measure = [&](std::string_view rowset_id, int seg_id, int64_t* peak_delta) { + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + + io::FileWriterPtr file_writer; + io::FileWriterOptions opts; + auto fs = io::global_local_filesystem(); + Status sts = fs->create_file(index_path, &file_writer, &opts); + ASSERT_TRUE(sts.ok()) << sts; + auto index_file_writer = std::make_unique( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V2, std::move(file_writer)); + + const TabletColumn& column = tablet_schema->column(1); + ASSERT_NE(&column, nullptr); + std::unique_ptr field(StorageFieldFactory::create(column)); + ASSERT_NE(field.get(), nullptr); + + std::unique_ptr column_writer; + auto status = IndexColumnWriter::create(field.get(), &column_writer, + index_file_writer.get(), &idx_meta); + ASSERT_TRUE(status.ok()) << status; + + *peak_delta = add_fulltext_values_and_get_peak_delta(column_writer.get(), values, 1); + + status = column_writer->finish(); + ASSERT_TRUE(status.ok()) << status; + status = index_file_writer->begin_close(); + ASSERT_TRUE(status.ok()) << status; + status = index_file_writer->finish_close(); + ASSERT_TRUE(status.ok()) << status; + }; + + int64_t uncapped_peak_delta = 0; + config::inverted_index_ram_buffer_size_when_ram_dir_disabled = 0; + ASSERT_NO_FATAL_FAILURE( + write_and_measure("test_rowset_large_string_uncapped", 0, &uncapped_peak_delta)); + + int64_t capped_peak_delta = 0; + config::inverted_index_ram_buffer_size_when_ram_dir_disabled = 1; + ASSERT_NO_FATAL_FAILURE( + write_and_measure("test_rowset_large_string_capped", 1, &capped_peak_delta)); + + RecordProperty("uncapped_peak_delta_bytes", uncapped_peak_delta); + RecordProperty("capped_peak_delta_bytes", capped_peak_delta); + EXPECT_GT(uncapped_peak_delta, 4 * 1024 * 1024); + EXPECT_LT(capped_peak_delta, 2 * 1024 * 1024); + EXPECT_LT(capped_peak_delta * 2, uncapped_peak_delta); +} + // Test case for Unicode string values with enable_correct_term_write=true TEST_F(InvertedIndexWriterTest, UnicodeStringWriteEnabled) { test_unicode_string_write("test_rowset_4", 0, true); @@ -1544,4 +1725,100 @@ TEST_F(InvertedIndexWriterTest, NormsFileCreationWithTokenization) { << "inverted_index_writer.cpp where .nrm file creation depends on _should_analyzer."; } -} // namespace doris::segment_v2 \ No newline at end of file +namespace { + +TabletIndex create_standard_fulltext_index_meta() { + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(1); + index_meta_pb->set_index_name("test"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(1); + auto* properties = index_meta_pb->mutable_properties(); + (*properties)["parser"] = "standard"; + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + return idx_meta; +} + +std::string fulltext_memory_token(size_t token_id) { + std::string token = "term"; + for (int i = 0; i < 5; ++i) { + token.push_back(static_cast('a' + token_id % 26)); + token_id /= 26; + } + return token; +} + +std::vector fulltext_memory_strings(size_t value_count, size_t tokens_per_value) { + std::vector strings; + strings.reserve(value_count); + for (size_t row = 0; row < value_count; ++row) { + std::string value = "sharedterm"; + for (size_t token_idx = 0; token_idx < tokens_per_value; ++token_idx) { + value.append(" "); + value.append(fulltext_memory_token(row * tokens_per_value + token_idx)); + } + strings.emplace_back(std::move(value)); + } + return strings; +} + +std::vector fulltext_large_memory_strings(size_t value_count, + size_t min_bytes_per_value) { + std::vector strings; + strings.reserve(value_count); + size_t token_id = 0; + for (size_t row = 0; row < value_count; ++row) { + std::string value = "sharedterm"; + while (value.size() < min_bytes_per_value) { + value.append(" "); + value.append(fulltext_memory_token(token_id++)); + } + strings.emplace_back(std::move(value)); + } + return strings; +} + +std::vector slices_from_strings(const std::vector& strings) { + std::vector values; + values.reserve(strings.size()); + for (const auto& value : strings) { + values.emplace_back(value); + } + return values; +} + +int64_t add_fulltext_values_and_get_peak_delta(IndexColumnWriter* column_writer, + const std::vector& values, + size_t batch_size) { + const int64_t initial_size = column_writer->size(); + int64_t peak_size = initial_size; + for (size_t offset = 0; offset < values.size(); offset += batch_size) { + size_t count = std::min(batch_size, values.size() - offset); + auto status = column_writer->add_values("c2", values.data() + offset, count); + EXPECT_TRUE(status.ok()) << status; + peak_size = std::max(peak_size, column_writer->size()); + } + return peak_size - initial_size; +} + +void check_fulltext_match(const std::shared_ptr& fulltext_reader, + const IndexQueryContextPtr& context, const std::string& field_name, + const std::string& query, uint64_t cardinality, + std::optional doc_id) { + std::shared_ptr bitmap = std::make_shared(); + StringRef query_ref(query.c_str(), query.length()); + auto status = fulltext_reader->query(context, field_name, &query_ref, + InvertedIndexQueryType::MATCH_ANY_QUERY, bitmap); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(bitmap->cardinality(), cardinality); + if (doc_id.has_value()) { + EXPECT_TRUE(bitmap->contains(doc_id.value())); + } +} + +} // namespace + +} // namespace doris::segment_v2 diff --git a/contrib/clucene b/contrib/clucene index c51b5cc9adc638..bab8109b4b3e12 160000 --- a/contrib/clucene +++ b/contrib/clucene @@ -1 +1 @@ -Subproject commit c51b5cc9adc63817ad8322f617c75737ece7288d +Subproject commit bab8109b4b3e12c1b1b35247c05891051ebacc16