-
Notifications
You must be signed in to change notification settings - Fork 92
Perf: optimize Tablet write with columnar string storage and lazy DeviceID construction (~10x throughput) #748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,10 +98,9 @@ int Tablet::init() { | |
| case BLOB: | ||
| case TEXT: | ||
| case STRING: { | ||
| value_matrix_[c].string_data = | ||
| static_cast<common::String*>(common::mem_alloc( | ||
| sizeof(String) * max_row_num_, common::MOD_TABLET)); | ||
| if (value_matrix_[c].string_data == nullptr) return E_OOM; | ||
| auto* sc = new StringColumn(); | ||
| sc->init(max_row_num_, max_row_num_ * 32); | ||
| value_matrix_[c].string_col = sc; | ||
| break; | ||
| } | ||
| default: | ||
|
|
@@ -117,6 +116,7 @@ int Tablet::init() { | |
| new (&bitmaps_[c]) BitMap(); | ||
| bitmaps_[c].init(max_row_num_, false); | ||
| } | ||
|
|
||
| return E_OK; | ||
| } | ||
|
|
||
|
|
@@ -150,7 +150,8 @@ void Tablet::destroy() { | |
| case BLOB: | ||
| case TEXT: | ||
| case STRING: | ||
| common::mem_free(value_matrix_[c].string_data); | ||
| value_matrix_[c].string_col->destroy(); | ||
| delete value_matrix_[c].string_col; | ||
| break; | ||
| default: | ||
| break; | ||
|
|
@@ -293,8 +294,7 @@ void* Tablet::get_value(int row_index, uint32_t schema_index, | |
| return &double_values[row_index]; | ||
| } | ||
| case STRING: { | ||
| auto string_values = column_values.string_data; | ||
| return &string_values[row_index]; | ||
| return &column_values.string_col->get_string_view(row_index); | ||
| } | ||
| default: | ||
| return nullptr; | ||
|
|
@@ -304,8 +304,8 @@ void* Tablet::get_value(int row_index, uint32_t schema_index, | |
// Specialization of Tablet::process_val for common::String values: stores the
// string for cell (row_index, schema_index) and then clears that row's bit in
// the column bitmap, which marks the cell as non-null.
// NOTE(review): this page renders removed and added diff lines together. The
// dup_from(...) rows look like the pre-change side of the hunk and the
// string_col->append(...) rows like the post-change side (the hunk header
// reports 8 lines on each side) - confirm that only one of the two statements
// exists in the real source.
| template <> | ||
| void Tablet::process_val(uint32_t row_index, uint32_t schema_index, | ||
| common::String str) { | ||
| value_matrix_[schema_index].string_data[row_index].dup_from(str, | ||
| page_arena_); | ||
| value_matrix_[schema_index].string_col->append(row_index, str.buf_, | ||
| str.len_); | ||
| bitmaps_[schema_index].clear(row_index); /* mark as non-null */ | ||
| } | ||
|
|
||
|
|
@@ -444,6 +444,57 @@ void Tablet::set_column_categories( | |
| } | ||
| } | ||
|
|
||
| void Tablet::reset_string_columns() { | ||
| size_t schema_count = schema_vec_->size(); | ||
| for (size_t c = 0; c < schema_count; c++) { | ||
| const MeasurementSchema& schema = schema_vec_->at(c); | ||
| if (schema.data_type_ == STRING || schema.data_type_ == TEXT || | ||
| schema.data_type_ == BLOB) { | ||
| value_matrix_[c].string_col->reset(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| std::vector<uint32_t> Tablet::find_all_device_boundaries() const { | ||
| const uint32_t row_count = get_cur_row_size(); | ||
| if (row_count <= 1) return {}; | ||
|
|
||
| // Use uint64_t bitmap instead of vector<bool> for faster set/test/scan. | ||
| const uint32_t nwords = (row_count + 63) / 64; | ||
| std::vector<uint64_t> boundary(nwords, 0); | ||
|
|
||
| for (auto col_idx : id_column_indexes_) { | ||
| const StringColumn& sc = *value_matrix_[col_idx].string_col; | ||
| const uint32_t* off = sc.offsets; | ||
| const char* buf = sc.buffer; | ||
| for (uint32_t i = 1; i < row_count; i++) { | ||
| if (boundary[i >> 6] & (1ULL << (i & 63))) continue; | ||
| uint32_t len_a = off[i] - off[i - 1]; | ||
| uint32_t len_b = off[i + 1] - off[i]; | ||
| if (len_a != len_b || | ||
| (len_a > 0 && | ||
| memcmp(buf + off[i - 1], buf + off[i], len_a) != 0)) { | ||
| boundary[i >> 6] |= (1ULL << (i & 63)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the number of boundaries reaches the number of rows, may break. |
||
| } | ||
| } | ||
| } | ||
|
Comment on lines
+466
to
+480
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May traverse tag columns in reversed order, because we tend to organize tags from big (like country) to small (like street). |
||
|
|
||
| // Collect boundary positions using bitscan | ||
| std::vector<uint32_t> result; | ||
| for (uint32_t w = 0; w < nwords; w++) { | ||
| uint64_t bits = boundary[w]; | ||
| while (bits) { | ||
| uint32_t bit = __builtin_ctzll(bits); | ||
| uint32_t idx = w * 64 + bit; | ||
| if (idx > 0 && idx < row_count) { | ||
| result.push_back(idx); | ||
| } | ||
| bits &= bits - 1; // clear lowest set bit | ||
| } | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| std::shared_ptr<IDeviceID> Tablet::get_device_id(int i) const { | ||
| std::vector<std::string*> id_array; | ||
| id_array.push_back(new std::string(insert_target_name_)); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,14 +46,78 @@ class TabletColIterator; | |
| * with their associated metadata such as column names and types. | ||
| */ | ||
| class Tablet { | ||
| // Arrow-style string column: offsets + contiguous buffer. | ||
| // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i] | ||
| struct StringColumn { | ||
| uint32_t* offsets; // length: max_rows + 1 | ||
| char* buffer; // contiguous string data | ||
| uint32_t buf_capacity; // allocated buffer size | ||
| uint32_t buf_used; // bytes written so far | ||
|
|
||
| StringColumn() | ||
| : offsets(nullptr), buffer(nullptr), buf_capacity(0), buf_used(0) {} | ||
|
|
||
| void init(uint32_t max_rows, uint32_t init_buf_capacity) { | ||
| offsets = (uint32_t*)common::mem_alloc( | ||
| sizeof(uint32_t) * (max_rows + 1), common::MOD_DEFAULT); | ||
| offsets[0] = 0; | ||
| buf_capacity = init_buf_capacity; | ||
| buffer = | ||
| (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT); | ||
| buf_used = 0; | ||
| } | ||
|
|
||
| void destroy() { | ||
| if (offsets) common::mem_free(offsets); | ||
| offsets = nullptr; | ||
| if (buffer) common::mem_free(buffer); | ||
| buffer = nullptr; | ||
| buf_capacity = buf_used = 0; | ||
| } | ||
|
|
||
| void reset() { | ||
| buf_used = 0; | ||
| if (offsets) offsets[0] = 0; | ||
| } | ||
|
|
||
| void append(uint32_t row, const char* data, uint32_t len) { | ||
| // Grow buffer if needed | ||
| if (buf_used + len > buf_capacity) { | ||
| buf_capacity = buf_capacity * 2 + len; | ||
| buffer = (char*)common::mem_realloc(buffer, buf_capacity); | ||
| } | ||
| memcpy(buffer + buf_used, data, len); | ||
| offsets[row] = buf_used; | ||
| offsets[row + 1] = buf_used + len; | ||
| buf_used += len; | ||
| } | ||
|
Comment on lines
+83
to
+93
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If data equals the value of the previous row, may simply use the same offsets and avoid a memory copy. |
||
|
|
||
| const char* get_str(uint32_t row) const { | ||
| return buffer + offsets[row]; | ||
| } | ||
| uint32_t get_len(uint32_t row) const { | ||
| return offsets[row + 1] - offsets[row]; | ||
| } | ||
| // Return a String view for a given row. The returned reference is | ||
| // valid until the next call to get_string_view on this column. | ||
| common::String& get_string_view(uint32_t row) { | ||
| view_cache_.buf_ = buffer + offsets[row]; | ||
| view_cache_.len_ = offsets[row + 1] - offsets[row]; | ||
| return view_cache_; | ||
| } | ||
|
|
||
| private: | ||
| common::String view_cache_; | ||
| }; | ||
|
|
||
// Storage handle for one tablet column: a single pointer whose active union
// member depends on the column's data type.
// NOTE(review): this page renders removed and added diff lines together. In
// the visible hunks string_data is only used by the pre-change allocation
// code and string_col by the new StringColumn code - confirm which of the two
// members remains in the merged header.
| struct ValueMatrixEntry { | ||
| union { | ||
| int32_t* int32_data; | ||
| int64_t* int64_data; | ||
| float* float_data; | ||
| double* double_data; | ||
| bool* bool_data; | ||
| common::String* string_data; | ||
| StringColumn* string_col; | ||
| }; | ||
| }; | ||
|
|
||
|
|
@@ -220,6 +284,7 @@ class Tablet { | |
| void set_column_categories( | ||
| const std::vector<common::ColumnCategory>& column_categories); | ||
| std::shared_ptr<IDeviceID> get_device_id(int i) const; | ||
| std::vector<uint32_t> find_all_device_boundaries() const; | ||
| /** | ||
| * @brief Template function to add a value of type T to the specified row | ||
| * and column by name. | ||
|
|
@@ -253,6 +318,8 @@ class Tablet { | |
| schema_map_ = schema_map; | ||
| } | ||
|
|
||
| void reset_string_columns(); | ||
|
|
||
| friend class TabletColIterator; | ||
| friend class TsFileWriter; | ||
| friend struct MeasurementNamesFromTablet; | ||
|
|
@@ -265,7 +332,6 @@ class Tablet { | |
| private: | ||
| template <typename T> | ||
| void process_val(uint32_t row_index, uint32_t schema_index, T val); | ||
| common::PageArena page_arena_{common::MOD_TABLET}; | ||
| uint32_t max_row_num_; | ||
| uint32_t cur_row_size_; | ||
| std::string insert_target_name_; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
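Use mem_alloc? (The anchor of this comment is lost on this page; it presumably refers to the `new StringColumn()` / `delete` pair added in Tablet::init and Tablet::destroy, while the other allocations in this diff go through common::mem_alloc.)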