From 9422ba321db2bc73c06ae6013cfe5dfac27be94c Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Tue, 16 Jun 2026 22:24:59 +0800
Subject: [PATCH 1/9] TsFile C++: batch + SIMD + parallel read/write
 optimization

Batch decode/encode APIs (PLAIN / TS2DIFF / Gorilla) with single-pass
TsBlock decode, AVX2/NEON SIMD paths, a single process-wide worker pool
for chunk-level parallel read and column-parallel write, and batched
NEON statistics. On-disk format unchanged; interoperable with Java/Python.
---
 cpp/CMakeLists.txt                            |   11 +-
 cpp/pom.xml                                   |    4 +-
 cpp/src/CMakeLists.txt                        |   23 +-
 cpp/src/common/CMakeLists.txt                 |   10 +-
 cpp/src/common/allocator/alloc_base.h         |   24 +-
 cpp/src/common/allocator/byte_stream.h        |  173 +-
 cpp/src/common/allocator/mem_alloc.cc         |   12 +-
 cpp/src/common/allocator/page_arena.h         |   13 +
 cpp/src/common/config/config.h                |   33 +-
 cpp/src/common/container/bit_map.cc           |    5 +-
 cpp/src/common/container/bit_map.h            |   70 +-
 cpp/src/common/container/byte_buffer.h        |    6 +-
 cpp/src/common/device_id.cc                   |    2 +-
 cpp/src/common/global.cc                      |   90 +-
 cpp/src/common/global.h                       |   40 +-
 cpp/src/common/mutex/CMakeLists.txt           |   20 -
 cpp/src/common/mutex/mutex.h                  |   57 -
 cpp/src/common/path.cc                        |   78 -
 cpp/src/common/path.h                         |   59 +-
 cpp/src/common/seq_tvlist.h                   |  172 --
 cpp/src/common/seq_tvlist.inc                 |  174 --
 cpp/src/common/statistic.h                    |  372 ++-
 cpp/src/common/tablet.cc                      |  213 +-
 cpp/src/common/tablet.h                       |   82 +-
 cpp/src/common/thread_pool.h                  |   51 +-
 cpp/src/common/tsblock/tsblock.h              |   65 +-
 .../tsblock/vector/variable_length_vector.h   |   11 +-
 cpp/src/common/tsblock/vector/vector.h        |    3 +
 cpp/src/common/tsfile_common.h                |   62 +-
 cpp/src/compress/lz4_compressor.cc            |   16 +-
 cpp/src/compress/snappy_compressor.cc         |   19 +-
 cpp/src/compress/uncompressed_compressor.h    |   51 +-
 cpp/src/cwrapper/arrow_c.cc                   |   23 +-
 cpp/src/cwrapper/tsfile_cwrapper.cc           |  111 +-
 cpp/src/cwrapper/tsfile_cwrapper.h            |   52 +-
 cpp/src/encoding/decoder.h                    |  135 +
 cpp/src/encoding/dictionary_encoder.h         |    7 +-
 cpp/src/encoding/encoder.h                    |   75 +
 cpp/src/encoding/gorilla_decoder.h            |  474 +++-
 cpp/src/encoding/plain_decoder.h              |  144 +
 cpp/src/encoding/plain_encoder.h              |  191 +-
 cpp/src/encoding/ts2diff_decoder.h            |  673 ++++-
 cpp/src/encoding/ts2diff_encoder.h            |  395 ++-
 cpp/src/file/read_file.cc                     |    1 +
 cpp/src/file/restorable_tsfile_io_writer.cc   |   72 +-
 cpp/src/file/tsfile_io_reader.cc              |  297 ++-
 cpp/src/file/tsfile_io_reader.h               |   31 +
 cpp/src/file/tsfile_io_writer.cc              |   88 +-
 cpp/src/file/tsfile_io_writer.h               |   27 +-
 cpp/src/reader/aligned_chunk_reader.cc        | 2322 ++++++++++++++++-
 cpp/src/reader/aligned_chunk_reader.h         |  198 +-
 .../block/single_device_tsblock_reader.cc     |  752 +++++-
 .../block/single_device_tsblock_reader.h      |   35 +-
 cpp/src/reader/bloom_filter.cc                |   20 +
 cpp/src/reader/bloom_filter.h                 |    8 +
 cpp/src/reader/chunk_reader.cc                |  364 ++-
 cpp/src/reader/chunk_reader.h                 |   20 +-
 cpp/src/reader/device_meta_iterator.cc        |   12 +-
 cpp/src/reader/filter/and_filter.h            |   23 +
 cpp/src/reader/filter/filter.h                |   14 +
 cpp/src/reader/filter/or_filter.h             |   23 +
 cpp/src/reader/filter/time_operator.cc        |  335 ++-
 cpp/src/reader/filter/time_operator.h         |   18 +
 cpp/src/reader/qds_without_timegenerator.cc   |    7 +-
 cpp/src/reader/result_set.h                   |   54 +-
 cpp/src/reader/table_result_set.cc            |  104 +-
 cpp/src/reader/table_result_set.h             |   22 +-
 cpp/src/reader/task/device_query_task.cc      |   10 +-
 cpp/src/reader/task/device_task_iterator.cc   |    3 +
 cpp/src/reader/task/device_task_iterator.h    |   13 +-
 cpp/src/reader/tsfile_reader.cc               |   72 +-
 cpp/src/reader/tsfile_reader.h                |    5 +-
 cpp/src/reader/tsfile_series_scan_iterator.cc |  286 +-
 cpp/src/reader/tsfile_series_scan_iterator.h  |   59 +-
 cpp/src/utils/db_utils.h                      |    2 -
 cpp/src/writer/chunk_writer.cc                |    3 +
 cpp/src/writer/chunk_writer.h                 |   62 +
 cpp/src/writer/page_writer.cc                 |   13 +
 cpp/src/writer/page_writer.h                  |   73 +-
 cpp/src/writer/time_chunk_writer.cc           |    6 +-
 cpp/src/writer/time_chunk_writer.h            |   57 +-
 cpp/src/writer/time_page_writer.h             |   32 +-
 cpp/src/writer/tsfile_table_writer.cc         |   39 +-
 cpp/src/writer/tsfile_table_writer.h          |    2 +
 cpp/src/writer/tsfile_writer.cc               | 1397 +++++-----
 cpp/src/writer/tsfile_writer.h                |   93 +-
 cpp/src/writer/value_chunk_writer.cc          |   13 +-
 cpp/src/writer/value_chunk_writer.h           |   87 +-
 cpp/src/writer/value_page_writer.cc           |    6 +
 cpp/src/writer/value_page_writer.h            |  195 +-
 cpp/test/CMakeLists.txt                       |    3 +-
 cpp/test/common/allocator/byte_stream_test.cc |  105 +-
 cpp/test/common/tablet_test.cc                |  138 +
 cpp/test/common/thread_pool_test.cc           |   66 +
 cpp/test/common/tsfile_common_test.cc         |   25 +
 cpp/test/compress/lz4_compressor_test.cc      |   36 +
 cpp/test/compress/snappy_compressor_test.cc   |   36 +
 .../compress/uncompressed_compressor_test.cc  |   74 +
 cpp/test/cwrapper/c_release_test.cc           |   25 +-
 cpp/test/cwrapper/cwrapper_test.cc            |  153 +-
 .../cwrapper/query_by_row_cwrapper_test.cc    |    2 +-
 cpp/test/encoding/encoding_coverage_test.cc   |  406 +++
 cpp/test/encoding/gorilla_codec_test.cc       |  315 +++
 cpp/test/encoding/plain_codec_test.cc         |   86 +
 cpp/test/encoding/ts2diff_codec_test.cc       |  116 +
 .../file/restorable_tsfile_io_writer_test.cc  |   68 +-
 cpp/test/file/write_file_test.cc              |   44 +
 cpp/test/reader/filter/time_in_filter_test.cc |   84 +
 .../reader/query_by_row_performance_test.cc   |    4 +-
 .../tsfile_reader_table_batch_test.cc         |  217 ++
 .../table_view/tsfile_reader_table_test.cc    |   58 +-
 .../tsfile_table_query_by_row_test.cc         |  166 +-
 .../tree_view/tsfile_reader_tree_test.cc      |   45 +
 .../tsfile_tree_query_by_row_test.cc          |   93 +-
 cpp/test/reader/tsfile_reader_test.cc         |  598 +++++
 .../table_view/tsfile_writer_table_test.cc    |   50 +-
 cpp/test/writer/tsfile_writer_test.cc         |  436 ++++
 cpp/test/writer/value_page_writer_test.cc     |   33 +
 python/tests/test_tsfile_dataset.py           |   53 +-
 python/tsfile/dataset/reader.py               |   39 +-
 python/tsfile/tsfile_reader.pyx               |    4 +-
 121 files changed, 12937 insertions(+), 2317 deletions(-)
 delete mode 100644 cpp/src/common/mutex/CMakeLists.txt
 delete mode 100644 cpp/src/common/mutex/mutex.h
 delete mode 100644 cpp/src/common/path.cc
 delete mode 100644 cpp/src/common/seq_tvlist.h
 delete mode 100644 cpp/src/common/seq_tvlist.inc
 create mode 100644 cpp/test/common/thread_pool_test.cc
 create mode 100644 cpp/test/compress/uncompressed_compressor_test.cc
 create mode 100644 cpp/test/encoding/encoding_coverage_test.cc
 create mode 100644 cpp/test/reader/filter/time_in_filter_test.cc

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 319752482..07b4f6fc5 100755
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -129,7 +129,16 @@ else ()
         if (CMAKE_BUILD_TYPE STREQUAL "Debug")
             set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g")
         elseif (CMAKE_BUILD_TYPE STREQUAL "Release")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+            # -flto + MinGW gcc + statically-linked antlr4_static produces
+            # unresolved-reference errors at link time (LTO intermediate objects
+            # can't see the .a's vtable thunks). -march=native is also a poor
+            # default for CI binaries shipped to other machines. Keep both on
+            # Linux/macOS where the optimization actually pays off.
+            if (MINGW OR WIN32)
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+            else ()
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto")
+            endif ()
         elseif (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
             set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O2 -g")
         elseif (CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
diff --git a/cpp/pom.xml b/cpp/pom.xml
index 5415212f0..153e75dc2 100644
--- a/cpp/pom.xml
+++ b/cpp/pom.xml
@@ -99,8 +99,8 @@
                                     plugin's generate goal throw an NPE.
                                 -->
                             </options>
-                            <sourcePath />
-                            <targetPath />
+                            <sourcePath/>
+                            <targetPath/>
                         </configuration>
                     </execution>
                     <!-- Compile the test code -->
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 93342c113..895c1ddba 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -37,6 +37,9 @@ message("cmake using: ENABLE_LZOKAY=${ENABLE_LZOKAY}")
 option(ENABLE_ZLIB "Enable Zlib compression" ON)
 message("cmake using: ENABLE_ZLIB=${ENABLE_ZLIB}")
 
+# ENABLE_SIMD is defined in the top-level CMakeLists.txt
+message("cmake using: ENABLE_SIMD=${ENABLE_SIMD}")
+
 message("Running in src directory")
 if (${COV_ENABLED})
     add_compile_options(-fprofile-arcs -ftest-coverage)
@@ -89,6 +92,13 @@ if (ENABLE_ANTLR4)
     message("Adding ANTLR4 include directory")
 endif()
 
+if (ENABLE_SIMD)
+    add_definitions(-DENABLE_SIMD)
+    list(APPEND PROJECT_INCLUDE_DIR
+            ${CMAKE_SOURCE_DIR}/third_party/simde-0.8.4-rc3
+    )
+endif()
+
 include_directories(${PROJECT_INCLUDE_DIR})
 
 # Mark every translation unit that is compiled into the tsfile library so that
@@ -144,10 +154,17 @@ add_library(tsfile SHARED)
 
 if (${COV_ENABLED})
     message("Enable code cov...")
+    # Apple clang ships coverage runtime via --coverage; libgcov isn't a
+    # standalone library on macOS.  Use --coverage there.
+    if (APPLE)
+        set(COV_LINK_LIB --coverage)
+    else()
+        set(COV_LINK_LIB -lgcov)
+    endif()
     if (ENABLE_ANTLR4)
-        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj -lgcov)
+        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj ${COV_LINK_LIB})
     else()
-        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj -lgcov)
+        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj ${COV_LINK_LIB})
     endif()
 else()
     message("Disable code cov...")
@@ -171,4 +188,4 @@ set_target_properties(tsfile PROPERTIES SOVERSION ${LIBTSFILE_SO_VERSION})
 install(TARGETS tsfile
         RUNTIME DESTINATION ${LIBRARY_OUTPUT_PATH}
         LIBRARY DESTINATION ${LIBRARY_OUTPUT_PATH}
-        ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH})
\ No newline at end of file
+        ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH})
diff --git a/cpp/src/common/CMakeLists.txt b/cpp/src/common/CMakeLists.txt
index 4406cb219..60e0fdccf 100644
--- a/cpp/src/common/CMakeLists.txt
+++ b/cpp/src/common/CMakeLists.txt
@@ -22,21 +22,15 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} common_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/allocator common_allocator_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/container common_container_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/tsblock common_tsblock_SRC_LIST)
-aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/mutex common_mutex_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/datatype common_datatype_SRC_LIST)
 
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-add_library(common_obj OBJECT ${common_SRC_LIST} 
+add_library(common_obj OBJECT ${common_SRC_LIST}
     ${common_allocator_SRC_LIST}
     ${common_container_SRC_LIST}
-    ${common_tsblock_SRC_LIST} 
-    ${common_mutex_SRC_LIST} 
+    ${common_tsblock_SRC_LIST}
     ${common_datatype_SRC_LIST})
 
-if (ENABLE_ANTLR4)
-    target_compile_definitions(common_obj PRIVATE ENABLE_ANTLR4)
-endif()
-
 # install header files recursively
 file(GLOB_RECURSE HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
 copy_to_dir(${HEADERS} "common_obj")
\ No newline at end of file
diff --git a/cpp/src/common/allocator/alloc_base.h b/cpp/src/common/allocator/alloc_base.h
index c89aed077..dd2e0ab61 100644
--- a/cpp/src/common/allocator/alloc_base.h
+++ b/cpp/src/common/allocator/alloc_base.h
@@ -82,35 +82,43 @@ class ModStat {
     }
     void init();
     void destroy();
-    INLINE void update_alloc(AllocModID mid, int32_t size) {
+    INLINE void update_alloc(AllocModID mid, int64_t size) {
 #ifdef ENABLE_MEM_STAT
         ASSERT(mid < __LAST_MOD_ID);
         ATOMIC_FAA(get_item(mid), size);
 #endif
     }
-    void update_free(AllocModID mid, uint32_t size) {
+    void update_free(AllocModID mid, uint64_t size) {
 #ifdef ENABLE_MEM_STAT
         ASSERT(mid < __LAST_MOD_ID);
-        ATOMIC_FAA(get_item(mid), 0 - size);
+        ATOMIC_FAA(get_item(mid), -static_cast<int64_t>(size));
 #endif
     }
     void print_stat();
 
+    int64_t get_stat(int8_t mid) {
+#ifdef ENABLE_MEM_STAT
+        if (stat_arr_ != NULL && mid < __LAST_MOD_ID)
+            return ATOMIC_FAA(get_item(mid), 0LL);
+#endif
+        return 0;
+    }
+
 #ifdef ENABLE_TEST
-    int32_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0); }
+    int64_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0LL); }
 #endif
 
    private:
-    INLINE int32_t* get_item(int8_t mid) {
-        return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int32_t))]);
+    INLINE int64_t* get_item(int8_t mid) {
+        return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int64_t))]);
     }
 
    private:
     static const int32_t ITEM_SIZE = CACHE_LINE_SIZE;
     static const int32_t ITEM_COUNT = __LAST_MOD_ID;
-    int32_t* stat_arr_;
+    int64_t* stat_arr_;
 
-    STATIC_ASSERT((ITEM_SIZE % sizeof(int32_t) == 0), ModStat_ITEM_SIZE_ERROR);
+    STATIC_ASSERT((ITEM_SIZE % sizeof(int64_t) == 0), ModStat_ITEM_SIZE_ERROR);
 };
 
 /* base allocator */
diff --git a/cpp/src/common/allocator/byte_stream.h b/cpp/src/common/allocator/byte_stream.h
index 435a1f6fd..ad8dbb90d 100644
--- a/cpp/src/common/allocator/byte_stream.h
+++ b/cpp/src/common/allocator/byte_stream.h
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <atomic>
 #include <iostream>
 #include <string>
 
@@ -33,51 +34,51 @@
 
 namespace common {
 
+// std::atomic<T> as the actual storage so the MSVC fallback no longer needs
+// `reinterpret_cast<atomic<T>*>(T*)` — that cast is UB because the underlying
+// object was never constructed as a std::atomic<T>.  When the caller asks for
+// non-atomic mode we still go through the atomic interface but with
+// memory_order_relaxed, which on x86/ARM compiles to a plain load/store.
+// std::atomic<T> is non-copyable, so neither is OptionalAtomic; existing
+// callers either construct in place or use shallow_clone_from / store.
 template <typename T>
 class OptionalAtomic {
    public:
     OptionalAtomic(T t, bool enable_atomic = false)
         : val_(t), enable_atomic_(enable_atomic) {}
 
+    OptionalAtomic(const OptionalAtomic&) = delete;
+    OptionalAtomic& operator=(const OptionalAtomic&) = delete;
+    OptionalAtomic(OptionalAtomic&&) = delete;
+    OptionalAtomic& operator=(OptionalAtomic&&) = delete;
+
     FORCE_INLINE T load() const {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_LOAD(&val_);
-        } else {
-            return val_;
-        }
+        return val_.load(UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst
+                                                  : std::memory_order_relaxed);
     }
 
     FORCE_INLINE void store(const T t) {
-        if (UNLIKELY(enable_atomic_)) {
-            ATOMIC_STORE(&val_, t);
-        } else {
-            val_ = t;
-        }
+        val_.store(t, UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst
+                                               : std::memory_order_relaxed);
     }
 
     FORCE_INLINE T atomic_faa(const T increment) {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_FAA(&val_, increment);
-        } else {
-            T old_val = val_;
-            val_ = val_ + increment;
-            return old_val;
-        }
+        return val_.fetch_add(increment, UNLIKELY(enable_atomic_)
+                                             ? std::memory_order_seq_cst
+                                             : std::memory_order_relaxed);
     }
 
     FORCE_INLINE T atomic_aaf(const T increment) {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_AAF(&val_, increment);
-        } else {
-            val_ = val_ + increment;
-            return val_;
-        }
+        return val_.fetch_add(increment, UNLIKELY(enable_atomic_)
+                                             ? std::memory_order_seq_cst
+                                             : std::memory_order_relaxed) +
+               increment;
     }
 
     FORCE_INLINE bool enable_atomic() const { return enable_atomic_; }
 
    private:
-    T val_;
+    std::atomic<T> val_;
     bool enable_atomic_;
 };
 
@@ -231,6 +232,23 @@ FORCE_INLINE double bytes_to_double(uint8_t bytes[8]) {
 
 // TODO define a WrappedByteStream class
 
+// Round n up to the next power of two (>=1). Used to normalize ByteStream
+// page sizes so that `& page_mask_` is equivalent to `% page_size_`.
+// Values above the largest power-of-two that fits in uint32_t are clamped to
+// 0x80000000 — the previous `while (ps < n) ps <<= 1` would shift past 2^31
+// and overflow to 0, looping forever.
+FORCE_INLINE uint32_t round_up_pow2(uint32_t n) {
+    if (n <= 1) return 1;
+    if (n > 0x80000000u) return 0x80000000u;
+    uint32_t v = n - 1;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return v + 1;
+}
+
 // auto extend buffer for serialization
 class ByteStream {
    private:
@@ -253,6 +271,8 @@ class ByteStream {
     };
 
    public:
+    static const uint32_t DEFAULT_PAGE_SIZE = 1024;
+
     ByteStream(uint32_t page_size, AllocModID mid, bool enable_atomic = false,
                BaseAllocator& allocator = g_base_allocator)
         : allocator_(allocator),
@@ -262,11 +282,16 @@ class ByteStream {
           total_size_(0, enable_atomic),
           read_pos_(0),
           marked_read_pos_(0),
-          page_size_(page_size),
+          // page_mask_ is used as a bitmask in the hot read/write paths
+          // (`x & page_mask_` instead of `x % page_size_`), which only
+          // matches modulo arithmetic when page_size_ is a power of two.
+          // Round up so callers passing non-power-of-2 sizes still get a
+          // correctly-sized page, at the cost of <2x memory in the worst
+          // case (e.g. 1000 → 1024).
+          page_size_(round_up_pow2(page_size)),
+          page_mask_(round_up_pow2(page_size) - 1),
           mid_(mid),
-          wrapped_page_(false, nullptr) {
-        // assert(page_size >= 16);  // commented out by gxh on 2023.03.09
-    }
+          wrapped_page_(false, nullptr) {}
 
     // for wrap plain buffer to ByteStream
     ByteStream(AllocModID mid = MOD_DEFAULT)
@@ -278,6 +303,7 @@ class ByteStream {
           read_pos_(0),
           marked_read_pos_(0),
           page_size_(0),
+          page_mask_(0),
           mid_(mid),
           wrapped_page_(false, nullptr) {}
 
@@ -290,7 +316,10 @@ class ByteStream {
         wrapped_page_.next_.store(nullptr);
         wrapped_page_.buf_ = (uint8_t*)buf;
 
-        page_size_ = buf_len;
+        // page_mask_ is used as a bitmask; only correct for power-of-2
+        // page sizes (see ByteStream ctor comment).
+        page_size_ = round_up_pow2(static_cast<uint32_t>(buf_len));
+        page_mask_ = page_size_ - 1;
         head_.store(&wrapped_page_);
         tail_.store(&wrapped_page_);
         total_size_.store(buf_len);
@@ -305,14 +334,14 @@ class ByteStream {
     void clear_wrapped_buf() { wrapped_page_.buf_ = nullptr; }
 
     /* ================ Part 1: basic ================ */
-    FORCE_INLINE uint32_t remaining_size() const {
+    FORCE_INLINE uint64_t remaining_size() const {
         ASSERT(total_size_.load() >= read_pos_);
         return total_size_.load() - read_pos_;
     }
     FORCE_INLINE bool has_remaining() const { return remaining_size() > 0; }
 
     FORCE_INLINE void mark_read_pos() { marked_read_pos_ = read_pos_; }
-    FORCE_INLINE uint32_t get_mark_len() const {
+    FORCE_INLINE uint64_t get_mark_len() const {
         ASSERT(marked_read_pos_ <= read_pos_);
         return read_pos_ - marked_read_pos_;
     }
@@ -339,30 +368,46 @@ class ByteStream {
     // never used TODO
     void shallow_clone_from(ByteStream& other) {
         this->page_size_ = other.page_size_;
+        this->page_mask_ = other.page_mask_;
         this->mid_ = other.mid_;
         this->head_.store(other.head_.load());
         this->tail_.store(other.tail_.load());
         this->total_size_.store(other.total_size_.load());
     }
 
-    FORCE_INLINE uint32_t total_size() const { return total_size_.load(); }
-    FORCE_INLINE uint32_t read_pos() const { return read_pos_; };
+    FORCE_INLINE uint64_t total_size() const { return total_size_.load(); }
+    FORCE_INLINE uint64_t read_pos() const { return read_pos_; };
+    // Sum of bytes physically allocated for this stream's pages.  For a
+    // wrapped stream this just reports total_size(); for an owning stream
+    // it counts page_size_ per backing page so callers doing memory-pressure
+    // accounting see the real footprint, not the few bytes that happen to
+    // have been written into the latest 64 KiB page.
+    FORCE_INLINE uint64_t allocated_bytes() const {
+        if (is_wrapped()) return total_size_.load();
+        uint64_t total = 0;
+        Page* p = head_.load();
+        while (p != nullptr) {
+            total += page_size_;
+            p = p->next_.load();
+        }
+        return total;
+    }
     /**
      * Seek the read cursor to an absolute offset. Re-anchors read_page_ for
      * multi-page streams.
      */
-    void set_read_pos(uint32_t pos) {
+    void set_read_pos(uint64_t pos) {
         ASSERT(pos <= total_size());
         read_pos_ = pos;
         Page* p = head_.load();
-        uint32_t skipped = 0;
+        uint64_t skipped = 0;
         while (p != nullptr && skipped + page_size_ <= pos) {
             skipped += page_size_;
             p = p->next_.load();
         }
         read_page_ = p;
     }
-    FORCE_INLINE void wrapped_buf_advance_read_pos(uint32_t size) {
+    FORCE_INLINE void wrapped_buf_advance_read_pos(uint64_t size) {
         if (size + read_pos_ > total_size_.load()) {
             read_pos_ = total_size_.load();
         } else {
@@ -380,10 +425,10 @@ class ByteStream {
                 std::cout << "write_buf error " << ret << std::endl;
                 return ret;
             }
-            uint32_t remainder = page_size_ - (total_size_.load() % page_size_);
+            uint32_t remainder = page_size_ - (total_size_.load() & page_mask_);
             uint32_t copy_len =
                 remainder < (len - write_len) ? remainder : (len - write_len);
-            memcpy(tail_.load()->buf_ + total_size_.load() % page_size_,
+            memcpy(tail_.load()->buf_ + (total_size_.load() & page_mask_),
                    buf + write_len, copy_len);
             total_size_.atomic_aaf(copy_len);
             write_len += copy_len;
@@ -404,11 +449,11 @@ class ByteStream {
             if (RET_FAIL(check_space())) {
                 return ret;
             }
-            uint32_t remainder = page_size_ - (read_pos_ % page_size_);
+            uint32_t remainder = page_size_ - (read_pos_ & page_mask_);
             uint32_t copy_len = remainder < want_len_limited - read_len
                                     ? remainder
                                     : want_len_limited - read_len;
-            memcpy(buf + read_len, read_page_->buf_ + (read_pos_ % page_size_),
+            memcpy(buf + read_len, read_page_->buf_ + (read_pos_ & page_mask_),
                    copy_len);
             read_len += copy_len;
             read_pos_ += copy_len;
@@ -460,16 +505,17 @@ class ByteStream {
             return b;
         }
         b.buf_ =
-            (char*)(tail_.load()->buf_ + (total_size_.load() % page_size_));
-        b.len_ = page_size_ - (total_size_.load() % page_size_);
+            (char*)(tail_.load()->buf_ + (total_size_.load() & page_mask_));
+        b.len_ = page_size_ - (total_size_.load() & page_mask_);
         return b;
     }
 
     void buffer_used(uint32_t used_bytes) {
         ASSERT(used_bytes >= 1);
         // would not span page
-        ASSERT((total_size_.load() / page_size_) ==
-               ((total_size_.load() + used_bytes - 1) / page_size_));
+        ASSERT(page_size_ == 0 ||
+               (total_size_.load() / page_size_) ==
+                   ((total_size_.load() + used_bytes - 1) / page_size_));
         total_size_.atomic_aaf(used_bytes);
     }
 
@@ -485,7 +531,7 @@ class ByteStream {
             if (RET_FAIL(prepare_space())) {
                 return ret;
             }
-            uint32_t remainder = page_size_ - (total_size_.load() % page_size_);
+            uint32_t remainder = page_size_ - (total_size_.load() & page_mask_);
             uint32_t step =
                 remainder < (len - advanced) ? remainder : (len - advanced);
             total_size_.atomic_aaf(step);
@@ -504,6 +550,7 @@ class ByteStream {
         Page* cur_;
         Page* end_;
         int64_t total_size_;
+        int64_t consumed_ = 0;
         BufferIterator(const ByteStream& bs) : host_(bs) {
             cur_ = bs.head_.load();
             end_ = bs.tail_.load();
@@ -514,13 +561,17 @@ class ByteStream {
             Buffer b;
             if (cur_ != nullptr) {
                 b.buf_ = (char*)cur_->buf_;
-                if (cur_ == end_ &&
-                    host_.total_size_.load() % host_.page_size_ != 0) {
-                    b.len_ = host_.total_size_.load() % host_.page_size_;
+                if (cur_ == end_) {
+                    // Last page: clamp to remaining total_size_. For wrapped
+                    // streams page_size_ may have been rounded up past the
+                    // user buffer (see wrap_from), so we must not return
+                    // page_size_ as the length here.
+                    b.len_ = static_cast<uint32_t>(total_size_ - consumed_);
                 } else {
                     b.len_ = host_.page_size_;
                 }
                 ASSERT(b.len_ > 0);
+                consumed_ += b.len_;
                 cur_ = cur_->next_.load();
             }
             return b;
@@ -566,7 +617,7 @@ class ByteStream {
 
             // get tail position <tail_, total_size_> atomically
             Page* host_end = nullptr;
-            uint32_t host_total_size = 0;
+            uint64_t host_total_size = 0;
             while (true) {
                 host_end = host_.tail_.load();
                 host_total_size = host_.total_size_.load();
@@ -577,7 +628,7 @@ class ByteStream {
 
             while (true) {
                 if (cur_ == host_end) {
-                    if (host_total_size % host_.page_size_ == 0) {
+                    if ((host_total_size & host_.page_mask_) == 0) {
                         if (read_offset_within_cur_page_ == host_.page_size_) {
                             return b;
                         } else {
@@ -591,15 +642,15 @@ class ByteStream {
                         }
                     } else {
                         if (read_offset_within_cur_page_ ==
-                            (host_total_size % host_.page_size_)) {
+                            (host_total_size & host_.page_mask_)) {
                             return b;
                         } else {
                             b.buf_ = ((char*)(cur_->buf_)) +
                                      read_offset_within_cur_page_;
-                            b.len_ = (host_total_size % host_.page_size_) -
+                            b.len_ = (host_total_size & host_.page_mask_) -
                                      read_offset_within_cur_page_;
                             read_offset_within_cur_page_ =
-                                (host_total_size % host_.page_size_);
+                                (host_total_size & host_.page_mask_);
                             total_end_offset_ += b.len_;
                             return b;
                         }
@@ -629,7 +680,7 @@ class ByteStream {
     FORCE_INLINE int prepare_space() {
         int ret = common::E_OK;
         if (UNLIKELY(tail_.load() == nullptr ||
-                     total_size_.load() % page_size_ == 0)) {
+                     (total_size_.load() & page_mask_) == 0)) {
             Page* p = nullptr;
             if (RET_FAIL(alloc_page(p))) {
                 return ret;
@@ -646,7 +697,7 @@ class ByteStream {
         }
         if (UNLIKELY(read_page_ == nullptr)) {
             read_page_ = head_.load();
-        } else if (UNLIKELY(read_pos_ % page_size_ == 0)) {
+        } else if (UNLIKELY((read_pos_ & page_mask_) == 0)) {
             read_page_ = read_page_->next_.load();
         }
         if (UNLIKELY(read_page_ == nullptr)) {
@@ -682,10 +733,14 @@ class ByteStream {
     OptionalAtomic<Page*> head_;
     OptionalAtomic<Page*> tail_;
     Page* read_page_;  // only one thread is allow to reader this ByteStream
-    OptionalAtomic<uint32_t> total_size_;  // total size in byte
-    uint32_t read_pos_;                    // current reader position
-    uint32_t marked_read_pos_;             // current reader position
+    OptionalAtomic<uint64_t> total_size_;  // total size in byte
+    // 64-bit so streams that legitimately grow past 4 GiB don't truncate
+    // the read cursor (e.g. concatenated chunk buffers in the writer's
+    // write_stream_ before the next flush).
+    uint64_t read_pos_;         // current reader position
+    uint64_t marked_read_pos_;  // current reader position
     uint32_t page_size_;
+    uint32_t page_mask_;  // page_size_ - 1, for bitwise AND instead of modulo
     AllocModID mid_;
 
    public:
@@ -1185,6 +1240,7 @@ class SerializationUtil {
     // indicates that memory has been allocated and must be freed.
     FORCE_INLINE static int read_var_char_ptr(std::string*& str,
                                               ByteStream& in) {
+        str = nullptr;
         int ret = common::E_OK;
         int32_t len = 0;
         int32_t read_len = 0;
@@ -1192,7 +1248,6 @@ class SerializationUtil {
             return ret;
         } else {
             if (len == storage::NO_STR_TO_READ) {
-                str = nullptr;
                 return ret;
             } else {
                 char* tmp_buf =
diff --git a/cpp/src/common/allocator/mem_alloc.cc b/cpp/src/common/allocator/mem_alloc.cc
index 524287e75..b7c5c09c1 100644
--- a/cpp/src/common/allocator/mem_alloc.cc
+++ b/cpp/src/common/allocator/mem_alloc.cc
@@ -95,7 +95,7 @@ void* mem_alloc(uint32_t size, AllocModID mid) {
     auto high4b = static_cast<uint32_t>(header >> 32);
     *reinterpret_cast<uint32_t*>(raw) = high4b;
     *reinterpret_cast<uint32_t*>(raw + 4) = low4b;
-    ModStat::get_instance().update_alloc(mid, static_cast<int32_t>(size));
+    ModStat::get_instance().update_alloc(mid, static_cast<int64_t>(size));
     return raw + header_size;
 }
 
@@ -158,7 +158,7 @@ void* mem_realloc(void* ptr, uint32_t size) {
     *reinterpret_cast<uint32_t*>(p) = high4b;
     *reinterpret_cast<uint32_t*>(p + 4) = low4b;
     ModStat::get_instance().update_alloc(
-        mid, int32_t(size) - int32_t(original_size));
+        mid, int64_t(size) - int64_t(original_size));
     return p + ALIGNMENT;
 }
 
@@ -166,9 +166,9 @@ void ModStat::init() {
     if (stat_arr_ != NULL) {
         return;
     }
-    stat_arr_ = (int32_t*)(::malloc(ITEM_SIZE * ITEM_COUNT));
+    stat_arr_ = (int64_t*)(::malloc(ITEM_SIZE * ITEM_COUNT));
     for (int8_t i = 0; i < __LAST_MOD_ID; i++) {
-        int32_t* item = get_item(i);
+        int64_t* item = get_item(i);
         *item = 0;
     }
 }
@@ -183,14 +183,14 @@ void ModStat::print_stat() {
 
     struct Entry {
         const char* name;
-        int32_t val;
+        int64_t val;
     };
     Entry entries[__LAST_MOD_ID];
     int count = 0;
     int64_t total = 0;
 
     for (int i = 0; i < __LAST_MOD_ID; i++) {
-        int32_t val = ATOMIC_FAA(get_item(i), 0);
+        int64_t val = ATOMIC_FAA(get_item(i), 0LL);
         total += val;
         if (val != 0) {
             entries[count++] = {g_mod_names[i], val};
diff --git a/cpp/src/common/allocator/page_arena.h b/cpp/src/common/allocator/page_arena.h
index 9b8ce5ef6..c0dfbebb9 100644
--- a/cpp/src/common/allocator/page_arena.h
+++ b/cpp/src/common/allocator/page_arena.h
@@ -47,6 +47,19 @@ class PageArena {
     FORCE_INLINE void destroy() { reset(); }
     void reset();
 
+    // Returns the number of bytes actually consumed across all pages.
+    // This is the precise M_meta size: metadata structs are not data-encoded,
+    // so arena used bytes == metadata memory exactly.
+    int64_t get_total_used_bytes() const {
+        int64_t total = 0;
+        Page* p = dummy_head_.next_;
+        while (p) {
+            total += p->cur_alloc_ - reinterpret_cast<char*>(p + 1);
+            p = p->next_;
+        }
+        return total;
+    }
+
 #ifdef ENABLE_TEST
     int TEST_get_page_count() const {
         int count = 0;
diff --git a/cpp/src/common/config/config.h b/cpp/src/common/config/config.h
index e2b2039a7..5cf968688 100644
--- a/cpp/src/common/config/config.h
+++ b/cpp/src/common/config/config.h
@@ -36,7 +36,7 @@ typedef struct ConfigValue {
     TSEncoding time_encoding_type_;
     TSDataType time_data_type_;
     CompressionType time_compress_type_;
-    int32_t chunk_group_size_threshold_;
+    int64_t chunk_group_size_threshold_;
     int32_t record_count_for_next_mem_check_;
     bool encrypt_flag_ = false;
     TSEncoding boolean_encoding_type_;
@@ -46,14 +46,21 @@ typedef struct ConfigValue {
     TSEncoding double_encoding_type_;
     TSEncoding string_encoding_type_;
     CompressionType default_compression_type_;
+    bool parallel_read_enabled_;
     bool parallel_write_enabled_;
-    int32_t write_thread_count_;
-    // When true, aligned writer enforces page size limit strictly by
-    // interleaving time/value writes and sealing pages together when any side
-    // becomes full.
-    // When false, aligned writer may disable some page-size checks to improve
-    // write performance.
-    bool strict_page_size_ = true;
+    // Size of the single global worker pool (common::g_thread_pool_) shared by
+    // the parallel write and parallel read paths.  The pool is (re)created from
+    // this value in init_common().  Like sync_on_close_/encrypt_flag_ it keeps
+    // its in-class default rather than being reset by init_config_value(), so a
+    // set_thread_count() call made before libtsfile_init() actually sizes the
+    // pool instead of being clobbered by the init-time defaults.
+    int32_t thread_count_ = 6;
+    // Durability knob: when true (default), TsFileIOWriter::end_file() issues
+    // an fsync() before closing so that a process / OS crash cannot leave a
+    // partially-flushed file behind. Disabling this trades durability for
+    // throughput: writes return success as soon as data is in the page cache.
+    // Only set to false if the caller drives its own fsync policy.
+    bool sync_on_close_ = true;
 } ConfigValue;
 
 extern void init_config_value();
@@ -62,10 +69,14 @@ extern CompressionType get_default_compressor();
 // In the future, configuration items need to be dynamically adjusted according
 // to the level
 extern void set_config_value();
-extern void config_set_page_max_point_count(uint32_t page_max_point_count);
-extern void config_set_max_degree_of_index_node(
+// Public config setters: validate at the entry point and return
+// E_INVALID_ARG when the requested value is outside the supported range.
+// On rejection the underlying field is left untouched so the writer keeps
+// running with whatever value it had before — callers that don't check the
+// return are no worse off than they were before validation existed.
+extern int config_set_page_max_point_count(uint32_t page_max_point_count);
+extern int config_set_max_degree_of_index_node(
     uint32_t max_degree_of_index_node);
-extern void config_set_strict_page_size(bool strict_page_size);
 
 }  // namespace common
 
diff --git a/cpp/src/common/container/bit_map.cc b/cpp/src/common/container/bit_map.cc
index 407605e56..3b1af6ab2 100644
--- a/cpp/src/common/container/bit_map.cc
+++ b/cpp/src/common/container/bit_map.cc
@@ -31,14 +31,15 @@ BitMap::~BitMap() {
     }
 }
 
-int BitMap::init(uint32_t item_size, bool init_as_zero) {
+int BitMap::init(uint32_t item_size, bool init_as_zero, AllocModID mod_id) {
     uint32_t size = (item_size + 7) / 8;
-    bitmap_ = static_cast<char*>(mem_alloc(size, MOD_TSBLOCK));
+    bitmap_ = static_cast<char*>(mem_alloc(size, mod_id));
     // need set to 0, otherwise there will be wrong data
     const char initial_char = init_as_zero ? 0x00 : 0xFF;
     memset(bitmap_, initial_char, size);
     size_ = size;
     init_as_zero_ = init_as_zero;
+    has_set_bits_ = !init_as_zero;
     return common::E_OK;
 }
 
diff --git a/cpp/src/common/container/bit_map.h b/cpp/src/common/container/bit_map.h
index 757ab1fb1..90ed0e0b6 100644
--- a/cpp/src/common/container/bit_map.h
+++ b/cpp/src/common/container/bit_map.h
@@ -25,16 +25,13 @@
 #include <intrin.h>
 #endif
 
+#include "common/allocator/alloc_base.h"
 #include "utils/errno_define.h"
 #include "utils/util_define.h"
 
 namespace common {
 
-// Cross-platform bit-twiddling helpers. GCC/Clang use their builtins; MSVC
-// uses the equivalent intrinsics from <intrin.h>; any other compiler falls
-// back to a portable loop.
 namespace bitops {
-// Population count of an 8-bit value.
 FORCE_INLINE int popcount8(uint8_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_popcount(v);
@@ -49,7 +46,7 @@ FORCE_INLINE int popcount8(uint8_t v) {
     return c;
 #endif
 }
-// Count trailing zero bits. The argument must be non-zero.
+
 FORCE_INLINE int ctz_nonzero(uint32_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_ctz(v);
@@ -66,23 +63,13 @@ FORCE_INLINE int ctz_nonzero(uint32_t v) {
     return c;
 #endif
 }
-// Count trailing zero bits of a 64-bit value. The argument must be non-zero.
-FORCE_INLINE int ctz64_nonzero(uint64_t v) {
+
+FORCE_INLINE int ctz_nonzero(uint64_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_ctzll(v);
 #elif defined(_MSC_VER)
     unsigned long idx;
-#if defined(_M_X64) || defined(_M_ARM64)
     _BitScanForward64(&idx, v);
-#else
-    // 32-bit MSVC has no _BitScanForward64.
-    if (static_cast<uint32_t>(v) != 0) {
-        _BitScanForward(&idx, static_cast<uint32_t>(v));
-    } else {
-        _BitScanForward(&idx, static_cast<uint32_t>(v >> 32));
-        idx += 32;
-    }
-#endif
     return static_cast<int>(idx);
 #else
     int c = 0;
@@ -97,13 +84,19 @@ FORCE_INLINE int ctz64_nonzero(uint64_t v) {
 
 class BitMap {
    public:
-    BitMap() : bitmap_(nullptr), size_(0), init_as_zero_(true) {}
+    BitMap()
+        : bitmap_(nullptr),
+          size_(0),
+          init_as_zero_(true),
+          has_set_bits_(false) {}
     ~BitMap();
-    int init(uint32_t item_size, bool init_as_zero = true);
+    int init(uint32_t item_size, bool init_as_zero = true,
+             AllocModID mod_id = MOD_TSBLOCK);
 
     FORCE_INLINE void reset() {
         const char initial_char = init_as_zero_ ? 0x00 : 0xFF;
         memset(bitmap_, initial_char, size_);
+        has_set_bits_ = !init_as_zero_;
     }
 
     FORCE_INLINE void set(uint32_t index) {
@@ -113,6 +106,7 @@ class BitMap {
         char* start_addr = bitmap_ + offset;
         uint8_t bit_mask = get_bit_mask(index);
         *start_addr = (*start_addr) | (bit_mask);
+        has_set_bits_ = true;
     }
 
     FORCE_INLINE void clear(uint32_t index) {
@@ -124,7 +118,26 @@ class BitMap {
         *start_addr = (*start_addr) & (~bit_mask);
     }
 
-    FORCE_INLINE void clear_all() { memset(bitmap_, 0x00, size_); }
+    FORCE_INLINE void clear_all() {
+        memset(bitmap_, 0x00, size_);
+        has_set_bits_ = false;
+    }
+
+    // Copy `bytes` of externally-owned bitmap data into this BitMap's buffer
+    // and keep has_set_bits_ in sync. Without this, callers that memcpy
+    // directly into get_bitmap() can leave the has_set_bits_ shortcut stale
+    // and downstream readers (may_have_set_bits()) will falsely treat the
+    // bitmap as empty.
+    FORCE_INLINE void copy_from(const char* src, uint32_t bytes) {
+        ASSERT(bytes <= size_);
+        memcpy(bitmap_, src, bytes);
+        // Conservative: assume the caller-provided bitmap can have set bits.
+        // We could scan to be precise, but the false-positive only costs a
+        // bit of per-cell testing in writers — never silent data loss.
+        if (bytes > 0) {
+            has_set_bits_ = true;
+        }
+    }
 
     FORCE_INLINE bool test(uint32_t index) {
         uint32_t offset = index >> 3;
@@ -135,7 +148,6 @@ class BitMap {
         return (*start_addr & bit_mask);
     }
 
-    // Count the number of bits set to 1 (i.e., number of null entries).
     FORCE_INLINE uint32_t count_set_bits() const {
         uint32_t count = 0;
         const uint8_t* p = reinterpret_cast<const uint8_t*>(bitmap_);
@@ -145,26 +157,21 @@ class BitMap {
         return count;
     }
 
-    // Find the next set bit (null position) at or after @from,
-    // within [0, total_bits). Returns total_bits if none found.
-    // Skips zero bytes in bulk so cost is proportional to the number
-    // of null bytes, not total rows.
     FORCE_INLINE uint32_t next_set_bit(uint32_t from,
                                        uint32_t total_bits) const {
         if (from >= total_bits) return total_bits;
         const uint8_t* p = reinterpret_cast<const uint8_t*>(bitmap_);
         uint32_t byte_idx = from >> 3;
-        // Check remaining bits in the first (partial) byte
         uint8_t byte_val = p[byte_idx] >> (from & 7);
         if (byte_val) {
-            return from + bitops::ctz_nonzero(byte_val);
+            return from + bitops::ctz_nonzero(static_cast<uint32_t>(byte_val));
         }
-        // Scan subsequent full bytes, skipping zeros
         const uint32_t byte_end = (total_bits + 7) >> 3;
         for (++byte_idx; byte_idx < byte_end; ++byte_idx) {
             if (p[byte_idx]) {
                 uint32_t pos =
-                    (byte_idx << 3) + bitops::ctz_nonzero(p[byte_idx]);
+                    (byte_idx << 3) +
+                    bitops::ctz_nonzero(static_cast<uint32_t>(p[byte_idx]));
                 return pos < total_bits ? pos : total_bits;
             }
         }
@@ -175,6 +182,10 @@ class BitMap {
 
     FORCE_INLINE char* get_bitmap() { return bitmap_; }
 
+    // Fast check: returns false only when guaranteed no bits are set.
+    // May return true even when no bits are actually set (conservative).
+    FORCE_INLINE bool may_have_set_bits() const { return has_set_bits_; }
+
    private:
     FORCE_INLINE uint8_t get_bit_mask(uint32_t index) {
         return 1 << (index & 7);
@@ -184,6 +195,7 @@ class BitMap {
     char* bitmap_;
     uint32_t size_;
     bool init_as_zero_;
+    bool has_set_bits_;
 };
 }  // namespace common
 
diff --git a/cpp/src/common/container/byte_buffer.h b/cpp/src/common/container/byte_buffer.h
index 88006dac6..4e2dfab15 100644
--- a/cpp/src/common/container/byte_buffer.h
+++ b/cpp/src/common/container/byte_buffer.h
@@ -107,11 +107,11 @@ class ByteBuffer {
 
     // for variable len value
     FORCE_INLINE char* read(uint32_t offset, uint32_t* len) {
+        ASSERT(offset + variable_type_len_ <= real_data_size_);
         uint32_t tmp;
-        // Directly memcpy to avoid potential alignment issues when casting
-        // int32_t array pointer
         std::memcpy(&tmp, data_ + offset, sizeof(tmp));
         *len = tmp;
+        ASSERT(offset + variable_type_len_ + *len <= real_data_size_);
         char* p = &data_[offset + variable_type_len_];
         return p;
     }
@@ -128,4 +128,4 @@ class ByteBuffer {
 };
 
 }  // namespace common
-#endif  // COMMON_CONTAINER_BYTE_BUFFER_H
\ No newline at end of file
+#endif  // COMMON_CONTAINER_BYTE_BUFFER_H
diff --git a/cpp/src/common/device_id.cc b/cpp/src/common/device_id.cc
index b35a8593f..e88cdac8a 100644
--- a/cpp/src/common/device_id.cc
+++ b/cpp/src/common/device_id.cc
@@ -144,7 +144,7 @@ int StringArrayDeviceID::deserialize(common::ByteStream& read_stream) {
 
     segments_.clear();
     for (uint32_t i = 0; i < num_segments; ++i) {
-        std::string* segment;
+        std::string* segment = nullptr;
         if (RET_FAIL(common::SerializationUtil::read_var_char_ptr(
                 segment, read_stream))) {
             delete segment;
diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc
index b49b55657..cc6c5117f 100644
--- a/cpp/src/common/global.cc
+++ b/cpp/src/common/global.cc
@@ -19,31 +19,31 @@
 
 #include "global.h"
 
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
+
 #ifndef _WIN32
 #include <execinfo.h>
+#include <strings.h>  // strncasecmp
 #endif
 #include <stdlib.h>
+#include <string.h>  // strlen
 
-#include <thread>
-
-#ifdef ENABLE_THREADS
-#include "common/thread_pool.h"
-#endif
 #include "utils/injection.h"
-#include "utils/util_define.h"  // strncasecmp and other platform-compat shims
+#include "utils/util_define.h"  // strncasecmp -> _strnicmp shim on Windows
 
 namespace common {
 
 ColumnSchema g_time_column_schema;
+ConfigValue g_config_value_;
 #ifdef ENABLE_THREADS
-ThreadPool* g_write_thread_pool_ = nullptr;
+ThreadPool* g_thread_pool_ = nullptr;
 #endif
-ConfigValue g_config_value_;
 
 void init_config_value() {
-    g_config_value_.tsblock_mem_inc_step_size_ = 8000;  // 8k
-    g_config_value_.tsblock_max_memory_ = 64000;        // 64k
-    // g_config_value_.tsblock_max_memory_ = 32;
+    g_config_value_.tsblock_mem_inc_step_size_ = 8000;      // 8k
+    g_config_value_.tsblock_max_memory_ = 2 * 1024 * 1024;  // 2 MB
     g_config_value_.page_writer_max_point_num_ = 10000;
     g_config_value_.page_writer_max_memory_bytes_ = 128 * 1024;  // 128 k
     g_config_value_.max_degree_of_index_node_ = 256;
@@ -64,19 +64,21 @@ void init_config_value() {
     g_config_value_.float_encoding_type_ = GORILLA;
     g_config_value_.double_encoding_type_ = GORILLA;
     g_config_value_.string_encoding_type_ = PLAIN;
-    // Default compression type is LZ4
-#ifdef ENABLE_LZ4
+    // Pick the strongest compressor that was actually compiled in. Gating on
+    // ENABLE_LZ4 while setting SNAPPY (the original code) would request a
+    // compressor that the factory can't produce when the build disables
+    // Snappy, returning nullptr at write time.
+#ifdef ENABLE_SNAPPY
+    g_config_value_.default_compression_type_ = SNAPPY;
+#elif defined(ENABLE_LZ4)
     g_config_value_.default_compression_type_ = LZ4;
 #else
     g_config_value_.default_compression_type_ = UNCOMPRESSED;
 #endif
-    unsigned int hw_cores = std::thread::hardware_concurrency();
-    if (hw_cores == 0) hw_cores = 1;  // fallback if detection fails
-    g_config_value_.parallel_write_enabled_ = (hw_cores > 1);
-    g_config_value_.write_thread_count_ =
-        static_cast<int32_t>(std::min(hw_cores, 64u));
-    // Enforce aligned page size limits strictly by default.
-    g_config_value_.strict_page_size_ = true;
+    g_config_value_.parallel_read_enabled_ = true;
+    g_config_value_.parallel_write_enabled_ = true;
+    // thread_count_ keeps its in-class default (see config.h) so a
+    // set_thread_count() before libtsfile_init() is not reset here.
 }
 
 extern TSEncoding get_value_encoder(TSDataType data_type) {
@@ -113,16 +115,20 @@ extern CompressionType get_default_compressor() {
     return g_config_value_.default_compression_type_;
 }
 
-void config_set_page_max_point_count(uint32_t page_max_point_count) {
+int config_set_page_max_point_count(uint32_t page_max_point_count) {
+    if (page_max_point_count == 0) {
+        return E_INVALID_ARG;
+    }
     g_config_value_.page_writer_max_point_num_ = page_max_point_count;
+    return E_OK;
 }
 
-void config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+int config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+    if (max_degree_of_index_node < 2u) {
+        return E_INVALID_ARG;
+    }
     g_config_value_.max_degree_of_index_node_ = max_degree_of_index_node;
-}
-
-void config_set_strict_page_size(bool strict_page_size) {
-    g_config_value_.strict_page_size_ = strict_page_size;
+    return E_OK;
 }
 
 void set_config_value() {}
@@ -145,17 +151,35 @@ int init_common() {
     g_time_column_schema.compression_ = UNCOMPRESSED;
     g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME;
 #ifdef ENABLE_THREADS
-    // (Re)create the global write thread pool with the configured size.
-    delete g_write_thread_pool_;
-    size_t pool_size =
-        g_config_value_.write_thread_count_ > 0
-            ? static_cast<size_t>(g_config_value_.write_thread_count_)
-            : size_t{1};
-    g_write_thread_pool_ = new ThreadPool(pool_size);
+    // (Re)create the single global worker pool with the configured size.  All
+    // parallel write/read paths submit here; torn down in libtsfile_destroy().
+    delete g_thread_pool_;
+    size_t pool_size = g_config_value_.thread_count_ > 0
+                           ? static_cast<size_t>(g_config_value_.thread_count_)
+                           : size_t{1};
+    g_thread_pool_ = new ThreadPool(pool_size);
 #endif
     return ret;
 }
 
+int set_thread_count(int32_t count) {
+    if (count < 1 || count > 64) return E_INVALID_ARG;
+    g_config_value_.thread_count_ = count;
+#ifdef ENABLE_THREADS
+    // If the global pool already exists (libtsfile_init has run) rebuild it at
+    // the new size so the change takes effect immediately instead of only at
+    // the next libtsfile_init().  This joins all current workers and recreates
+    // them, so the caller must ensure no read/write is concurrently using the
+    // pool — intended for setup / benchmark reconfiguration, not mid-operation
+    // resizing.
+    if (g_thread_pool_ != nullptr) {
+        delete g_thread_pool_;
+        g_thread_pool_ = new ThreadPool(static_cast<size_t>(count));
+    }
+#endif
+    return E_OK;
+}
+
 bool is_timestamp_column_name(const char* time_col_name) {
     // both "time" and "timestamp" refer to timestamp column.
     int32_t len = strlen(time_col_name);
diff --git a/cpp/src/common/global.h b/cpp/src/common/global.h
index 5bee0fa60..ae04c6afa 100644
--- a/cpp/src/common/global.h
+++ b/cpp/src/common/global.h
@@ -29,6 +29,15 @@ namespace common {
 extern TSFILE_API ConfigValue g_config_value_;
 extern TSFILE_API ColumnSchema g_time_column_schema;
 
+#ifdef ENABLE_THREADS
+class ThreadPool;
+// The single process-wide worker pool shared by every parallel code path
+// (write column encoding, read column decoding).  Created in init_common()
+// and torn down in libtsfile_destroy(); null until libtsfile_init() runs, so
+// every caller must fall back to the serial path when it is null.
+extern TSFILE_API ThreadPool* g_thread_pool_;
+#endif
+
 FORCE_INLINE int set_global_time_data_type(uint8_t data_type) {
     ASSERT(data_type >= BOOLEAN && data_type <= STRING);
     if (data_type != INT64) {
@@ -163,29 +172,28 @@ FORCE_INLINE uint8_t get_global_compression() {
     return static_cast<uint8_t>(g_config_value_.default_compression_type_);
 }
 
+FORCE_INLINE void set_parallel_read_enabled(bool enabled) {
+    g_config_value_.parallel_read_enabled_ = enabled;
+}
+
+FORCE_INLINE bool get_parallel_read_enabled() {
+    return g_config_value_.parallel_read_enabled_;
+}
+
 FORCE_INLINE void set_parallel_write_enabled(bool enabled) {
     g_config_value_.parallel_write_enabled_ = enabled;
 }
 
 FORCE_INLINE bool get_parallel_write_enabled() {
-    return g_config_value_.parallel_write_enabled_ &&
-           g_config_value_.write_thread_count_ > 1;
-}
-
-// Set the number of threads for parallel writes.  Must be called before
-// init_common() / libtsfile_init() — the global thread pool is created
-// during initialization and is not resized at runtime.
-FORCE_INLINE int set_write_thread_count(int32_t count) {
-    if (count < 1 || count > 64) return E_INVALID_ARG;
-    g_config_value_.write_thread_count_ = count;
-    return E_OK;
+    return g_config_value_.parallel_write_enabled_;
 }
 
-#ifdef ENABLE_THREADS
-class ThreadPool;
-// Global write thread pool, created by init_common().
-extern ThreadPool* g_write_thread_pool_;
-#endif
+// Size of the single global worker pool.  Rejects values outside [1, 64] with
+// E_INVALID_ARG, leaving the field untouched.  If the pool already exists
+// (libtsfile_init has run) it is rebuilt at the new size immediately; the
+// caller must ensure no read/write is concurrently using the pool.  Defined in
+// global.cc (needs the full ThreadPool type).
+extern int set_thread_count(int32_t count);
 
 extern int init_common();
 extern bool is_timestamp_column_name(const char* time_col_name);
diff --git a/cpp/src/common/mutex/CMakeLists.txt b/cpp/src/common/mutex/CMakeLists.txt
deleted file mode 100644
index e7ef66faa..000000000
--- a/cpp/src/common/mutex/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#[[
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-    https://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License.
-]]
-
-
diff --git a/cpp/src/common/mutex/mutex.h b/cpp/src/common/mutex/mutex.h
deleted file mode 100644
index b35d328de..000000000
--- a/cpp/src/common/mutex/mutex.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef COMMON_MUTEX_MUTEX_H
-#define COMMON_MUTEX_MUTEX_H
-
-#include <mutex>
-
-#include "utils/util_define.h"
-
-namespace common {
-
-// Thin wrapper over std::mutex. Implemented with the C++11 standard library
-// (instead of pthreads directly) so it builds on every platform, including
-// MSVC where pthreads is not available.
-class Mutex {
-   public:
-    Mutex() {}
-    ~Mutex() {}
-
-    void lock() { mutex_.lock(); }
-
-    void unlock() { mutex_.unlock(); }
-
-    bool try_lock() { return mutex_.try_lock(); }
-
-   private:
-    std::mutex mutex_;
-};
-
-class MutexGuard {
-   public:
-    MutexGuard(Mutex& m) : m_(m) { m_.lock(); }
-    ~MutexGuard() { m_.unlock(); }
-
-   private:
-    Mutex& m_;
-};
-
-}  // end namespace common
-#endif  // COMMON_MUTEX_MUTEX_H
diff --git a/cpp/src/common/path.cc b/cpp/src/common/path.cc
deleted file mode 100644
index d70a9d6c6..000000000
--- a/cpp/src/common/path.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "common/path.h"
-
-#include "common/constant/tsfile_constant.h"
-
-#ifdef ENABLE_ANTLR4
-#include "parser/path_nodes_generator.h"
-#endif
-
-namespace storage {
-
-Path::Path() = default;
-
-Path::Path(std::string& device, std::string& measurement)
-    : measurement_(measurement),
-      device_id_(std::make_shared<StringArrayDeviceID>(device)) {
-    full_path_ = device + "." + measurement;
-}
-
-Path::Path(const std::string& path_sc, bool if_split) {
-    if (!path_sc.empty()) {
-        if (!if_split) {
-            full_path_ = path_sc;
-            device_id_ = std::make_shared<StringArrayDeviceID>(path_sc);
-        } else {
-#ifdef ENABLE_ANTLR4
-            std::vector<std::string> nodes =
-                PathNodesGenerator::invokeParser(path_sc);
-#else
-            std::vector<std::string> nodes =
-                IDeviceID::split_string(path_sc, '.');
-#endif
-            if (nodes.size() > 1) {
-                // Join nodes, then parse like write path / Java Path (not
-                // per-segment vector).
-                std::string device_joined;
-                for (size_t i = 0; i + 1 < nodes.size(); ++i) {
-                    if (i > 0) {
-                        device_joined += PATH_SEPARATOR_CHAR;
-                    }
-                    device_joined += nodes[i];
-                }
-                device_id_ =
-                    std::make_shared<StringArrayDeviceID>(device_joined);
-                measurement_ = nodes[nodes.size() - 1];
-                full_path_ = device_id_->get_device_name() + "." + measurement_;
-            } else {
-                full_path_ = path_sc;
-                device_id_ = std::make_shared<StringArrayDeviceID>();
-                measurement_ = path_sc;
-            }
-        }
-    } else {
-        full_path_ = "";
-        device_id_ = std::make_shared<StringArrayDeviceID>();
-        measurement_ = "";
-    }
-}
-
-}  // namespace storage
diff --git a/cpp/src/common/path.h b/cpp/src/common/path.h
index 3896b2715..c176d93db 100644
--- a/cpp/src/common/path.h
+++ b/cpp/src/common/path.h
@@ -21,7 +21,12 @@
 
 #include <string>
 
+#include "common/constant/tsfile_constant.h"
 #include "common/device_id.h"
+#ifdef ENABLE_ANTLR4
+#include "parser/generated/PathParser.h"
+#include "parser/path_nodes_generator.h"
+#endif
 #include "utils/errno_define.h"
 
 namespace storage {
@@ -31,9 +36,57 @@ struct Path {
     std::shared_ptr<IDeviceID> device_id_;
     std::string full_path_;
 
-    Path();
-    Path(std::string& device, std::string& measurement);
-    Path(const std::string& path_sc, bool if_split = true);
+    Path() {}
+
+    Path(std::string& device, std::string& measurement)
+        : measurement_(measurement),
+          device_id_(std::make_shared<StringArrayDeviceID>(device)) {
+        full_path_ = device + "." + measurement;
+    }
+
+    Path(const std::string& path_sc, bool if_split = true) {
+        if (!path_sc.empty()) {
+            if (!if_split) {
+                full_path_ = path_sc;
+                device_id_ = std::make_shared<StringArrayDeviceID>(path_sc);
+            } else {
+#ifdef ENABLE_ANTLR4
+                std::vector<std::string> nodes =
+                    PathNodesGenerator::invokeParser(path_sc);
+#else
+                std::vector<std::string> nodes =
+                    IDeviceID::split_string(path_sc, '.');
+#endif
+                if (nodes.size() > 1) {
+                    // Join nodes, then parse like write path / Java Path
+                    // (route through the interpretive string ctor instead of
+                    // the literal per-segment vector ctor, so a stored
+                    // "root.sg.d1" device matches a query path
+                    // "root.sg.d1.s1").
+                    std::string device_joined;
+                    for (size_t i = 0; i + 1 < nodes.size(); ++i) {
+                        if (i > 0) {
+                            device_joined += PATH_SEPARATOR_CHAR;
+                        }
+                        device_joined += nodes[i];
+                    }
+                    device_id_ =
+                        std::make_shared<StringArrayDeviceID>(device_joined);
+                    measurement_ = nodes[nodes.size() - 1];
+                    full_path_ =
+                        device_id_->get_device_name() + "." + measurement_;
+                } else {
+                    full_path_ = path_sc;
+                    device_id_ = std::make_shared<StringArrayDeviceID>();
+                    measurement_ = path_sc;
+                }
+            }
+        } else {
+            full_path_ = "";
+            device_id_ = std::make_shared<StringArrayDeviceID>();
+            measurement_ = "";
+        }
+    }
 
     bool operator==(const Path& path) {
         if (measurement_.compare(path.measurement_) == 0 &&
diff --git a/cpp/src/common/seq_tvlist.h b/cpp/src/common/seq_tvlist.h
deleted file mode 100644
index 24805ac5d..000000000
--- a/cpp/src/common/seq_tvlist.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef COMMON_SEQ_TVLIST_H
-#define COMMON_SEQ_TVLIST_H
-
-#include "common/allocator/alloc_base.h"
-#include "common/allocator/page_arena.h"
-#include "common/mutex/mutex.h"
-#include "utils/db_utils.h"
-#include "utils/errno_define.h"
-#include "utils/storage_utils.h"
-#include "utils/util_define.h"
-
-namespace storage {
-
-class SeqTVListBase {
-   public:
-    SeqTVListBase()
-        : data_type_(common::VECTOR),
-          mutex_(),
-          ref_count_(0),
-          primary_array_size_(0),
-          list_size_(0),
-          write_count_(0),
-          page_arena_(common::g_base_allocator),
-          use_page_arena_(false),
-          is_immutable_(false) {}
-    virtual ~SeqTVListBase() {}
-    virtual void destroy() {}
-
-    FORCE_INLINE void ref() { ATOMIC_AAF(&ref_count_, 1); }
-    FORCE_INLINE bool unref() { return 0 == ATOMIC_AAF(&ref_count_, -1); }
-
-    FORCE_INLINE void lock() { mutex_.lock(); }
-    FORCE_INLINE void unlock() { mutex_.unlock(); }
-
-    int32_t get_total_count() const { return write_count_; }
-    common::TSDataType get_data_type() const { return data_type_; }
-    virtual TimeRange get_time_range() const = 0;
-    void mark_immutable() { is_immutable_ = true; }
-    bool is_immutable() const { return is_immutable_; }
-
-   protected:
-    common::TSDataType data_type_;
-    mutable common::Mutex mutex_;
-    int32_t ref_count_;
-    int32_t primary_array_size_;
-    int32_t list_size_;
-    int32_t write_count_;
-    common::PageArena page_arena_;
-    bool use_page_arena_;
-    bool is_immutable_;
-};
-
-template <typename Type>
-class SeqTVList : public SeqTVListBase {
-   public:
-    typedef struct TV {
-        int64_t time_;
-        Type value_;
-    } TV;
-
-    struct Iterator {
-        SeqTVList* host_list_;
-        int32_t read_idx_;
-        int32_t end_idx_;
-
-        Iterator() : host_list_(nullptr), read_idx_(UINT32_MAX), end_idx_(0) {}
-
-        INLINE void init(SeqTVList* host, int32_t start_idx, int32_t end_idx) {
-            host_list_ = host;
-            read_idx_ = start_idx;
-            end_idx_ = end_idx;
-        }
-
-        int next(TV& tv) {
-            if (read_idx_ >= end_idx_) {
-                return common::E_NO_MORE_DATA;
-            }
-            tv = host_list_->at(read_idx_);
-            read_idx_++;
-            return common::E_OK;
-        }
-    };
-
-   public:
-    SeqTVList() : tv_array_list_(nullptr), last_time_(-1) {
-        data_type_ = common::GetDataTypeFromTemplateType<Type>();
-    }
-    virtual ~SeqTVList() {}
-
-    int init(int32_t primary_array_size, int32_t max_count,
-             bool use_page_arena);
-    void destroy() OVERRIDE;
-
-    int push(int64_t time, Type value);
-    int push_without_lock(int64_t time, Type value);
-    Iterator scan_without_lock(int64_t start_time, int64_t end_time);
-    Iterator scan_without_lock();
-
-    TimeRange get_time_range() const OVERRIDE {
-        TimeRange time_range;
-        common::MutexGuard mg(mutex_);
-        if (write_count_ > 0) {
-            time_range.start_time_ = time_at(0);
-            time_range.end_time_ = time_at(write_count_ - 1);
-            ASSERT(time_range.start_time_ <= time_range.end_time_);
-        }
-        return time_range;
-    }
-
-    FORCE_INLINE TV at(int32_t tv_idx) const {
-        ASSERT(tv_idx < write_count_);
-        int32_t list_idx = tv_idx / primary_array_size_;
-        int32_t list_offset = tv_idx % primary_array_size_;
-        return tv_array_list_[list_idx][list_offset];
-    }
-
-    FORCE_INLINE int64_t time_at(int32_t tv_idx) const {
-        return at(tv_idx).time_;
-    }
-
-#ifdef ENABLE_TEST
-    int32_t TEST_binary_search_upper(int64_t time) {
-        return binary_search_upper(time);
-    }
-    int32_t TEST_binary_search_lower(int64_t time) {
-        return binary_search_lower(time);
-    }
-#endif
-
-   private:
-    FORCE_INLINE void* alloc(uint32_t size) {
-        if (use_page_arena_) {
-            return page_arena_.alloc(size);
-        } else {
-            return common::mem_alloc(size, common::MOD_TVLIST_DATA);
-        }
-    }
-
-    // return the first tv which is larger or equal to @time
-    int32_t binary_search_upper(int64_t time);
-    // return the last tv which is less or equal to @time
-    int32_t binary_search_lower(int64_t time);
-
-   private:
-    TV** tv_array_list_;
-    int64_t last_time_;
-};
-
-}  // namespace storage
-
-#include "seq_tvlist.inc"
-
-#endif  // COMMON_SEQ_TVLIST_H
diff --git a/cpp/src/common/seq_tvlist.inc b/cpp/src/common/seq_tvlist.inc
deleted file mode 100644
index c25e49f45..000000000
--- a/cpp/src/common/seq_tvlist.inc
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-// #include "seq_tvlist.h"
-#include <stdio.h>
-#include <string.h>
-#include <iostream>
-#include "common/mutex/mutex.h"
-#include "common/logger/elog.h"
-
-
-namespace storage
-{
-
-template<typename Type>
-int SeqTVList<Type>::init(int32_t primary_array_size,
-                          int32_t max_count,
-                          bool use_page_arena)
-{
-  if (primary_array_size > max_count) {
-    //common:://log_err("TVList init error, primary_array_size=%u, max_count=%u", primary_array_size, max_count);
-    return common::E_INVALID_ARG;
-  }
-  use_page_arena_ = use_page_arena;
-
-  primary_array_size_ = primary_array_size;
-  list_size_ = (max_count / primary_array_size_) +
-               (max_count % primary_array_size_ == 0 ? 0 : 1);
-
-  int32_t alloc_size = sizeof(TV) * list_size_;
-  tv_array_list_ = (TV**)alloc(alloc_size);
-  if (tv_array_list_ == nullptr) {
-    return common::E_OOM;
-  }
-  memset(tv_array_list_, 0, alloc_size);
-  write_count_ = 0;
-  if (use_page_arena_) {
-    // TODO make it configurable
-    page_arena_.init(sizeof(TV) * primary_array_size_ * 4, common::MOD_TVLIST_OBJ);
-  }
-  return common::E_OK;
-}
-
-template<typename Type>
-int SeqTVList<Type>::push(int64_t time, Type value)
-{
-  common::MutexGuard mg(mutex_);
-  return push_without_lock(time, value);
-};
-
-template<typename Type>
-int SeqTVList<Type>::push_without_lock(int64_t time, Type value)
-{
-  if (UNLIKELY(time <= last_time_)) {
-    return common::E_OUT_OF_ORDER;
-  }
-  if (UNLIKELY(write_count_ >= list_size_ * primary_array_size_)) {
-    return common::E_OVERFLOW;
-  }
-
-  int32_t list_idx = write_count_ / primary_array_size_;
-  int32_t list_offset = write_count_ % primary_array_size_;
-  if (UNLIKELY(list_offset == 0)) {
-    ASSERT(tv_array_list_[list_idx] == nullptr);
-    tv_array_list_[list_idx] = static_cast<TV*>(alloc(sizeof(TV) * primary_array_size_));
-    if (UNLIKELY(tv_array_list_[list_idx] == nullptr)) {
-      return common::E_OOM;
-    }
-  }
-
-  TV insert_tv;
-  insert_tv.time_ = time;
-  insert_tv.value_ = value;
-#if STORAGE_ENGINE_DEBUG
-  std::cout << "tvlist[" << list_idx << "][" << list_offset << "] = (" << time << ", " << value << ")" << std::endl;
-#endif
-  tv_array_list_[list_idx][list_offset] = insert_tv;
-  write_count_++;
-  last_time_ = time;
-  return common::E_OK;
-};
-
-template<typename Type>
-void SeqTVList<Type>::destroy()
-{
-  if (use_page_arena_) {
-    page_arena_.destroy();
-  } else {
-    int32_t list_size = write_count_ / primary_array_size_
-                        + (write_count_ % primary_array_size_ == 0 ? 0 : 1);
-    for (int i = 0; i < list_size; i++) {
-      common::mem_free(tv_array_list_[i]);
-    }
-    common::mem_free(tv_array_list_);
-  }
-}
-
-template<typename Type>
-typename SeqTVList<Type>::Iterator SeqTVList<Type>::scan_without_lock(int64_t start_time, int64_t end_time)
-{
-  ASSERT(start_time < end_time);
-  int32_t start_idx = binary_search_lower(start_time);
-  int32_t end_idx = binary_search_upper(end_time);
-  ASSERT(start_idx <= end_time + 1);
-  SeqTVList::Iterator iter;
-  iter.init(this, start_idx, end_idx);
-  return iter;
-}
-
-template<typename Type>
-typename SeqTVList<Type>::Iterator SeqTVList<Type>::scan_without_lock()
-{
-  SeqTVList::Iterator iter;
-  iter.init(this, 0, write_count_);
-  return iter;
-}
-
-// return the first tv which is larger or equal to @time
-template<typename Type>
-int32_t SeqTVList<Type>::binary_search_lower(int64_t time)
-{
-  int32_t start = -1;
-  int32_t end = write_count_;
-
-  // arr[start] < time <= arr[end]
-  while (start + 1 != end) {
-    int mid = (start + end) / 2;
-    int64_t mid_time = time_at(mid);
-    if (mid_time < time) {
-      start = mid;
-    } else {
-      end = mid;
-    }
-  }
-  return end;
-}
-
-// return the last tv which is less or equal to @time
-template<typename Type>
-int32_t SeqTVList<Type>::binary_search_upper(int64_t time)
-{
-  int32_t start = 0;
-  int32_t end = write_count_;
-
-  // arr[start] <= time < arr[end]
-  while (start + 1 != end) {
-    int mid = (start + end) / 2;
-    int64_t mid_time = time_at(mid);
-    if (mid_time <= time) {
-      start = mid;
-    } else {
-      end = mid;
-    }
-  }
-  return start;
-}
-
-} // namespace storage
-
diff --git a/cpp/src/common/statistic.h b/cpp/src/common/statistic.h
index bced66173..3d45b4f43 100644
--- a/cpp/src/common/statistic.h
+++ b/cpp/src/common/statistic.h
@@ -22,12 +22,18 @@
 
 #include <inttypes.h>
 
+#include <algorithm>
 #include <sstream>
 
 #include "common/allocator/alloc_base.h"
 #include "common/allocator/byte_stream.h"
 #include "common/db_common.h"
 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define TSFILE_HAS_NEON 1
+#endif
+
 namespace storage {
 
 /*
@@ -176,6 +182,48 @@ class Statistic {
     }
     virtual FORCE_INLINE void update(int64_t time) { ASSERT(false); }
 
+    virtual void update_time_batch(const int64_t* timestamps, uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const bool* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const int32_t* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const int64_t* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const float* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const double* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps,
+                              const common::String* values, uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+
     virtual int serialize_to(common::ByteStream& out) {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_var_uint(count_, out))) {
@@ -554,17 +602,17 @@ class BooleanStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         first_value_ = false;
         last_value_ = false;
     }
 
-    FORCE_INLINE void update(int64_t time, bool value) {
+    FORCE_INLINE void update(int64_t time, bool value) override {
         BOOL_STAT_UPDATE(time, value);
     }
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui8(first_value_ ? 1 : 0,
                                                           out))) {
@@ -575,7 +623,7 @@ class BooleanStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui8((uint8_t&)first_value_,
                                                          in))) {
@@ -587,13 +635,15 @@ class BooleanStatistic : public Statistic {
         return ret;
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::BOOLEAN; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::BOOLEAN;
+    }
 
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_BOOL_STAT_FROM(BooleanStatistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_BOOL_STAT_FROM(BooleanStatistic, stat);
     }
 };
@@ -625,7 +675,7 @@ class Int32Statistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -634,13 +684,41 @@ class Int32Statistic : public Statistic {
         last_value_ = 0;
     }
 
-    FORCE_INLINE void update(int64_t time, int32_t value) {
+    FORCE_INLINE void update(int64_t time, int32_t value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::INT32; }
+    void update_batch(const int64_t* timestamps, const int32_t* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = (int64_t)values[0];
+            count_ = 1;
+            start = 1;
+        }
+        for (uint32_t i = start; i < count; i++) {
+            if (timestamps[i] < start_time_) start_time_ = timestamps[i];
+            if (timestamps[i] > end_time_) end_time_ = timestamps[i];
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += (int64_t)values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::INT32;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui32(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_ui32(max_value_,
@@ -654,7 +732,7 @@ class Int32Statistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui32((uint32_t&)min_value_,
                                                           in))) {
@@ -676,15 +754,15 @@ class Int32Statistic : public Statistic {
         //           << std::endl;
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(Int32Statistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(Int32Statistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << ", first_val=" << first_value_
@@ -696,7 +774,7 @@ class Int32Statistic : public Statistic {
 };
 
 class DateStatistic : public Int32Statistic {
-    FORCE_INLINE common::TSDataType get_type() { return common::DATE; }
+    FORCE_INLINE common::TSDataType get_type() override { return common::DATE; }
 };
 
 class Int64Statistic : public Statistic {
@@ -726,7 +804,7 @@ class Int64Statistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -734,13 +812,69 @@ class Int64Statistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, int64_t value) {
+    FORCE_INLINE void update(int64_t time, int64_t value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::INT64; }
+    void update_batch(const int64_t* timestamps, const int64_t* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = (double)values[0];
+            count_ = 1;
+            start = 1;
+        }
+        // Timestamps are monotonic (verified by TimePageWriter),
+        // so only first/last matter for start_time_/end_time_.
+        if (count > start) {
+            if (timestamps[start] < start_time_)
+                start_time_ = timestamps[start];
+            if (timestamps[count - 1] > end_time_)
+                end_time_ = timestamps[count - 1];
+        }
+        uint32_t i = start;
+#if TSFILE_HAS_NEON
+        {
+            int64x2_t vmin = vdupq_n_s64(min_value_);
+            int64x2_t vmax = vdupq_n_s64(max_value_);
+            float64x2_t vsum = vdupq_n_f64(0.0);
+            for (; i + 2 <= count; i += 2) {
+                int64x2_t v = vld1q_s64(&values[i]);
+                // min/max via compare+select (no vminq_s64 in NEON)
+                uint64x2_t lt = vcltq_s64(v, vmin);
+                vmin = vbslq_s64(lt, v, vmin);
+                uint64x2_t gt = vcgtq_s64(v, vmax);
+                vmax = vbslq_s64(gt, v, vmax);
+                vsum = vaddq_f64(vsum, vcvtq_f64_s64(v));
+            }
+            min_value_ =
+                std::min(vgetq_lane_s64(vmin, 0), vgetq_lane_s64(vmin, 1));
+            max_value_ =
+                std::max(vgetq_lane_s64(vmax, 0), vgetq_lane_s64(vmax, 1));
+            sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
+        }
+#endif
+        for (; i < count; i++) {
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += (double)values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::INT64;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui64(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_ui64(max_value_,
@@ -754,7 +888,7 @@ class Int64Statistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui64((uint64_t&)min_value_,
                                                           in))) {
@@ -769,15 +903,15 @@ class Int64Statistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(Int64Statistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(Int64Statistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << ", first_val=" << first_value_
@@ -815,7 +949,7 @@ class FloatStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -823,13 +957,15 @@ class FloatStatistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, float value) {
+    FORCE_INLINE void update(int64_t time, float value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::FLOAT; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::FLOAT;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_float(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_float(max_value_,
@@ -843,7 +979,7 @@ class FloatStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_float(min_value_, in))) {
         } else if (RET_FAIL(
@@ -857,10 +993,10 @@ class FloatStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(FloatStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(FloatStatistic, stat);
     }
 };
@@ -892,7 +1028,7 @@ class DoubleStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -900,13 +1036,64 @@ class DoubleStatistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, double value) {
+    FORCE_INLINE void update(int64_t time, double value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::DOUBLE; }
+    void update_batch(const int64_t* timestamps, const double* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = values[0];
+            count_ = 1;
+            start = 1;
+        }
+        if (count > start) {
+            if (timestamps[start] < start_time_)
+                start_time_ = timestamps[start];
+            if (timestamps[count - 1] > end_time_)
+                end_time_ = timestamps[count - 1];
+        }
+        uint32_t i = start;
+#if TSFILE_HAS_NEON
+        {
+            float64x2_t vmin = vdupq_n_f64(min_value_);
+            float64x2_t vmax = vdupq_n_f64(max_value_);
+            float64x2_t vsum = vdupq_n_f64(0.0);
+            for (; i + 2 <= count; i += 2) {
+                float64x2_t v = vld1q_f64(&values[i]);
+                vmin = vminq_f64(vmin, v);
+                vmax = vmaxq_f64(vmax, v);
+                vsum = vaddq_f64(vsum, v);
+            }
+            min_value_ =
+                std::min(vgetq_lane_f64(vmin, 0), vgetq_lane_f64(vmin, 1));
+            max_value_ =
+                std::max(vgetq_lane_f64(vmax, 0), vgetq_lane_f64(vmax, 1));
+            sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
+        }
+#endif
+        for (; i < count; i++) {
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::DOUBLE;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::write_double(min_value_, out))) {
@@ -921,7 +1108,7 @@ class DoubleStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_double(min_value_, in))) {
         } else if (RET_FAIL(common::SerializationUtil::read_double(max_value_,
@@ -935,10 +1122,10 @@ class DoubleStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(DoubleStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(DoubleStatistic, stat);
     }
 };
@@ -960,30 +1147,50 @@ class TimeStatistic : public Statistic {
         end_time_ = that.end_time_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
     }
 
-    FORCE_INLINE void update(int64_t time) {
+    FORCE_INLINE void update(int64_t time) override {
         TIME_STAT_UPDATE((time));
         count_++;
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::VECTOR; }
+    void update_time_batch(const int64_t* timestamps, uint32_t count) override {
+        if (count == 0) return;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+        }
+        // Timestamps are already verified monotonic in TimePageWriter,
+        // so first element is min candidate and last is max candidate.
+        if (timestamps[0] < start_time_) start_time_ = timestamps[0];
+        if (timestamps[count - 1] > end_time_)
+            end_time_ = timestamps[count - 1];
+        count_ += count;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) { return common::E_OK; }
-    int deserialize_typed_stat(common::ByteStream& in) { return common::E_OK; }
-    int merge_with(Statistic* stat) {
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::VECTOR;
+    }
+
+    int serialize_typed_stat(common::ByteStream& out) override {
+        return common::E_OK;
+    }
+    int deserialize_typed_stat(common::ByteStream& in) override {
+        return common::E_OK;
+    }
+    int merge_with(Statistic* stat) override {
         MERGE_TIME_STAT_FROM(TimeStatistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_TIME_STAT_FROM(TimeStatistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << "}";
@@ -992,7 +1199,9 @@ class TimeStatistic : public Statistic {
 };
 
 class TimestampStatistics : public Int64Statistic {
-    FORCE_INLINE common::TSDataType get_type() { return common::TIMESTAMP; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::TIMESTAMP;
+    }
 };
 
 class StringStatistic : public Statistic {
@@ -1002,35 +1211,24 @@ class StringStatistic : public Statistic {
     common::String first_value_;
     common::String last_value_;
     StringStatistic()
-        : min_value_(),
-          max_value_(),
-          first_value_(),
-          last_value_(),
-          pa_(nullptr),
-          owns_pa_(true) {
+        : min_value_(), max_value_(), first_value_(), last_value_() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
     StringStatistic(common::PageArena* pa)
-        : min_value_(),
-          max_value_(),
-          first_value_(),
-          last_value_(),
-          pa_(pa),
-          owns_pa_(false) {}
+        : min_value_(), max_value_(), first_value_(), last_value_(), pa_(pa) {}
 
     ~StringStatistic() { destroy(); }
 
-    void destroy() {
-        if (owns_pa_ && pa_) {
+    void destroy() override {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
@@ -1050,13 +1248,15 @@ class StringStatistic : public Statistic {
         last_value_.dup_from(that.last_value_, *pa_);
     }
 
-    FORCE_INLINE void update(int64_t time, common::String value) {
+    FORCE_INLINE void update(int64_t time, common::String value) override {
         STRING_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::STRING; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::STRING;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_,
@@ -1068,7 +1268,7 @@ class StringStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::read_str(first_value_, pa_, in))) {
@@ -1081,42 +1281,39 @@ class StringStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_STRING_STAT_FROM(StringStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_STRING_STAT_FROM(StringStatistic, stat);
     }
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 class TextStatistic : public Statistic {
    public:
     common::String first_value_;
     common::String last_value_;
-    TextStatistic()
-        : first_value_(), last_value_(), pa_(nullptr), owns_pa_(true) {
+    TextStatistic() : first_value_(), last_value_() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
     TextStatistic(common::PageArena* pa)
-        : first_value_(), last_value_(), pa_(pa), owns_pa_(false) {}
+        : first_value_(), last_value_(), pa_(pa) {}
 
     ~TextStatistic() { destroy(); }
 
-    void destroy() {
-        if (owns_pa_ && pa_) {
+    void destroy() override {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
@@ -1132,13 +1329,13 @@ class TextStatistic : public Statistic {
         last_value_.dup_from(that.last_value_, *pa_);
     }
 
-    FORCE_INLINE void update(int64_t time, common::String value) {
+    FORCE_INLINE void update(int64_t time, common::String value) override {
         TEXT_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::TEXT; }
+    FORCE_INLINE common::TSDataType get_type() override { return common::TEXT; }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_,
@@ -1146,7 +1343,7 @@ class TextStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::read_str(first_value_, pa_, in))) {
@@ -1155,35 +1352,33 @@ class TextStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_TEXT_STAT_FROM(TextStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_TEXT_STAT_FROM(TextStatistic, stat);
     }
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 class BlobStatistic : public Statistic {
    public:
-    BlobStatistic() : pa_(nullptr), owns_pa_(true) {
+    BlobStatistic() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
-    BlobStatistic(common::PageArena* pa) : pa_(pa), owns_pa_(false) {}
+    BlobStatistic(common::PageArena* pa) {}
 
     ~BlobStatistic() { destroy(); }
 
     void destroy() {
-        if (owns_pa_ && pa_) {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
     FORCE_INLINE void reset() {
@@ -1214,7 +1409,6 @@ class BlobStatistic : public Statistic {
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 FORCE_INLINE uint32_t get_typed_statistic_sizeof(common::TSDataType type) {
diff --git a/cpp/src/common/tablet.cc b/cpp/src/common/tablet.cc
index b9ae5301a..ba37a3245 100644
--- a/cpp/src/common/tablet.cc
+++ b/cpp/src/common/tablet.cc
@@ -20,8 +20,10 @@
 #include "tablet.h"
 
 #include <cstdlib>
+#include <limits>
 
 #include "allocator/alloc_base.h"
+#include "container/bit_map.h"
 #include "datatype/date_converter.h"
 #include "utils/errno_define.h"
 
@@ -98,14 +100,13 @@ int Tablet::init() {
             case BLOB:
             case TEXT:
             case STRING: {
-                auto* sc = static_cast<StringColumn*>(common::mem_alloc(
-                    sizeof(StringColumn), common::MOD_TABLET));
-                if (sc == nullptr) return E_OOM;
-                new (sc) StringColumn();
-                // 8 bytes/row is a conservative initial estimate for short
-                // string columns (e.g. device IDs, tags). The buffer grows
-                // automatically on demand via mem_realloc.
-                sc->init(max_row_num_, max_row_num_ * 8);
+                void* mem =
+                    common::mem_alloc(sizeof(StringColumn), common::MOD_TABLET);
+                if (mem == nullptr) {
+                    return E_OOM;
+                }
+                auto* sc = new (mem) StringColumn();
+                sc->init(max_row_num_, max_row_num_ * 32);
                 value_matrix_[c].string_col = sc;
                 break;
             }
@@ -120,8 +121,9 @@ int Tablet::init() {
     if (bitmaps_ == nullptr) return E_OOM;
     for (size_t c = 0; c < schema_count; c++) {
         new (&bitmaps_[c]) BitMap();
-        bitmaps_[c].init(max_row_num_, false);
+        bitmaps_[c].init(max_row_num_, false, common::MOD_TABLET);
     }
+
     return E_OK;
 }
 
@@ -156,6 +158,7 @@ void Tablet::destroy() {
                 case TEXT:
                 case STRING:
                     value_matrix_[c].string_col->destroy();
+                    value_matrix_[c].string_col->~StringColumn();
                     common::mem_free(value_matrix_[c].string_col);
                     break;
                 default:
@@ -192,9 +195,7 @@ int Tablet::add_timestamp(uint32_t row_index, int64_t timestamp) {
 }
 
 int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) {
-    if (err_code_ != E_OK) {
-        return err_code_;
-    }
+    if (err_code_ != E_OK) return err_code_;
     ASSERT(timestamps_ != NULL);
     if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_))) {
         return E_OUT_OF_RANGE;
@@ -206,15 +207,10 @@ int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) {
 
 int Tablet::set_column_values(uint32_t schema_index, const void* data,
                               const uint8_t* bitmap, uint32_t count) {
-    if (err_code_ != E_OK) {
-        return err_code_;
-    }
-    if (UNLIKELY(schema_index >= schema_vec_->size())) {
-        return E_OUT_OF_RANGE;
-    }
-    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_))) {
+    if (err_code_ != E_OK) return err_code_;
+    if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE;
+    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_)))
         return E_OUT_OF_RANGE;
-    }
 
     const MeasurementSchema& schema = schema_vec_->at(schema_index);
     size_t elem_size = 0;
@@ -250,9 +246,13 @@ int Tablet::set_column_values(uint32_t schema_index, const void* data,
     if (bitmap == nullptr) {
         bitmaps_[schema_index].clear_all();
     } else {
-        char* tsfile_bm = bitmaps_[schema_index].get_bitmap();
+        // copy_from also refreshes has_set_bits_; a plain memcpy into
+        // get_bitmap() would leave the flag stale (e.g. cleared by a prior
+        // clear_all()) and downstream may_have_set_bits() checks would skip
+        // null-mask handling for the column.
         uint32_t bm_bytes = (count + 7) / 8;
-        std::memcpy(tsfile_bm, bitmap, bm_bytes);
+        bitmaps_[schema_index].copy_from(reinterpret_cast<const char*>(bitmap),
+                                         bm_bytes);
     }
     cur_row_size_ = std::max(count, cur_row_size_);
     return E_OK;
@@ -271,15 +271,36 @@ int Tablet::set_column_string_values(uint32_t schema_index,
         return E_OUT_OF_RANGE;
     }
 
+    // Reject non-string types: the union member is StringColumn*, but for
+    // numeric columns the same slot holds the numeric buffer pointer.
+    // Interpreting it as StringColumn* and writing into ->buffer/->offsets
+    // would corrupt the numeric buffer.
+    const TSDataType dt = schema_vec_->at(schema_index).data_type_;
+    if (dt != STRING && dt != TEXT && dt != BLOB) {
+        return E_TYPE_NOT_MATCH;
+    }
     StringColumn* sc = value_matrix_[schema_index].string_col;
     if (sc == nullptr) {
         return E_INVALID_ARG;
     }
 
+    // offsets is the Arrow-style "offsets" array (count + 1 entries).  All
+    // downstream code assumes offsets[0] == 0, offsets are non-negative,
+    // and offsets[i] <= offsets[i+1].  Skipping these checks would let a
+    // caller pass e.g. {0, 10, 5} and trigger an unsigned underflow on
+    // (offsets[i+1] - offsets[i]) at serialize time, plus a wild memcpy.
+    if (UNLIKELY(offsets == nullptr)) return E_INVALID_ARG;
+    if (UNLIKELY(offsets[0] != 0)) return E_INVALID_ARG;
+    for (uint32_t i = 0; i < count; i++) {
+        if (UNLIKELY(offsets[i + 1] < offsets[i])) return E_INVALID_ARG;
+    }
+    if (UNLIKELY(offsets[count] < 0)) return E_INVALID_ARG;
     uint32_t total_bytes = static_cast<uint32_t>(offsets[count]);
     if (total_bytes > sc->buf_capacity) {
+        char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes);
+        if (UNLIKELY(new_buf == nullptr)) return E_OOM;
+        sc->buffer = new_buf;
         sc->buf_capacity = total_bytes;
-        sc->buffer = (char*)mem_realloc(sc->buffer, sc->buf_capacity);
     }
 
     if (total_bytes > 0) {
@@ -291,14 +312,74 @@ int Tablet::set_column_string_values(uint32_t schema_index,
     if (bitmap == nullptr) {
         bitmaps_[schema_index].clear_all();
     } else {
-        char* tsfile_bm = bitmaps_[schema_index].get_bitmap();
         uint32_t bm_bytes = (count + 7) / 8;
-        std::memcpy(tsfile_bm, bitmap, bm_bytes);
+        bitmaps_[schema_index].copy_from(reinterpret_cast<const char*>(bitmap),
+                                         bm_bytes);
+    }
+    cur_row_size_ = std::max(count, cur_row_size_);
+    return E_OK;
+}
+
+int Tablet::set_column_string_repeated(uint32_t schema_index, const char* str,
+                                       uint32_t str_len, uint32_t count) {
+    if (err_code_ != E_OK) return err_code_;
+    if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE;
+    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_)))
+        return E_OUT_OF_RANGE;
+
+    // See set_column_string_values: the union member is only valid as
+    // StringColumn* when the schema column is a variable-width type.
+    const TSDataType dt = schema_vec_->at(schema_index).data_type_;
+    if (dt != STRING && dt != TEXT && dt != BLOB) {
+        return E_TYPE_NOT_MATCH;
+    }
+    StringColumn* sc = value_matrix_[schema_index].string_col;
+    if (sc == nullptr) return E_INVALID_ARG;
+
+    // str_len * count can overflow uint32_t; do the multiply in uint64_t and
+    // reject anything that wouldn't fit, otherwise the subsequent loop would
+    // walk past the truncated buf_capacity allocation.
+    uint64_t total_bytes_64 =
+        static_cast<uint64_t>(str_len) * static_cast<uint64_t>(count);
+    if (total_bytes_64 > std::numeric_limits<uint32_t>::max()) {
+        return E_OVERFLOW;
     }
+    uint32_t total_bytes = static_cast<uint32_t>(total_bytes_64);
+    if (total_bytes > sc->buf_capacity) {
+        char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes);
+        if (UNLIKELY(new_buf == nullptr)) return E_OOM;
+        sc->buffer = new_buf;
+        sc->buf_capacity = total_bytes;
+    }
+
+    for (uint32_t i = 0; i < count; i++) {
+        sc->offsets[i] = i * str_len;
+        memcpy(sc->buffer + i * str_len, str, str_len);
+    }
+    sc->offsets[count] = total_bytes;
+    sc->buf_used = total_bytes;
+
+    bitmaps_[schema_index].clear_all();
     cur_row_size_ = std::max(count, cur_row_size_);
     return E_OK;
 }
 
+void Tablet::reset(uint32_t row_count) {
+    ASSERT(row_count <= max_row_num_);
+    cur_row_size_ = row_count;
+    reset_string_columns();
+    // Bitmaps init to all-null (bit=1); writes flip bits to mark non-null.
+    // Without resetting them here, a reused Tablet would inherit cleared
+    // bits from the previous batch, causing stale values to be reported as
+    // non-null and written out again.
+    if (bitmaps_ != nullptr) {
+        const size_t schema_count = schema_vec_->size();
+        for (size_t c = 0; c < schema_count; c++) {
+            bitmaps_[c].reset();
+        }
+    }
+}
+
 void* Tablet::get_value(int row_index, uint32_t schema_index,
                         common::TSDataType& data_type) const {
     if (UNLIKELY(schema_index >= schema_vec_->size())) {
@@ -505,31 +586,21 @@ void Tablet::reset_string_columns() {
     }
 }
 
-// Find all row indices where the device ID changes.  A device ID is the
-// composite key formed by all id columns (e.g. region + sensor_id).  Row i
-// is a boundary when at least one id column differs between row i-1 and row i.
-//
-// Example (2 id columns: region, sensor_id):
-//   row 0: "A", "s1"
-//   row 1: "A", "s2"  <- boundary: sensor_id changed
-//   row 2: "B", "s1"  <- boundary: region changed
-//   row 3: "B", "s1"
-//   row 4: "B", "s2"  <- boundary: sensor_id changed
-//   result: [1, 2, 4]
-//
-// Boundaries are computed in one shot at flush time rather than maintained
-// incrementally during add_value / set_column_*. The total work is similar
-// either way, but batch computation here is far more CPU-friendly: the inner
-// loop is a tight memcmp scan over contiguous buffers with good cache
-// locality, and the CPU can pipeline comparisons without the branch overhead
-// and cache thrashing of per-row bookkeeping spread across the write path.
 std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
     const uint32_t row_count = get_cur_row_size();
     if (row_count <= 1) return {};
 
+    // Use uint64_t bitmap instead of vector<bool> for faster set/test/scan.
     const uint32_t nwords = (row_count + 63) / 64;
     std::vector<uint64_t> boundary(nwords, 0);
 
+    // Walk id columns RIGHT to LEFT.  In time-series tag systems the rightmost
+    // tags (sensor_id, metric_name, etc.) typically have the highest
+    // cardinality and change most often.  By processing them first we mark most
+    // of the boundary bitmap early; subsequent (lower-cardinality) columns then
+    // short- circuit on `boundary[i] already set` for the bulk of their rows.
+    // Reverse order also lets us bail out of the entire scan as soon as every
+    // possible boundary is marked.
     uint32_t boundary_count = 0;
     const uint32_t max_boundaries = row_count - 1;
     for (auto it = id_column_indexes_.rbegin(); it != id_column_indexes_.rend();
@@ -537,43 +608,55 @@ std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
         const StringColumn& sc = *value_matrix_[*it].string_col;
         const int32_t* off = sc.offsets;
         const char* buf = sc.buffer;
+        common::BitMap& bitmap = const_cast<common::BitMap&>(bitmaps_[*it]);
         for (uint32_t i = 1; i < row_count; i++) {
-            if (boundary[i >> 6] & (1ULL << (i & 63))) continue;
+            if (boundary[i >> 6] & (1ULL << (i & 63))) {
+                continue;
+            }
+            const bool prev_null = bitmap.test(i - 1);
+            const bool curr_null = bitmap.test(i);
+            if (prev_null != curr_null) {
+                boundary[i >> 6] |= (1ULL << (i & 63));
+                if (++boundary_count >= max_boundaries) {
+                    break;
+                }
+                continue;
+            }
+            if (prev_null) {
+                continue;
+            }
+            // Signed int32 widths so an offset-array corruption that would
+            // otherwise underflow to a huge unsigned value surfaces as
+            // len < 0 instead.  memcmp's size_t param needs an explicit cast,
+            // guarded by `len_a > 0`.
             int32_t len_a = off[i] - off[i - 1];
             int32_t len_b = off[i + 1] - off[i];
             if (len_a != len_b ||
                 (len_a > 0 && memcmp(buf + off[i - 1], buf + off[i],
-                                     static_cast<uint32_t>(len_a)) != 0)) {
+                                     static_cast<size_t>(len_a)) != 0)) {
                 boundary[i >> 6] |= (1ULL << (i & 63));
-                if (++boundary_count >= max_boundaries) break;
+                if (++boundary_count >= max_boundaries) {
+                    break;
+                }
             }
         }
-        if (boundary_count >= max_boundaries) break;
-    }
-
-    // Sweep the bitmap word by word, extracting set bit positions in order.
-    // Each word covers 64 consecutive rows: word w covers rows [w*64, w*64+63].
-    //
-    // For each word we use two standard bit tricks:
-    //   __builtin_ctzll(bits)  — count trailing zeros = index of lowest set bit
-    //   bits &= bits - 1       — clear the lowest set bit
-    //
-    // Example: w=1, bits=0b...00010100 (bits 2 and 4 set)
-    //   iter 1: ctzll=2 → idx=1*64+2=66, bits becomes 0b...00010000
-    //   iter 2: ctzll=4 → idx=1*64+4=68, bits becomes 0b...00000000 → exit
-    //
-    // Guards: idx>0 because row 0 can never be a boundary (no predecessor);
-    // idx<row_count trims padding bits in the last word when row_count%64 != 0.
+        if (boundary_count >= max_boundaries) {
+            break;
+        }
+    }
+
+    // Collect boundary positions using bitscan
     std::vector<uint32_t> result;
     for (uint32_t w = 0; w < nwords; w++) {
         uint64_t bits = boundary[w];
         while (bits) {
-            uint32_t bit = bitops::ctz64_nonzero(bits);
+            uint32_t bit =
+                static_cast<uint32_t>(common::bitops::ctz_nonzero(bits));
             uint32_t idx = w * 64 + bit;
             if (idx > 0 && idx < row_count) {
                 result.push_back(idx);
             }
-            bits &= bits - 1;
+            bits &= bits - 1;  // clear lowest set bit
         }
     }
     return result;
@@ -612,4 +695,4 @@ std::shared_ptr<IDeviceID> Tablet::get_device_id(int i) const {
     return res;
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/common/tablet.h b/cpp/src/common/tablet.h
index 799d6b7cc..76af3ac0e 100644
--- a/cpp/src/common/tablet.h
+++ b/cpp/src/common/tablet.h
@@ -22,7 +22,6 @@
 
 #include <algorithm>
 #include <memory>
-#include <utility>
 #include <vector>
 
 #include "common/config/config.h"
@@ -47,7 +46,6 @@ class TabletColIterator;
  * with their associated metadata such as column names and types.
  */
 class Tablet {
-   public:
     // Arrow-style string column: offsets + contiguous buffer.
     // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i]
     struct StringColumn {
@@ -61,11 +59,10 @@ class Tablet {
 
         void init(uint32_t max_rows, uint32_t init_buf_capacity) {
             offsets = (int32_t*)common::mem_alloc(
-                sizeof(int32_t) * (max_rows + 1), common::MOD_DEFAULT);
+                sizeof(int32_t) * (max_rows + 1), common::MOD_TABLET);
             offsets[0] = 0;
             buf_capacity = init_buf_capacity;
-            buffer =
-                (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT);
+            buffer = (char*)common::mem_alloc(buf_capacity, common::MOD_TABLET);
             buf_used = 0;
         }
 
@@ -98,14 +95,13 @@ class Tablet {
             return buffer + offsets[row];
         }
         uint32_t get_len(uint32_t row) const {
-            return static_cast<uint32_t>(offsets[row + 1] - offsets[row]);
+            return offsets[row + 1] - offsets[row];
         }
         // Return a String view for a given row. The returned reference is
         // valid until the next call to get_string_view on this column.
         common::String& get_string_view(uint32_t row) {
             view_cache_.buf_ = buffer + offsets[row];
-            view_cache_.len_ =
-                static_cast<uint32_t>(offsets[row + 1] - offsets[row]);
+            view_cache_.len_ = offsets[row + 1] - offsets[row];
             return view_cache_;
         }
 
@@ -231,11 +227,14 @@ class Tablet {
 
     ~Tablet() { destroy(); }
 
-    // Tablet owns raw heap buffers (timestamps_, value_matrix_, bitmaps_) that
-    // destroy() frees. The implicitly generated copy operations would shallow-
-    // copy those pointers, causing double-free / use-after-free, so copying is
-    // disabled. Move transfers ownership and leaves the source empty (its
-    // pointers nulled) so the moved-from object destructs harmlessly.
+    // Tablet owns several heap buffers (timestamps_, value_matrix_ with its
+    // StringColumn::buffer/offsets, bitmaps_) that ~Tablet frees. The default
+    // copy ctor / copy-assign shallow-copies the raw pointers, so any copy
+    // path (e.g. `return tablet;` without NRVO under MSVC Debug) leaves the
+    // source Tablet's destructor freeing buffers the copy still points at,
+    // triggering heap-use-after-free in code like
+    // Tablet::find_all_device_boundaries. Make Tablet move-only with a
+    // pointer-stealing move ctor / move-assign so return-by-value is safe.
     Tablet(const Tablet&) = delete;
     Tablet& operator=(const Tablet&) = delete;
 
@@ -250,10 +249,14 @@ class Tablet {
           value_matrix_(other.value_matrix_),
           bitmaps_(other.bitmaps_),
           column_categories_(std::move(other.column_categories_)),
-          id_column_indexes_(std::move(other.id_column_indexes_)) {
+          id_column_indexes_(std::move(other.id_column_indexes_)),
+          single_device_(other.single_device_) {
         other.timestamps_ = nullptr;
         other.value_matrix_ = nullptr;
         other.bitmaps_ = nullptr;
+        other.cur_row_size_ = 0;
+        // Leaving other.schema_vec_ moved-from is fine; destroy() only
+        // touches the heap buffers above, which we've now nulled out.
     }
 
     Tablet& operator=(Tablet&& other) noexcept {
@@ -270,9 +273,11 @@ class Tablet {
             bitmaps_ = other.bitmaps_;
             column_categories_ = std::move(other.column_categories_);
             id_column_indexes_ = std::move(other.id_column_indexes_);
+            single_device_ = other.single_device_;
             other.timestamps_ = nullptr;
             other.value_matrix_ = nullptr;
             other.bitmaps_ = nullptr;
+            other.cur_row_size_ = 0;
         }
         return *this;
     }
@@ -283,12 +288,6 @@ class Tablet {
     }
     size_t get_column_count() const { return schema_vec_->size(); }
     uint32_t get_cur_row_size() const { return cur_row_size_; }
-    int64_t get_timestamp(uint32_t row_index) const {
-        return timestamps_[row_index];
-    }
-    bool is_null(uint32_t row_index, uint32_t col_index) const {
-        return bitmaps_[col_index].test(row_index);
-    }
 
     /**
      * @brief Adds a timestamp to the specified row.
@@ -300,25 +299,27 @@ class Tablet {
      */
     int add_timestamp(uint32_t row_index, int64_t timestamp);
 
-    /**
-     * @brief Bulk copy timestamps into the tablet.
-     *
-     * @param timestamps Pointer to an array of timestamp values.
-     * @param count Number of timestamps to copy. Must be <= max_row_num.
-     *        If count > cur_row_size_, cur_row_size_ is updated to count,
-     *        so that subsequent operations know how many rows are populated.
-     * @return Returns 0 on success, or a non-zero error code on failure
-     *         (E_OUT_OF_RANGE if count > max_row_num).
-     */
     int set_timestamps(const int64_t* timestamps, uint32_t count);
 
-    // Bulk copy fixed-length column data. If bitmap is nullptr, all rows are
-    // non-null. Otherwise bit=1 means null, bit=0 means valid (same as TsFile
-    // BitMap convention). Callers using other conventions (e.g. Arrow, where
-    // 1=valid) must invert before calling.
+    // Bulk copy fixed-length column data. bitmap=nullptr means all non-null.
+    // bitmap uses TsFile convention: bit=1 is null, bit=0 is valid.
     int set_column_values(uint32_t schema_index, const void* data,
                           const uint8_t* bitmap, uint32_t count);
 
+    // Bulk copy a STRING column from Arrow-style offsets + flat data buffer.
+    // bitmap=nullptr means all non-null; same convention as set_column_values.
+    int set_column_string_values(uint32_t schema_index, const int32_t* offsets,
+                                 const char* data, const uint8_t* bitmap,
+                                 uint32_t count);
+
+    // Bulk fill a STRING column with the same value for all rows.
+    int set_column_string_repeated(uint32_t schema_index, const char* str,
+                                   uint32_t str_len, uint32_t count);
+
+    // Reset per-batch state so the tablet can be reused without reallocating
+    // its backing buffers. row_count is typically 0 before refilling.
+    void reset(uint32_t row_count = 0);
+
     void* get_value(int row_index, uint32_t schema_index,
                     common::TSDataType& data_type) const;
     /**
@@ -341,14 +342,10 @@ class Tablet {
     std::shared_ptr<IDeviceID> get_device_id(int i) const;
     std::vector<uint32_t> find_all_device_boundaries() const;
 
-    // Bulk copy string column data (offsets + data buffer).
-    // offsets has count+1 entries and must start from 0 (offsets[0] == 0).
-    // bitmap follows TsFile convention (bit=1 means null, nullptr means all
-    // valid). Callers using Arrow convention (bit=1 means valid) must invert
-    // before calling.
-    int set_column_string_values(uint32_t schema_index, const int32_t* offsets,
-                                 const char* data, const uint8_t* bitmap,
-                                 uint32_t count);
+    // When the caller guarantees that all rows belong to a single device,
+    // set this flag to skip the O(n*m) boundary detection in the write path.
+    void set_single_device(bool v) { single_device_ = v; }
+    bool is_single_device() const { return single_device_; }
     /**
      * @brief Template function to add a value of type T to the specified row
      * and column by name.
@@ -406,6 +403,7 @@ class Tablet {
     common::BitMap* bitmaps_;
     std::vector<common::ColumnCategory> column_categories_;
     std::vector<int> id_column_indexes_;
+    bool single_device_ = false;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/common/thread_pool.h b/cpp/src/common/thread_pool.h
index f82aea038..191001bd9 100644
--- a/cpp/src/common/thread_pool.h
+++ b/cpp/src/common/thread_pool.h
@@ -27,7 +27,6 @@
 #include <mutex>
 #include <queue>
 #include <thread>
-#include <type_traits>
 #include <vector>
 
 namespace common {
@@ -38,12 +37,27 @@ namespace common {
 // (column-parallel decoding).
 class ThreadPool {
    public:
-    explicit ThreadPool(size_t num_threads) : stop_(false), active_(0) {
-        for (size_t i = 0; i < num_threads; i++) {
-            workers_.emplace_back([this] { worker_loop(); });
+    explicit ThreadPool(size_t num_threads)
+        // A zero-thread pool would silently accept submit() but wait_all()
+        // would block forever because active_ never reaches 0.  init_common()
+        // already clamps the configured size to >= 1 before building the
+        // global pool; this normalization is a defensive backstop so any
+        // direct ThreadPool(0) still makes progress.
+        : num_threads_(num_threads == 0 ? 1 : num_threads),
+          stop_(false),
+          active_(0) {
+        for (size_t i = 0; i < num_threads_; i++) {
+            workers_.emplace_back([this, i] { worker_loop(i); });
         }
     }
 
+    // Returns this worker's index in [0, num_threads).  Returns SIZE_MAX when
+    // called from a non-pool thread.  Used by callers that want per-worker
+    // state (e.g., per-worker decoders/compressors).
+    static size_t current_worker_id() { return tl_worker_id_(); }
+
+    size_t num_threads() const { return num_threads_; }
+
     ~ThreadPool() {
         {
             std::lock_guard<std::mutex> lk(mu_);
@@ -88,7 +102,8 @@ class ThreadPool {
     }
 
    private:
-    void worker_loop() {
+    void worker_loop(size_t id) {
+        tl_worker_id_() = id;
         while (true) {
             std::function<void()> task;
             {
@@ -98,7 +113,23 @@ class ThreadPool {
                 task = std::move(tasks_.front());
                 tasks_.pop();
             }
-            task();
+            // Without the try/catch, a task that throws would:
+            //   (1) skip the active_-- below → wait_all() blocks forever
+            //       because active_ never drops to zero, and
+            //   (2) propagate the exception out of the std::thread function
+            //       → std::terminate() takes down the whole process.
+            // Swallowing the exception is unfortunate but it matches the
+            // contract of the public submit(std::function<void()>) overload
+            // which has no way to surface the failure back to the caller.
+            // submit<F>() callers receive their error via the std::future
+            // wrapper installed by std::packaged_task — that path never
+            // reaches here, so this catch only fires for fire-and-forget
+            // tasks where the alternative is termination.
+            try {
+                task();
+            } catch (...) {
+                // Intentionally suppressed; see comment above.
+            }
             {
                 std::lock_guard<std::mutex> lk(mu_);
                 active_--;
@@ -107,6 +138,14 @@ class ThreadPool {
         }
     }
 
+    // Wrapped in a function so static-initialization order is well-defined
+    // (function-local static is zero-initialized to a sentinel).
+    static size_t& tl_worker_id_() {
+        static thread_local size_t id = static_cast<size_t>(-1);
+        return id;
+    }
+
+    size_t num_threads_;
     std::vector<std::thread> workers_;
     std::queue<std::function<void()>> tasks_;
     std::mutex mu_;
diff --git a/cpp/src/common/tsblock/tsblock.h b/cpp/src/common/tsblock/tsblock.h
index 859ad393d..b68af1611 100644
--- a/cpp/src/common/tsblock/tsblock.h
+++ b/cpp/src/common/tsblock/tsblock.h
@@ -144,6 +144,12 @@ class RowAppender {
         ASSERT(tsblock_->row_count_ > 0);
         tsblock_->row_count_--;
     }
+    FORCE_INLINE uint32_t remaining() const {
+        return tsblock_->max_row_count_ - tsblock_->row_count_;
+    }
+    FORCE_INLINE void add_rows(uint32_t count) {
+        tsblock_->row_count_ += count;
+    }
 
     FORCE_INLINE void append(uint32_t slot_index, const char* value,
                              uint32_t len) {
@@ -222,6 +228,19 @@ class ColAppender {
     }
     FORCE_INLINE void reset() { column_row_count_ = 0; }
 
+    FORCE_INLINE void bulk_append_fixed(const char* data, uint32_t count,
+                                        uint32_t elem_size) {
+        vec_->get_value_data().append_fixed_value(data, count * elem_size);
+        vec_->add_row_nums(count);
+        column_row_count_ += count;
+    }
+
+    FORCE_INLINE uint32_t get_column_row_count() const {
+        return column_row_count_;
+    }
+
+    FORCE_INLINE Vector* get_vector() { return vec_; }
+
    private:
     uint32_t column_index_;
     uint32_t column_row_count_;
@@ -242,6 +261,8 @@ class RowIterator {
 
     FORCE_INLINE bool has_next() { return row_id_ < tsblock_->row_count_; }
 
+    FORCE_INLINE uint32_t get_row_id() const { return row_id_; }
+
     FORCE_INLINE uint32_t get_column_count() { return column_count_; }
 
     FORCE_INLINE TSDataType get_data_type(uint32_t column_index) {
@@ -251,17 +272,14 @@ class RowIterator {
 
     FORCE_INLINE void next() {
         ASSERT(row_id_ < tsblock_->row_count_);
-        ++row_id_;
+        const uint32_t current_row_id = row_id_++;
         for (uint32_t i = 0; i < column_count_; ++i) {
-            tsblock_->vectors_[i]->update_offset();
+            if (!tsblock_->vectors_[i]->is_null(current_row_id)) {
+                tsblock_->vectors_[i]->update_offset();
+            }
         }
     }
 
-    FORCE_INLINE void next(size_t ind) const {
-        ASSERT(row_id_ < tsblock_->row_count_);
-        tsblock_->vectors_[ind]->update_offset();
-    }
-
     FORCE_INLINE void update_row_id() { row_id_++; }
 
     FORCE_INLINE char* read(uint32_t column_index, uint32_t* __restrict len,
@@ -271,6 +289,22 @@ class RowIterator {
         return vec->read(len, null, row_id_);
     }
 
+    // Cheap null check at the current row that avoids the value-read path.
+    FORCE_INLINE bool is_null_at(uint32_t column_index) {
+        ASSERT(column_index < column_count_);
+        return tsblock_->vectors_[column_index]->is_null(row_id_);
+    }
+
+    // Direct access to the underlying Vector for the column. Caller is
+    // responsible for type-correct interpretation of the buffer; intended
+    // for the fast typed-read path that wants to bypass Vector::read's
+    // virtual dispatch (read into the raw buffer at the vector's current
+    // offset_).
+    FORCE_INLINE Vector* get_vector(uint32_t column_index) {
+        ASSERT(column_index < column_count_);
+        return tsblock_->vectors_[column_index];
+    }
+
     std::string debug_string();  // for debug
 
    private:
@@ -311,6 +345,23 @@ class ColIterator {
 
     FORCE_INLINE uint32_t get_column_index() { return column_index_; }
 
+    FORCE_INLINE uint32_t remaining() const {
+        return tsblock_->row_count_ - row_id_;
+    }
+    FORCE_INLINE char* data_ptr() {
+        return vec_->get_value_data().get_data() + vec_->get_offset();
+    }
+    FORCE_INLINE void advance(uint32_t n, uint32_t elem_size) {
+        row_id_ += n;
+        vec_->advance_offset(n * elem_size);
+    }
+
+    FORCE_INLINE void advance_row_only(uint32_t n) { row_id_ += n; }
+
+    FORCE_INLINE uint32_t get_row_id() const { return row_id_; }
+
+    FORCE_INLINE Vector* get_vector() { return vec_; }
+
    private:
     uint32_t column_index_;
     uint32_t row_id_;
diff --git a/cpp/src/common/tsblock/vector/variable_length_vector.h b/cpp/src/common/tsblock/vector/variable_length_vector.h
index b98a9c739..84e541e5c 100644
--- a/cpp/src/common/tsblock/vector/variable_length_vector.h
+++ b/cpp/src/common/tsblock/vector/variable_length_vector.h
@@ -45,8 +45,15 @@ class VariableLengthVector : public Vector {
 
     // cppcheck-suppress missingOverride
     FORCE_INLINE void update_offset() OVERRIDE {
-        offset_ += variable_type_len_;
-        offset_ += last_value_len_;
+        // Self-contained advance: read the length prefix at the current
+        // offset from the buffer rather than relying on a side effect from
+        // a prior read(). This makes update_offset safe when callers skip
+        // reading variable-length columns for some rows (e.g. a row
+        // iterator that only consumes fixed-width columns).
+        uint32_t value_len = 0;
+        std::memcpy(&value_len, values_.get_data() + offset_,
+                    sizeof(value_len));
+        offset_ += variable_type_len_ + value_len;
     }
 
     // cppcheck-suppress missingOverride
diff --git a/cpp/src/common/tsblock/vector/vector.h b/cpp/src/common/tsblock/vector/vector.h
index 37a96c543..dde3e76cc 100644
--- a/cpp/src/common/tsblock/vector/vector.h
+++ b/cpp/src/common/tsblock/vector/vector.h
@@ -73,6 +73,9 @@ class Vector {
     FORCE_INLINE uint32_t get_row_num() { return row_num_; }
 
     FORCE_INLINE void add_row_num() { row_num_++; }
+    FORCE_INLINE void add_row_nums(uint32_t n) { row_num_ += n; }
+    FORCE_INLINE uint32_t get_offset() const { return offset_; }
+    FORCE_INLINE void advance_offset(uint32_t bytes) { offset_ += bytes; }
 
     FORCE_INLINE common::TsBlock* get_tsblock() { return tsblock_; }
 
diff --git a/cpp/src/common/tsfile_common.h b/cpp/src/common/tsfile_common.h
index b516b608f..fd3690200 100644
--- a/cpp/src/common/tsfile_common.h
+++ b/cpp/src/common/tsfile_common.h
@@ -314,6 +314,11 @@ class ITimeseriesIndex {
     virtual common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list() const {
         return nullptr;
     }
+    virtual uint32_t get_value_column_count() const { return 1; }
+    virtual common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list(
+        uint32_t col_index) const {
+        return col_index == 0 ? get_value_chunk_meta_list() : nullptr;
+    }
 
     virtual common::String get_measurement_name() const {
         return common::String();
@@ -457,7 +462,7 @@ class TimeseriesIndex : public ITimeseriesIndex {
                 (timeseries_meta_type_ & 0x3F);  // TODO
             chunk_meta_list_ =
                 new (chunk_meta_list_buf) common::SimpleList<ChunkMeta*>(pa);
-            uint32_t start_pos = in.read_pos();
+            uint64_t start_pos = in.read_pos();
             while (IS_SUCC(ret) &&
                    in.read_pos() < start_pos + chunk_meta_list_data_size_) {
                 void* cm_buf = pa->alloc(sizeof(ChunkMeta));
@@ -589,11 +594,17 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex {
     virtual common::String get_measurement_name() const {
         return value_ts_idx_->get_measurement_name();
     }
+    // Return the VALUE column's data type — that's what consumers like
+    // TsFileReader::get_timeseries_schema and metadata APIs expect for an
+    // aligned measurement.  Returning time_ts_idx_->get_data_type() would
+    // surface the time chunk's on-wire VECTOR marker (or INT64 depending
+    // on how the marker is interpreted) for every aligned timeseries,
+    // breaking schema introspection.
     virtual common::TSDataType get_data_type() const {
         return value_ts_idx_ == nullptr ? common::INVALID_DATATYPE
                                         : value_ts_idx_->get_data_type();
     }
-    virtual bool is_aligned() const { return true; }
+    bool is_aligned() const override { return true; }
     virtual Statistic* get_statistic() const {
         return value_ts_idx_->get_statistic();
     }
@@ -608,6 +619,52 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex {
 #endif
 };
 
+class MultiAlignedTimeseriesIndex : public ITimeseriesIndex {
+   public:
+    TimeseriesIndex* time_ts_idx_ = nullptr;
+    std::vector<TimeseriesIndex*> value_ts_idxs_;
+
+    MultiAlignedTimeseriesIndex() {}
+    ~MultiAlignedTimeseriesIndex() {}
+
+    common::SimpleList<ChunkMeta*>* get_time_chunk_meta_list() const override {
+        return time_ts_idx_ ? time_ts_idx_->get_chunk_meta_list() : nullptr;
+    }
+    common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list() const override {
+        return value_ts_idxs_.empty()
+                   ? nullptr
+                   : value_ts_idxs_[0]->get_chunk_meta_list();
+    }
+    uint32_t get_value_column_count() const override {
+        return value_ts_idxs_.size();
+    }
+    common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list(
+        uint32_t col_index) const override {
+        return col_index < value_ts_idxs_.size()
+                   ? value_ts_idxs_[col_index]->get_chunk_meta_list()
+                   : nullptr;
+    }
+    common::String get_measurement_name() const override {
+        return value_ts_idxs_.empty()
+                   ? common::String()
+                   : value_ts_idxs_[0]->get_measurement_name();
+    }
+    // Same fix as AlignedTimeseriesIndex: report the first value column's
+    // type rather than the time chunk's VECTOR marker.  Consumers walking
+    // a multi-aligned device for schema info expect the measurement type.
+    common::TSDataType get_data_type() const override {
+        return value_ts_idxs_.empty() || value_ts_idxs_[0] == nullptr
+                   ? common::INVALID_DATATYPE
+                   : value_ts_idxs_[0]->get_data_type();
+    }
+    bool is_aligned() const override { return true; }
+    Statistic* get_statistic() const override { return nullptr; }
+
+    const std::vector<TimeseriesIndex*>& get_value_indices() const {
+        return value_ts_idxs_;
+    }
+};
+
 class TSMIterator {
    public:
     explicit TSMIterator(
@@ -629,7 +686,6 @@ class TSMIterator {
     common::SimpleList<ChunkMeta*>::Iterator chunk_meta_iter_;
 
     // timeseries measurenemnt chunk meta info
-    // map <device_name, <measurement_name, vector<chunk_meta>>>
     std::map<std::shared_ptr<IDeviceID>,
              std::map<common::String, std::vector<ChunkMeta*>>,
              IDeviceIDComparator>
diff --git a/cpp/src/compress/lz4_compressor.cc b/cpp/src/compress/lz4_compressor.cc
index 88c64466f..0f19ce179 100644
--- a/cpp/src/compress/lz4_compressor.cc
+++ b/cpp/src/compress/lz4_compressor.cc
@@ -76,9 +76,13 @@ int LZ4Compressor::compress(char* uncompressed_buf,
 }
 
 void LZ4Compressor::after_compress(char* compressed_buf) {
+    // See SnappyCompressor::after_compress for the same reasoning: the member
+    // pointer can lag behind the caller-known buffer across page reuse.
     if (compressed_buf != nullptr) {
-        mem_free(compressed_buf_);
-        compressed_buf_ = nullptr;
+        mem_free(compressed_buf);
+        if (compressed_buf_ == compressed_buf) {
+            compressed_buf_ = nullptr;
+        }
     }
 }
 
@@ -132,9 +136,11 @@ int LZ4Compressor::uncompress(char* compressed_buf, uint32_t compressed_buf_len,
 
 void LZ4Compressor::after_uncompress(char* uncompressed_buf) {
     if (uncompressed_buf != nullptr) {
-        mem_free(uncompressed_buf_);
-        uncompressed_buf_ = nullptr;
+        mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
     }
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/compress/snappy_compressor.cc b/cpp/src/compress/snappy_compressor.cc
index 6a2735e7b..e78a67ac3 100644
--- a/cpp/src/compress/snappy_compressor.cc
+++ b/cpp/src/compress/snappy_compressor.cc
@@ -73,9 +73,16 @@ int SnappyCompressor::compress(char* uncompressed_buf,
 }
 
 void SnappyCompressor::after_compress(char* compressed_buf) {
+    // Free the buffer the caller is releasing, not whatever we last cached in
+    // compressed_buf_. The member is only kept so destroy() can clean up if
+    // after_compress is never called. When the same compressor is reused
+    // across pages, compressed_buf_ may point to a different (live) allocation
+    // or be null by the time the caller releases an earlier page's buffer.
     if (compressed_buf != nullptr) {
-        mem_free(compressed_buf_);
-        compressed_buf_ = nullptr;
+        mem_free(compressed_buf);
+        if (compressed_buf_ == compressed_buf) {
+            compressed_buf_ = nullptr;
+        }
     }
 }
 
@@ -109,9 +116,11 @@ int SnappyCompressor::uncompress(char* compressed_buf,
 
 void SnappyCompressor::after_uncompress(char* uncompressed_buf) {
     if (uncompressed_buf != nullptr) {
-        mem_free(uncompressed_buf_);
-        uncompressed_buf_ = nullptr;
+        mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
     }
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/compress/uncompressed_compressor.h b/cpp/src/compress/uncompressed_compressor.h
index c262837a8..c342b5001 100644
--- a/cpp/src/compress/uncompressed_compressor.h
+++ b/cpp/src/compress/uncompressed_compressor.h
@@ -20,19 +20,38 @@
 #ifndef COMPRESS_UNCOMPRESSED_COMPRESSOR_H
 #define COMPRESS_UNCOMPRESSED_COMPRESSOR_H
 
+#include <string.h>
+
+#include "common/allocator/alloc_base.h"
 #include "compressor.h"
+#include "utils/errno_define.h"
+#include "utils/util_define.h"
 
 namespace storage {
 
 class UncompressedCompressor : public Compressor {
    public:
-    UncompressedCompressor() {}
-    virtual ~UncompressedCompressor() {}
+    UncompressedCompressor() : uncompressed_buf_(nullptr) {}
+    virtual ~UncompressedCompressor() {
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
+    }
     int reset(bool for_compress) {
         UNUSED(for_compress);
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
         return common::E_OK;
     }
-    void destroy() {}
+    void destroy() {
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
+    }
     int compress(char* uncompressed_buf, uint32_t uncompressed_buf_len,
                  char*& compressed_buf, uint32_t& compressed_buf_len) {
         compressed_buf = uncompressed_buf;
@@ -43,11 +62,33 @@ class UncompressedCompressor : public Compressor {
 
     int uncompress(char* compressed_buf, uint32_t compressed_buf_len,
                    char*& uncompressed_buf, uint32_t& uncompressed_buf_len) {
-        uncompressed_buf = compressed_buf;
+        char* buf = static_cast<char*>(
+            common::mem_alloc(compressed_buf_len, common::MOD_COMPRESSOR_OBJ));
+        if (buf == nullptr) {
+            return common::E_OOM;
+        }
+        memcpy(buf, compressed_buf, compressed_buf_len);
+        uncompressed_buf = buf;
+        uncompressed_buf_ = buf;
         uncompressed_buf_len = compressed_buf_len;
         return common::E_OK;
     }
-    void after_uncompress(char* uncompressed_buf) { UNUSED(uncompressed_buf); }
+    void after_uncompress(char* uncompressed_buf) {
+        // Free the buffer the caller is releasing, not the most-recently
+        // allocated one cached in uncompressed_buf_.  Two successive
+        // uncompress() calls would overwrite uncompressed_buf_ with the
+        // second allocation; after_uncompress(first) used to free that
+        // second buffer (use-after-free for the still-live one) and leak
+        // the first.
+        if (uncompressed_buf == nullptr) return;
+        common::mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
+    }
+
+   private:
+    char* uncompressed_buf_;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/cwrapper/arrow_c.cc b/cpp/src/cwrapper/arrow_c.cc
index 931c17de7..3f02a7692 100644
--- a/cpp/src/cwrapper/arrow_c.cc
+++ b/cpp/src/cwrapper/arrow_c.cc
@@ -843,7 +843,12 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
         const ArrowArray* ts_arr = in_array->children[time_col_index];
         const int64_t* ts_buf =
             static_cast<const int64_t*>(ts_arr->buffers[1]) + ts_arr->offset;
-        tablet->set_timestamps(ts_buf, static_cast<uint32_t>(n_rows));
+        int sret =
+            tablet->set_timestamps(ts_buf, static_cast<uint32_t>(n_rows));
+        if (sret != common::E_OK) {
+            delete tablet;
+            return sret;
+        }
     }
 
     // Fill data columns from Arrow children (use read_modes to decode buffers)
@@ -892,11 +897,15 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
                     delete tablet;
                     return common::E_OOM;
                 }
-                tablet->set_column_values(tcol, data, null_bm,
-                                          static_cast<uint32_t>(n_rows));
+                int sret = tablet->set_column_values(
+                    tcol, data, null_bm, static_cast<uint32_t>(n_rows));
                 if (null_bm != nullptr) {
                     common::mem_free(null_bm);
                 }
+                if (sret != common::E_OK) {
+                    delete tablet;
+                    return sret;
+                }
                 break;
             }
             case common::DATE: {
@@ -948,14 +957,18 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
                     delete tablet;
                     return common::E_OOM;
                 }
-                tablet->set_column_string_values(tcol, offsets, data, null_bm,
-                                                 nrows);
+                int sret = tablet->set_column_string_values(tcol, offsets, data,
+                                                            null_bm, nrows);
                 if (null_bm != nullptr) {
                     common::mem_free(null_bm);
                 }
                 if (norm_offsets != nullptr) {
                     common::mem_free(norm_offsets);
                 }
+                if (sret != common::E_OK) {
+                    delete tablet;
+                    return sret;
+                }
                 break;
             }
             default:
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc
index d9e19fb6b..5661927f3 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.cc
+++ b/cpp/src/cwrapper/tsfile_cwrapper.cc
@@ -21,7 +21,9 @@
 
 #include <file/write_file.h>
 #include <reader/qds_without_timegenerator.h>
+#include <sys/stat.h>
 #include <writer/tsfile_table_writer.h>
+
 #ifdef _WIN32
 #include <io.h>
 #else
@@ -92,8 +94,14 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) {
     int ret;
     init_tsfile_config();
 
-    if (access(pathname, F_OK) == 0) {
-        *err_code = common::E_ALREADY_EXIST;
+    struct stat path_stat {};
+    if (stat(pathname, &path_stat) == 0) {
+#ifdef _WIN32
+        const bool is_dir = (path_stat.st_mode & _S_IFDIR) != 0;
+#else
+        const bool is_dir = S_ISDIR(path_stat.st_mode);
+#endif
+        *err_code = is_dir ? common::E_FILE_OPEN_ERR : common::E_ALREADY_EXIST;
         return nullptr;
     }
 
@@ -110,6 +118,17 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) {
 
 TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema,
                                ERRNO* err_code) {
+    // C API: every public entry must defend against null callers — a null
+    // schema or err_code would crash the host process the moment it's
+    // dereferenced.  The tag-filter helpers already follow this pattern.
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (file == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
@@ -149,6 +168,15 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file,
                                                      TableSchema* schema,
                                                      uint64_t memory_threshold,
                                                      ERRNO* err_code) {
+    // See tsfile_writer_new() above for the null-guard rationale.
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (file == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
@@ -158,11 +186,21 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file,
     std::set<std::string> column_names;
     for (int i = 0; i < schema->column_num; i++) {
         ColumnSchema cur_schema = schema->column_schemas[i];
-        if (column_names.find(cur_schema.column_name) == column_names.end()) {
+        // Reject only when the name has already been seen.  The previous
+        // condition was inverted, so the first column (always a fresh name)
+        // was rejected as a duplicate and this constructor was effectively
+        // unusable — tsfile_writer_new()'s loop above has the correct check
+        // for comparison.
+        if (column_names.find(cur_schema.column_name) != column_names.end()) {
             *err_code = common::E_INVALID_SCHEMA;
             return nullptr;
         }
         column_names.insert(cur_schema.column_name);
+        if (cur_schema.column_category == TAG &&
+            cur_schema.data_type != TS_DATATYPE_STRING) {
+            *err_code = common::E_INVALID_SCHEMA;
+            return nullptr;
+        }
         column_schemas.emplace_back(
             cur_schema.column_name,
             static_cast<common::TSDataType>(cur_schema.data_type),
@@ -1210,6 +1248,8 @@ ERRNO populate_c_metadata_map_from_cpp(
             if (m.measurement_name == nullptr) {
                 for (uint32_t u = 0; u < slot; u++) {
                     free_timeseries_statistic_heap(&e.timeseries[u].statistic);
+                    free_timeseries_statistic_heap(
+                        &e.timeseries[u].timeline_statistic);
                     free(e.timeseries[u].measurement_name);
                 }
                 free(e.timeseries);
@@ -1470,6 +1510,13 @@ Tablet _tablet_new_with_target_name(const char* device_id,
 }
 
 ERRNO _tsfile_writer_register_table(TsFileWriter writer, TableSchema* schema) {
+    if (writer == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        return common::E_INVALID_ARG;
+    }
+    if (schema->column_num <= 0) {
+        return common::E_INVALID_SCHEMA;
+    }
     std::vector<storage::MeasurementSchema*> measurement_schemas;
     std::vector<common::ColumnCategory> column_categories;
     measurement_schemas.resize(schema->column_num);
@@ -1592,13 +1639,50 @@ ResultSet _tsfile_reader_query_device(TsFileReader reader,
     return qds;
 }
 
-// ---------- Tag Filter API ----------
+// ============== Tag Filter API Implementation ==============
+
+// Helper macro to avoid repetition in tag filter factory functions.
+// The shared_ptr must stay alive while TagFilterBuilder accesses the schema.
+// Every C-API entry must validate its pointers: a null reader would deref
+// during the static_cast, and null table/column/value would feed std::string
+// a null pointer (UB / crash).
+#define DEFINE_TAG_FILTER_FACTORY(name, method)                               \
+    TagFilterHandle tsfile_tag_filter_##name(                                 \
+        TsFileReader reader, const char* table_name, const char* column_name, \
+        const char* value) {                                                  \
+        if (reader == nullptr || table_name == nullptr ||                     \
+            column_name == nullptr || value == nullptr) {                     \
+            return nullptr;                                                   \
+        }                                                                     \
+        auto* r = static_cast<storage::TsFileReader*>(reader);                \
+        auto schema = r->get_table_schema(table_name);                        \
+        if (!schema) return nullptr;                                          \
+        storage::TagFilterBuilder builder(schema.get());                      \
+        return builder.method(column_name, value);                            \
+    }
+
+DEFINE_TAG_FILTER_FACTORY(eq, eq)
+DEFINE_TAG_FILTER_FACTORY(neq, neq)
+DEFINE_TAG_FILTER_FACTORY(lt, lt)
+DEFINE_TAG_FILTER_FACTORY(lteq, lteq)
+DEFINE_TAG_FILTER_FACTORY(gt, gt)
+DEFINE_TAG_FILTER_FACTORY(gteq, gteq)
+
+#undef DEFINE_TAG_FILTER_FACTORY
 
 TagFilterHandle tsfile_tag_filter_create(TsFileReader reader,
                                          const char* table_name,
                                          const char* column_name,
                                          const char* value, TagFilterOp op,
                                          ERRNO* err_code) {
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (reader == nullptr || table_name == nullptr || column_name == nullptr ||
+        value == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     auto* r = static_cast<storage::TsFileReader*>(reader);
     auto schema = r->get_table_schema(table_name);
     if (!schema) {
@@ -1667,25 +1751,30 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader,
 
 TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left,
                                       TagFilterHandle right) {
-    return static_cast<void*>(storage::TagFilterBuilder::and_filter(
+    if (!left || !right) return nullptr;
+    return storage::TagFilterBuilder::and_filter(
         static_cast<storage::Filter*>(left),
-        static_cast<storage::Filter*>(right)));
+        static_cast<storage::Filter*>(right));
 }
 
 TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left,
                                      TagFilterHandle right) {
-    return static_cast<void*>(storage::TagFilterBuilder::or_filter(
+    if (!left || !right) return nullptr;
+    return storage::TagFilterBuilder::or_filter(
         static_cast<storage::Filter*>(left),
-        static_cast<storage::Filter*>(right)));
+        static_cast<storage::Filter*>(right));
 }
 
 TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter) {
-    return static_cast<void*>(storage::TagFilterBuilder::not_filter(
-        static_cast<storage::Filter*>(filter)));
+    if (!filter) return nullptr;
+    return storage::TagFilterBuilder::not_filter(
+        static_cast<storage::Filter*>(filter));
 }
 
 void tsfile_tag_filter_free(TagFilterHandle filter) {
-    delete static_cast<storage::Filter*>(filter);
+    if (filter) {
+        delete static_cast<storage::Filter*>(filter);
+    }
 }
 
 ResultSet tsfile_query_table_with_tag_filter(
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.h b/cpp/src/cwrapper/tsfile_cwrapper.h
index 4471da89e..768aec962 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.h
+++ b/cpp/src/cwrapper/tsfile_cwrapper.h
@@ -908,32 +908,68 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader,
                                           bool is_not, ERRNO* err_code);
 
 /**
- * @brief Combine two tag filters with AND.
+ * @brief Create a tag equality filter: column == value.
+ *
+ * @param reader [in] Valid TsFileReader handle (used to resolve column index).
+ * @param table_name [in] Target table name.
+ * @param column_name [in] Tag column name.
+ * @param value [in] Value to compare against.
+ * @return TagFilterHandle on success, NULL on failure.
+ */
+TagFilterHandle tsfile_tag_filter_eq(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_neq(TsFileReader reader,
+                                      const char* table_name,
+                                      const char* column_name,
+                                      const char* value);
+
+TagFilterHandle tsfile_tag_filter_lt(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_lteq(TsFileReader reader,
+                                       const char* table_name,
+                                       const char* column_name,
+                                       const char* value);
+
+TagFilterHandle tsfile_tag_filter_gt(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_gteq(TsFileReader reader,
+                                       const char* table_name,
+                                       const char* column_name,
+                                       const char* value);
+
+/**
+ * @brief Logical AND of two tag filters. Takes ownership of left and right.
  */
 TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left,
                                       TagFilterHandle right);
 
 /**
- * @brief Combine two tag filters with OR.
+ * @brief Logical OR of two tag filters. Takes ownership of left and right.
  */
 TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left,
                                      TagFilterHandle right);
 
 /**
- * @brief Negate a tag filter.
+ * @brief Logical NOT of a tag filter. Takes ownership of filter.
  */
 TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter);
 
 /**
- * @brief Free a tag filter and all its children.
+ * @brief Free a tag filter handle.
  */
 void tsfile_tag_filter_free(TagFilterHandle filter);
 
 /**
- * @brief Query table with tag filter.
- *
- * @param batch_size <= 0 means row-by-row return mode,
- *                   > 0 means return TsBlock with the specified block size.
+ * @brief Batch query with tag filter support.
  */
 ResultSet tsfile_query_table_with_tag_filter(
     TsFileReader reader, const char* table_name, char** columns,
diff --git a/cpp/src/encoding/decoder.h b/cpp/src/encoding/decoder.h
index c290b5791..24455ca01 100644
--- a/cpp/src/encoding/decoder.h
+++ b/cpp/src/encoding/decoder.h
@@ -21,6 +21,7 @@
 #define ENCODING_DECODER_H
 
 #include "common/allocator/byte_stream.h"
+#include "common/db_common.h"
 
 namespace storage {
 
@@ -37,6 +38,140 @@ class Decoder {
     virtual int read_double(double& ret_value, common::ByteStream& in) = 0;
     virtual int read_String(common::String& ret_value, common::PageArena& pa,
                             common::ByteStream& in) = 0;
+
+    virtual int read_batch_int32(int32_t* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        int32_t val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_int32(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_int64(int64_t* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        int64_t val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_int64(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_float(float* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        float val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_float(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_double(double* out, int capacity, int& actual,
+                                  common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        double val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_double(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_int32(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        int32_t dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_int32(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_int64(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        int64_t dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_int64(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_float(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        float dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_float(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_double(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        double dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_double(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    // Block-level filter check: peek the next block header and compute
+    // the value range [block_min, block_max] without decoding.
+    // Returns true if a block was peeked; false if not supported or no data.
+    // After peeking, caller must either:
+    //   - Call skip_peeked_block_int64() to skip the block
+    //   - Call read_batch_int64() which will use the peeked header
+    virtual bool peek_next_block_range_int64(common::ByteStream& in,
+                                             int64_t& block_min,
+                                             int64_t& block_max,
+                                             int& block_count) {
+        return false;
+    }
+
+    // Skip the block whose header was already consumed by peek.
+    virtual int skip_peeked_block_int64(common::ByteStream& in, int& skipped) {
+        return common::E_NOT_SUPPORT;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/dictionary_encoder.h b/cpp/src/encoding/dictionary_encoder.h
index be5f78a09..8f7c495c4 100644
--- a/cpp/src/encoding/dictionary_encoder.h
+++ b/cpp/src/encoding/dictionary_encoder.h
@@ -83,7 +83,12 @@ class DictionaryEncoder : public Encoder {
         if (entry_index_.count(value) == 0) {
             index_entry_.push_back(value);
             map_size_ = map_size_ + value.length();
-            entry_index_[value] = static_cast<int>(index_entry_.size()) - 1;
+            // Compute the index before the insert: LHS/RHS evaluation order of
+            // `m[k] = m.size()` is unspecified before C++17, so a compiler
+            // that evaluates the LHS first would store size()+1 and corrupt
+            // the dictionary.
+            const int new_idx = static_cast<int>(index_entry_.size()) - 1;
+            entry_index_[value] = new_idx;
         }
         values_encoder_.encode(entry_index_[value], out);
         return common::E_OK;
diff --git a/cpp/src/encoding/encoder.h b/cpp/src/encoding/encoder.h
index 921686446..386129f6e 100644
--- a/cpp/src/encoding/encoder.h
+++ b/cpp/src/encoding/encoder.h
@@ -48,6 +48,81 @@ class Encoder {
      * @return the maximal size of possible memory occupied by current encoder
      */
     virtual int get_max_byte_size() = 0;
+
+    /*
+     * Batch encoding interfaces.
+     * Default implementations fall back to per-value encode().
+     * Subclasses may override for better performance.
+     */
+    virtual int encode_batch(const bool* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const int32_t* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const int64_t* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const float* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const double* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    // Batch encode strings from a contiguous buffer with offset array
+    // (Arrow-style layout from Tablet::StringColumn).
+    // string[i] = buffer + offsets[start_idx + i], length = offsets[start_idx +
+    // i + 1] - offsets[start_idx + i].
+    virtual int encode_string_batch(const char* buffer, const uint32_t* offsets,
+                                    uint32_t start_idx, uint32_t count,
+                                    common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t idx = start_idx + i;
+            uint32_t len = offsets[idx + 1] - offsets[idx];
+            common::String val(buffer + offsets[idx], len);
+            if (RET_FAIL(encode(val, out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/gorilla_decoder.h b/cpp/src/encoding/gorilla_decoder.h
index 5684561aa..e1e490105 100644
--- a/cpp/src/encoding/gorilla_decoder.h
+++ b/cpp/src/encoding/gorilla_decoder.h
@@ -30,6 +30,163 @@
 
 namespace storage {
 
+// ── Raw-pointer bit reader ────────────────────────────────────────────────
+// Operates directly on a contiguous byte array, bypassing ByteStream's
+// per-byte read_buf() overhead (atomic loads, page boundary checks, memcpy).
+
+struct GorillaBitReader {
+    const uint8_t* data;
+    uint32_t pos;       // next byte index to load
+    uint32_t data_len;  // total bytes
+    int bits;           // remaining bits in cur_byte (0..8)
+    uint8_t cur_byte;
+    // Set once a load was attempted on an empty input, or once read_bit /
+    // read_long ran out of bits mid-value.  Without this, a truncated page
+    // would spin read_long() forever (bits stays 0, n -= 0 makes no
+    // progress) and read_bit() would execute a negative shift via
+    // (cur_byte >> (bits - 1)).
+    bool exhausted = false;
+
+    FORCE_INLINE void load_byte_if_empty() {
+        if (bits == 0) {
+            if (pos < data_len) {
+                cur_byte = data[pos++];
+                bits = 8;
+            } else {
+                exhausted = true;
+            }
+        }
+    }
+
+    FORCE_INLINE bool read_bit() {
+        if (UNLIKELY(bits == 0)) {
+            exhausted = true;
+            return false;
+        }
+        bool bit = ((cur_byte >> (bits - 1)) & 1) == 1;
+        bits--;
+        load_byte_if_empty();
+        return bit;
+    }
+
+    FORCE_INLINE int64_t read_long(int n) {
+        int64_t value = 0;
+        while (n > 0) {
+            if (UNLIKELY(bits == 0)) {
+                // Input drained mid-value; bail so the outer loop in
+                // read_control_bits / batch_decode_raw doesn't spin.
+                exhausted = true;
+                return value;
+            }
+            if (n > bits || n == 8) {
+                value = (value << bits) + (cur_byte & ((1 << bits) - 1));
+                n -= bits;
+                bits = 0;
+            } else {
+                value =
+                    (value << n) + ((cur_byte >> (bits - n)) & ((1 << n) - 1));
+                bits -= n;
+                n = 0;
+            }
+            load_byte_if_empty();
+        }
+        return value;
+    }
+
+    FORCE_INLINE uint8_t read_control_bits(int max_bits) {
+        uint8_t value = 0x00;
+        for (int i = 0; i < max_bits; i++) {
+            value <<= 1;
+            if (exhausted) break;
+            if (read_bit()) {
+                value |= 0x01;
+            } else {
+                break;
+            }
+        }
+        return value;
+    }
+};
+
+// ── Templated raw-pointer decode helpers ──────────────────────────────────
+
+template <typename T>
+struct GorillaRawOps {
+    static FORCE_INLINE T read_next(GorillaBitReader& r, T& stored_value,
+                                    int& stored_leading_zeros,
+                                    int& stored_trailing_zeros);
+};
+
+template <>
+struct GorillaRawOps<int32_t> {
+    static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_32BIT;
+
+    static FORCE_INLINE int32_t read_next(GorillaBitReader& r,
+                                          int32_t& stored_value,
+                                          int& stored_leading_zeros,
+                                          int& stored_trailing_zeros) {
+        uint8_t ctrl = r.read_control_bits(2);
+        switch (ctrl) {
+            case 3: {
+                stored_leading_zeros =
+                    (int)r.read_long(LEADING_ZERO_BITS_LENGTH_32BIT);
+                uint8_t sig =
+                    (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_32BIT);
+                sig++;
+                stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros;
+            }
+            // fallthrough
+            case 2: {
+                int32_t xor_value = (int32_t)r.read_long(
+                    VALUE_BITS - stored_leading_zeros - stored_trailing_zeros);
+                xor_value = static_cast<uint32_t>(xor_value)
+                            << stored_trailing_zeros;
+                stored_value ^= xor_value;
+            }
+            // fallthrough
+            default:
+                return stored_value;
+        }
+        return stored_value;
+    }
+};
+
+template <>
+struct GorillaRawOps<int64_t> {
+    static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_64BIT;
+
+    static FORCE_INLINE int64_t read_next(GorillaBitReader& r,
+                                          int64_t& stored_value,
+                                          int& stored_leading_zeros,
+                                          int& stored_trailing_zeros) {
+        uint8_t ctrl = r.read_control_bits(2);
+        switch (ctrl) {
+            case 3: {
+                stored_leading_zeros =
+                    (int)r.read_long(LEADING_ZERO_BITS_LENGTH_64BIT);
+                uint8_t sig =
+                    (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_64BIT);
+                sig++;
+                stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros;
+            }
+            // fallthrough
+            case 2: {
+                int64_t xor_value = r.read_long(
+                    VALUE_BITS - stored_leading_zeros - stored_trailing_zeros);
+                xor_value = static_cast<uint64_t>(xor_value)
+                            << stored_trailing_zeros;
+                stored_value ^= xor_value;
+            }
+            // fallthrough
+            default:
+                return stored_value;
+        }
+        return stored_value;
+    }
+};
+
+// ──────────────────────────────────────────────────────────────────────────
+
 template <typename T>
 class GorillaDecoder : public Decoder {
    public:
@@ -127,6 +284,197 @@ class GorillaDecoder : public Decoder {
     int read_String(common::String& ret_value, common::PageArena& pa,
                     common::ByteStream& in) override;
 
+    // Batch overrides — declared here, defined after template specializations
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int read_batch_int64(int64_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override;
+    int skip_int64(int count, int& skipped, common::ByteStream& in) override;
+
+   protected:
+    // ── Batch decode using raw pointer (bypasses ByteStream) ─────────────
+    // The decode() contract:
+    //   stored_value_ holds the "next" value to be returned.
+    //   decode() returns stored_value_, then advances via cache_next().
+    //   has_next_==false means the ending sentinel was hit.
+    //
+    // batch_decode_raw replicates this logic using GorillaBitReader on the
+    // wrapped contiguous buffer, then syncs state back to ByteStream.
+    int batch_decode_raw(T* out, int capacity, int& actual, T ending,
+                         common::ByteStream& in) {
+        int ret = common::E_OK;
+        actual = 0;
+        // Bootstrap below would unconditionally write out[0]; guard the
+        // zero-capacity edge case so callers can probe without writing.
+        if (capacity <= 0) {
+            return common::E_OK;
+        }
+        if (!in.is_wrapped()) {
+            return batch_decode_fallback(out, capacity, actual, ending, in);
+        }
+
+        const uint8_t* base =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        // Gorilla pages are bounded by the page-writer cap (well below 4 GiB),
+        // so saturating to uint32_t is safe and matches GorillaBitReader's
+        // 32-bit cursor.
+        uint32_t remain = static_cast<uint32_t>(
+            std::min<uint64_t>(in.remaining_size(), UINT32_MAX));
+
+        GorillaBitReader r;
+        r.data = base;
+        r.pos = 0;
+        r.data_len = remain;
+        r.bits = bits_left_;
+        r.cur_byte = buffer_;
+
+        // Bootstrap first value if needed (mirrors decode()'s first-call path)
+        if (UNLIKELY(!first_value_was_read_)) {
+            if (r.bits == 0 && r.pos >= r.data_len) goto done;
+            r.load_byte_if_empty();
+            stored_value_ = (T)r.read_long(GorillaRawOps<T>::VALUE_BITS);
+            if (UNLIKELY(r.exhausted)) {
+                // Page truncated before the first value finished; refuse to
+                // emit a partially-decoded sentinel.
+                first_value_was_read_ = false;
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            first_value_was_read_ = true;
+            // Save the first value before cache_next mutates stored_value_
+            T first_value = stored_value_;
+            // cache_next: read_next then check ending
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            } else {
+                has_next_ = true;
+            }
+            // Output the first value
+            out[actual++] = first_value;
+            if (!has_next_ || actual >= capacity) goto done;
+        }
+
+        // Main batch loop
+        while (actual < capacity && has_next_) {
+            out[actual++] = stored_value_;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            }
+        }
+
+    done:
+        // Sync bit-reader state back
+        buffer_ = r.cur_byte;
+        bits_left_ = r.bits;
+        in.wrapped_buf_advance_read_pos(r.pos);
+        return ret;
+    }
+
+    int batch_skip_raw(int count, int& skipped, T ending,
+                       common::ByteStream& in) {
+        int ret = common::E_OK;
+        skipped = 0;
+        // Bootstrap below would consume first_value_ even when count == 0,
+        // advancing the stream past data the caller didn't ask to skip.
+        if (count <= 0) {
+            return common::E_OK;
+        }
+        if (!in.is_wrapped()) {
+            return batch_skip_fallback(count, skipped, ending, in);
+        }
+
+        const uint8_t* base =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        // Same saturation as batch_decode_raw: GorillaBitReader is 32-bit
+        // internally; pages are well under 4 GiB.
+        uint32_t remain = static_cast<uint32_t>(
+            std::min<uint64_t>(in.remaining_size(), UINT32_MAX));
+
+        GorillaBitReader r;
+        r.data = base;
+        r.pos = 0;
+        r.data_len = remain;
+        r.bits = bits_left_;
+        r.cur_byte = buffer_;
+
+        if (UNLIKELY(!first_value_was_read_)) {
+            if (r.bits == 0 && r.pos >= r.data_len) goto done;
+            r.load_byte_if_empty();
+            stored_value_ = (T)r.read_long(GorillaRawOps<T>::VALUE_BITS);
+            if (UNLIKELY(r.exhausted)) {
+                first_value_was_read_ = false;
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            first_value_was_read_ = true;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            } else {
+                has_next_ = true;
+            }
+            // The first value counts as one skip
+            skipped++;
+            if (!has_next_ || skipped >= count) goto done;
+        }
+
+        while (skipped < count && has_next_) {
+            skipped++;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            }
+        }
+
+    done:
+        buffer_ = r.cur_byte;
+        bits_left_ = r.bits;
+        in.wrapped_buf_advance_read_pos(r.pos);
+        return ret;
+    }
+
+    int batch_decode_fallback(T* out, int capacity, int& actual, T ending,
+                              common::ByteStream& in) {
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            out[actual++] = decode(in);
+        }
+        return common::E_OK;
+    }
+
+    int batch_skip_fallback(int count, int& skipped, T ending,
+                            common::ByteStream& in) {
+        skipped = 0;
+        while (skipped < count && has_remaining(in)) {
+            decode(in);
+            skipped++;
+        }
+        return common::E_OK;
+    }
+
    public:
     common::TSEncoding type_;
     T stored_value_;
@@ -254,18 +602,18 @@ FORCE_INLINE int64_t GorillaDecoder<int64_t>::decode(common::ByteStream& in) {
 
 class FloatGorillaDecoder : public GorillaDecoder<int32_t> {
    public:
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
 
     float decode(common::ByteStream& in) {
         int32_t value_int = GorillaDecoder<int32_t>::decode(in);
         return common::int_to_float(value_int);
     }
 
-    int32_t cache_next(common::ByteStream& in) {
+    int32_t cache_next(common::ByteStream& in) override {
         read_next(in);
         if (stored_value_ ==
             common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT)) {
@@ -273,22 +621,46 @@ class FloatGorillaDecoder : public GorillaDecoder<int32_t> {
         }
         return stored_value_;
     }
+
+    int read_batch_float(float* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT);
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            int32_t buf[129];
+            int batch = std::min(129, capacity - actual);
+            int buf_actual = 0;
+            int ret = batch_decode_raw(buf, batch, buf_actual, ending, in);
+            if (ret != common::E_OK) return ret;
+            if (buf_actual == 0) break;
+            for (int i = 0; i < buf_actual; i++) {
+                out[actual + i] = common::int_to_float(buf[i]);
+            }
+            actual += buf_actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_float(int count, int& skipped, common::ByteStream& in) override {
+        int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT);
+        return batch_skip_raw(count, skipped, ending, in);
+    }
 };
 
 class DoubleGorillaDecoder : public GorillaDecoder<int64_t> {
    public:
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
 
     double decode(common::ByteStream& in) {
         int64_t value_long = GorillaDecoder<int64_t>::decode(in);
         return common::long_to_double(value_long);
     }
 
-    int64_t cache_next(common::ByteStream& in) {
+    int64_t cache_next(common::ByteStream& in) override {
         read_next(in);
         if (stored_value_ ==
             common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE)) {
@@ -296,12 +668,88 @@ class DoubleGorillaDecoder : public GorillaDecoder<int64_t> {
         }
         return stored_value_;
     }
+
+    int read_batch_double(double* out, int capacity, int& actual,
+                          common::ByteStream& in) override {
+        int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE);
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            int64_t buf[129];
+            int batch = std::min(129, capacity - actual);
+            int buf_actual = 0;
+            int ret = batch_decode_raw(buf, batch, buf_actual, ending, in);
+            if (ret != common::E_OK) return ret;
+            if (buf_actual == 0) break;
+            for (int i = 0; i < buf_actual; i++) {
+                out[actual + i] = common::long_to_double(buf[i]);
+            }
+            actual += buf_actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_double(int count, int& skipped, common::ByteStream& in) override {
+        int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE);
+        return batch_skip_raw(count, skipped, ending, in);
+    }
 };
 
 typedef GorillaDecoder<int32_t> IntGorillaDecoder;
 typedef GorillaDecoder<int64_t> LongGorillaDecoder;
 
-// wrap as Decoder interface
+// ── IntGorillaDecoder batch/skip overrides ─────────────────────────────────
+template <>
+inline int GorillaDecoder<int32_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return batch_decode_raw(out, capacity, actual,
+                            GORILLA_ENCODING_ENDING_INTEGER, in);
+}
+template <>
+inline int GorillaDecoder<int32_t>::read_batch_int64(int64_t*, int, int& actual,
+                                                     common::ByteStream&) {
+    actual = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int32_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_INTEGER, in);
+}
+template <>
+inline int GorillaDecoder<int32_t>::skip_int64(int, int& skipped,
+                                               common::ByteStream&) {
+    skipped = 0;
+    return common::E_NOT_SUPPORT;
+}
+
+// ── LongGorillaDecoder batch/skip overrides ───────────────────────────────
+template <>
+inline int GorillaDecoder<int64_t>::read_batch_int32(int32_t*, int, int& actual,
+                                                     common::ByteStream&) {
+    actual = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int64_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return batch_decode_raw(out, capacity, actual, GORILLA_ENCODING_ENDING_LONG,
+                            in);
+}
+template <>
+inline int GorillaDecoder<int64_t>::skip_int32(int, int& skipped,
+                                               common::ByteStream&) {
+    skipped = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int64_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_LONG, in);
+}
+
+// ── Scalar Decoder interface wrappers (unchanged) ─────────────────────────
 template <>
 FORCE_INLINE int IntGorillaDecoder::read_boolean(bool& ret_value,
                                                  common::ByteStream& in) {
diff --git a/cpp/src/encoding/plain_decoder.h b/cpp/src/encoding/plain_decoder.h
index c2627f71d..3e83cfc76 100644
--- a/cpp/src/encoding/plain_decoder.h
+++ b/cpp/src/encoding/plain_decoder.h
@@ -20,10 +20,47 @@
 #ifndef ENCODING_PLAIN_DECODER_H
 #define ENCODING_PLAIN_DECODER_H
 
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#include <stdlib.h>
+#endif
+
 #include "encoding/decoder.h"
 
 namespace storage {
 
+FORCE_INLINE uint32_t plain_bswap32(uint32_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap32(v);
+#elif defined(_MSC_VER)
+    return _byteswap_ulong(v);
+#else
+    return ((v & 0x000000FFu) << 24) | ((v & 0x0000FF00u) << 8) |
+           ((v & 0x00FF0000u) >> 8) | ((v & 0xFF000000u) >> 24);
+#endif
+}
+
+FORCE_INLINE uint64_t plain_bswap64(uint64_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap64(v);
+#elif defined(_MSC_VER)
+    return _byteswap_uint64(v);
+#else
+    return ((v & 0x00000000000000FFull) << 56) |
+           ((v & 0x000000000000FF00ull) << 40) |
+           ((v & 0x0000000000FF0000ull) << 24) |
+           ((v & 0x00000000FF000000ull) << 8) |
+           ((v & 0x000000FF00000000ull) >> 8) |
+           ((v & 0x0000FF0000000000ull) >> 24) |
+           ((v & 0x00FF000000000000ull) >> 40) |
+           ((v & 0xFF00000000000000ull) >> 56);
+#endif
+}
+
 class PlainDecoder : public Decoder {
    public:
     ~PlainDecoder() override = default;
@@ -62,6 +99,113 @@ class PlainDecoder : public Decoder {
                                  common::ByteStream& in) override {
         return common::SerializationUtil::read_mystring(ret_String, &pa, in);
     }
+
+    // ── Batch overrides ──────────────────────────────────────────────────────
+    //
+    // INT32: PLAIN encoding uses varint (variable stride).  Override to avoid
+    // virtual dispatch per element; actual decode is still per-value.
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        actual = 0;
+        while (actual < capacity && in.has_remaining()) {
+            int ret = common::SerializationUtil::read_var_int(out[actual], in);
+            if (ret != common::E_OK) return ret;
+            ++actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override {
+        skipped = 0;
+        int32_t dummy;
+        while (skipped < count && in.has_remaining()) {
+            int ret = common::SerializationUtil::read_var_int(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    // Fixed-stride INT64 / FLOAT / DOUBLE share the same shape: when the
+    // ByteStream is wrapped (contiguous buf), advance the read pointer in one
+    // step and byte-swap in place; otherwise fall back to per-value reads.
+    // The macros below expand into one override per type.
+#define PLAIN_SKIP_FIXED(NAME, T, STRIDE, READ_ONE)                         \
+    int NAME(int count, int& skipped, common::ByteStream& in) override {    \
+        skipped = 0;                                                        \
+        if (!in.is_wrapped()) {                                             \
+            T dummy;                                                        \
+            while (skipped < count && in.has_remaining()) {                 \
+                int ret = READ_ONE(dummy, in);                              \
+                if (ret != common::E_OK) {                                  \
+                    return ret;                                             \
+                }                                                           \
+                ++skipped;                                                  \
+            }                                                               \
+            return common::E_OK;                                            \
+        }                                                                   \
+        skipped = static_cast<int>(std::min<uint32_t>(                      \
+            in.remaining_size() / (STRIDE), static_cast<uint32_t>(count))); \
+        if (skipped <= 0) {                                                 \
+            skipped = 0;                                                    \
+            return common::E_OK;                                            \
+        }                                                                   \
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(skipped) *    \
+                                        (STRIDE));                          \
+        return common::E_OK;                                                \
+    }
+
+#define PLAIN_READ_BATCH_FIXED(NAME, T, U, STRIDE, READ_ONE, BSWAP)            \
+    int NAME(T* out, int capacity, int& actual, common::ByteStream& in)        \
+        override {                                                             \
+        actual = 0;                                                            \
+        if (!in.is_wrapped()) {                                                \
+            while (actual < capacity && in.has_remaining()) {                  \
+                int ret = READ_ONE(out[actual], in);                           \
+                if (ret != common::E_OK) {                                     \
+                    return ret;                                                \
+                }                                                              \
+                ++actual;                                                      \
+            }                                                                  \
+            return common::E_OK;                                               \
+        }                                                                      \
+        int n = static_cast<int>(std::min<uint32_t>(                           \
+            in.remaining_size() / (STRIDE), static_cast<uint32_t>(capacity))); \
+        if (n <= 0) {                                                          \
+            return common::E_OK;                                               \
+        }                                                                      \
+        const uint8_t* src =                                                   \
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();              \
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(n) * (STRIDE));  \
+        actual = n;                                                            \
+        for (int i = 0; i < n; ++i) {                                          \
+            U v;                                                               \
+            memcpy(&v, src + i * (STRIDE), (STRIDE));                          \
+            v = BSWAP(v);                                                      \
+            memcpy(&out[i], &v, (STRIDE));                                     \
+        }                                                                      \
+        return common::E_OK;                                                   \
+    }
+
+    PLAIN_SKIP_FIXED(skip_int64, int64_t, 8,
+                     common::SerializationUtil::read_i64)
+    PLAIN_SKIP_FIXED(skip_float, float, 4,
+                     common::SerializationUtil::read_float)
+    PLAIN_SKIP_FIXED(skip_double, double, 8,
+                     common::SerializationUtil::read_double)
+
+    PLAIN_READ_BATCH_FIXED(read_batch_int64, int64_t, uint64_t, 8,
+                           common::SerializationUtil::read_i64, plain_bswap64)
+    PLAIN_READ_BATCH_FIXED(read_batch_float, float, uint32_t, 4,
+                           common::SerializationUtil::read_float, plain_bswap32)
+    PLAIN_READ_BATCH_FIXED(read_batch_double, double, uint64_t, 8,
+                           common::SerializationUtil::read_double,
+                           plain_bswap64)
+
+#undef PLAIN_SKIP_FIXED
+#undef PLAIN_READ_BATCH_FIXED
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/plain_encoder.h b/cpp/src/encoding/plain_encoder.h
index b768c9bf0..84ebee238 100644
--- a/cpp/src/encoding/plain_encoder.h
+++ b/cpp/src/encoding/plain_encoder.h
@@ -20,50 +20,221 @@
 #ifndef ENCODING_PLAIN_ENCODER_H
 #define ENCODING_PLAIN_ENCODER_H
 
+#include <cstring>
+
 #include "encoder.h"
 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define TSFILE_HAS_NEON 1
+#endif
+
 namespace storage {
 
 class PlainEncoder : public Encoder {
    public:
     PlainEncoder() {}
     ~PlainEncoder() { destroy(); }
-    void destroy() { /* do nothing for PlainEncoder */
+    void destroy() override { /* do nothing for PlainEncoder */
     }
-    void reset() { /* do thing for PlainEncoder */
+    void reset() override { /* do thing for PlainEncoder */
     }
 
-    FORCE_INLINE int encode(bool value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(bool value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_i8(value ? 1 : 0, out_stream);
     }
 
-    FORCE_INLINE int encode(int32_t value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(int32_t value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_var_int(value, out_stream);
     }
 
-    FORCE_INLINE int encode(int64_t value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(int64_t value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_i64(value, out_stream);
     }
 
-    FORCE_INLINE int encode(float value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(float value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_float(value, out_stream);
     }
 
-    FORCE_INLINE int encode(double value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(double value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_double(value, out_stream);
     }
 
     FORCE_INLINE int encode(common::String value,
-                            common::ByteStream& out_stream) {
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_mystring(value, out_stream);
     }
 
-    int flush(common::ByteStream& out_stream) {
+    int flush(common::ByteStream& out_stream) override {
         // do nothing for PlainEncoder
         return common::E_OK;
     }
 
-    int get_max_byte_size() { return 0; }
+    int get_max_byte_size() override { return 0; }
+
+    // Optimized batch encoding: directly byte-swap into ByteStream page buffer.
+    // Avoids per-value write_buf overhead entirely — only calls acquire_buf()
+    // once per page boundary crossing.
+    int encode_batch(const int64_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            // How many int64 values fit in the remaining page space?
+            uint32_t capacity = buf.len_ / 8;
+            if (capacity == 0) {
+                // Page has < 8 bytes left, fall back to write_buf for this one
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const int64_t* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON: byte-reverse 2 x int64 per iteration
+            for (; i + 2 <= batch; i += 2) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev64q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            // Scalar tail
+            for (; i < batch; i++) {
+                uint64_t v = (uint64_t)src[i];
+                dst[0] = (uint8_t)(v >> 56);
+                dst[1] = (uint8_t)(v >> 48);
+                dst[2] = (uint8_t)(v >> 40);
+                dst[3] = (uint8_t)(v >> 32);
+                dst[4] = (uint8_t)(v >> 24);
+                dst[5] = (uint8_t)(v >> 16);
+                dst[6] = (uint8_t)(v >> 8);
+                dst[7] = (uint8_t)(v);
+                dst += 8;
+            }
+            out_stream.buffer_used(batch * 8);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    int encode_batch(const double* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            uint32_t capacity = buf.len_ / 8;
+            if (capacity == 0) {
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const double* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON byte-reverse of raw bytes works for double bits too.
+            for (; i + 2 <= batch; i += 2) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev64q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            // Scalar tail: round-trip the bits via memcpy to avoid the
+            // strict-aliasing violation of reading a double through an
+            // int64_t* (the old reinterpret_cast dispatch).
+            for (; i < batch; i++) {
+                uint64_t v;
+                memcpy(&v, &src[i], sizeof(double));
+                dst[0] = (uint8_t)(v >> 56);
+                dst[1] = (uint8_t)(v >> 48);
+                dst[2] = (uint8_t)(v >> 40);
+                dst[3] = (uint8_t)(v >> 32);
+                dst[4] = (uint8_t)(v >> 24);
+                dst[5] = (uint8_t)(v >> 16);
+                dst[6] = (uint8_t)(v >> 8);
+                dst[7] = (uint8_t)(v);
+                dst += 8;
+            }
+            out_stream.buffer_used(batch * 8);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    int encode_batch(const float* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            uint32_t capacity = buf.len_ / 4;
+            if (capacity == 0) {
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const float* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON: byte-reverse 4 x float (32-bit) per iteration
+            for (; i + 4 <= batch; i += 4) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev32q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            for (; i < batch; i++) {
+                uint32_t v;
+                memcpy(&v, &src[i], sizeof(float));
+                dst[0] = (uint8_t)(v >> 24);
+                dst[1] = (uint8_t)(v >> 16);
+                dst[2] = (uint8_t)(v >> 8);
+                dst[3] = (uint8_t)(v);
+                dst += 4;
+            }
+            out_stream.buffer_used(batch * 4);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    // Batch encode strings from Arrow-style offset+buffer layout.
+    // Each string is serialized as: var_int(len) + raw bytes.
+    int encode_string_batch(const char* buffer, const uint32_t* offsets,
+                            uint32_t start_idx, uint32_t count,
+                            common::ByteStream& out_stream) override {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t idx = start_idx + i;
+            uint32_t len = offsets[idx + 1] - offsets[idx];
+            if (RET_FAIL(common::SerializationUtil::write_var_int(
+                    (int32_t)len, out_stream))) {
+                return ret;
+            }
+            if (len > 0) {
+                if (RET_FAIL(
+                        out_stream.write_buf(buffer + offsets[idx], len))) {
+                    return ret;
+                }
+            }
+        }
+        return ret;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/ts2diff_decoder.h b/cpp/src/encoding/ts2diff_decoder.h
index f37001003..bc6e89613 100644
--- a/cpp/src/encoding/ts2diff_decoder.h
+++ b/cpp/src/encoding/ts2diff_decoder.h
@@ -24,6 +24,7 @@
 
 #include <cmath>
 #include <cstddef>
+#include <cstring>
 #include <vector>
 
 #include "common/allocator/alloc_base.h"
@@ -31,8 +32,174 @@
 #include "decoder.h"
 #include "utils/util_define.h"
 
+#ifdef ENABLE_SIMD
+#include "simde/x86/avx2.h"
+#endif
+
 namespace storage {
 
+// ============================================================================
+// SIMD batch decode helpers (INT32)
+// ============================================================================
+#ifdef ENABLE_SIMD
+
+// Decode 4 INT32 values from bit-packed data using SIMD gather + shift.
+// @in:        pointer to the start of packed bit data for the block
+// @bit_width: bits per delta value
+// @delta_min: minimum delta offset for this block
+// @index:     current position within the block (0-based, among write_index_
+//             deltas)
+// @base:      the previous reconstructed value (for prefix-sum)
+// @out:       output array (4 values written)
+// Returns:    the last reconstructed value (new base for next group)
+static inline int32_t simd_decode_4_i32(const uint8_t* in, int32_t bit_width,
+                                        int32_t delta_min, int32_t index,
+                                        int32_t base, int32_t out[4]) {
+    static const simde__m128i SHUF_REV4 = simde_mm_setr_epi8(
+        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+
+    const simde__m128i VMIN4 = simde_mm_set1_epi32(delta_min);
+
+    int32_t pos0 = index * bit_width;
+    int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width,
+                      pos0 + 3 * bit_width};
+    int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3};
+    int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};
+
+    simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]);
+    simde__m128i OFF = simde_mm_setr_epi32(off[0], off[1], off[2], off[3]);
+
+    simde__m128i V4;
+
+    if (bit_width <= 16) {
+        int rshift = 32 - bit_width;
+        simde__m128i w32_le = simde_mm_i32gather_epi32((const int*)in, IDX, 1);
+        simde__m128i w32_be = simde_mm_shuffle_epi8(w32_le, SHUF_REV4);
+        simde__m128i U32 = simde_mm_sllv_epi32(w32_be, OFF);
+        simde__m128i RS32 = simde_mm_set1_epi32(rshift);
+        V4 = simde_mm_srlv_epi32(U32, RS32);
+    } else {
+        static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
+            7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
+            2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+        int rshift = 64 - bit_width;
+        simde__m256i w64_le =
+            simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1);
+        simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
+        simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(OFF);
+        simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
+        simde__m256i V64 =
+            simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
+        simde__m256i perm = simde_mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
+        simde__m256i comp = simde_mm256_permutevar8x32_epi32(V64, perm);
+        V4 = simde_mm256_castsi256_si128(comp);
+    }
+
+    // Add delta_min
+    V4 = simde_mm_add_epi32(V4, VMIN4);
+
+    // Prefix sum to reconstruct absolute values
+    simde__m128i t;
+    t = simde_mm_slli_si128(V4, 4);
+    V4 = simde_mm_add_epi32(V4, t);
+    t = simde_mm_slli_si128(V4, 8);
+    V4 = simde_mm_add_epi32(V4, t);
+
+    // Add base
+    simde__m128i C4 = simde_mm_set1_epi32(base);
+    V4 = simde_mm_add_epi32(V4, C4);
+
+    simde_mm_storeu_si128((simde__m128i*)out, V4);
+    return out[3];
+}
+
+// Decode 4 INT64 values from bit-packed data using SIMD.
+static inline int64_t simd_decode_4_i64(const uint8_t* in, int32_t bit_width,
+                                        int64_t delta_min, int32_t index,
+                                        int64_t base, int64_t out[4]) {
+    static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
+        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+        1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+    const simde__m256i VMIN4 = simde_mm256_set1_epi64x(delta_min);
+
+    int32_t pos0 = index * bit_width;
+    int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width,
+                      pos0 + 3 * bit_width};
+    int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3};
+    int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};
+
+    simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]);
+
+    int rshift = 64 - bit_width;
+    simde__m256i w64_le =
+        simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1);
+    simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
+    simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(
+        simde_mm_setr_epi32(off[0], off[1], off[2], off[3]));
+    simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
+    simde__m256i V64 =
+        simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
+
+    // Add delta_min
+    V64 = simde_mm256_add_epi64(V64, VMIN4);
+
+    // Prefix sum (64-bit, 4 lanes)
+    simde__m256i t;
+    // shift by 8 bytes = 1 lane
+    t = simde_mm256_slli_si256(V64, 8);
+    V64 = simde_mm256_add_epi64(V64, t);
+    // cross-lane: add lane[1] to lane[2] and lane[3]
+    // Extract high 128 bits, add broadcast of element[1] to both elements
+    int64_t tmp_buf[4];
+    simde_mm256_storeu_si256((simde__m256i*)tmp_buf, V64);
+    tmp_buf[2] += tmp_buf[1];
+    tmp_buf[3] += tmp_buf[1];
+    V64 = simde_mm256_loadu_si256((const simde__m256i*)tmp_buf);
+
+    // Add base
+    simde__m256i C4 = simde_mm256_set1_epi64x(base);
+    V64 = simde_mm256_add_epi64(V64, C4);
+
+    simde_mm256_storeu_si256((simde__m256i*)out, V64);
+    return out[3];
+}
+
+#endif  // ENABLE_SIMD
+
+// ============================================================================
+// Scalar batch decode helpers
+// ============================================================================
+
+// Scalar: extract one value from bit-packed data.
+// @data:      pointer to packed bits (NOT advanced; caller handles position)
+// @bit_pos:   bit offset from start of data
+// @bit_width: bits per value
+static inline int64_t scalar_read_bits(const uint8_t* data, int32_t bit_pos,
+                                       int32_t bit_width) {
+    int64_t value = 0;
+    int bits = bit_width;
+    int byte_idx = bit_pos >> 3;
+    int bit_offset = bit_pos & 7;
+    int bits_avail = 8 - bit_offset;
+
+    while (bits > 0) {
+        if (bits >= bits_avail) {
+            uint8_t d = data[byte_idx] & ((1 << bits_avail) - 1);
+            value = (value << bits_avail) | d;
+            bits -= bits_avail;
+            byte_idx++;
+            bits_avail = 8;
+        } else {
+            uint8_t d =
+                (data[byte_idx] >> (bits_avail - bits)) & ((1 << bits) - 1);
+            value = (value << bits) | d;
+            bits = 0;
+        }
+    }
+    return value;
+}
+
 namespace ts2diff_java_detail {
 
 // Java float/double TS_2DIFF overflow page markers.
@@ -54,7 +221,7 @@ inline bool bitmap_marked(const std::vector<uint8_t>& bm, int idx) {
 
 inline bool looks_like_ts2diff_header(common::ByteStream& in) {
     int ret = common::E_OK;
-    uint32_t probe_mark = in.read_pos();
+    uint64_t probe_mark = in.read_pos();
     int32_t write_index = 0;
     int32_t bit_width = 0;
     if (RET_FAIL(common::SerializationUtil::read_i32(write_index, in)) ||
@@ -82,7 +249,7 @@ inline int consume_float_double_ts2diff_prefix(
     underflow_bm.clear();
     overflow_bm.clear();
     segment_size = 0;
-    uint32_t mark = in.read_pos();
+    uint64_t mark = in.read_pos();
     uint32_t tag = 0;
     if (RET_FAIL(common::SerializationUtil::read_var_uint(tag, in))) {
         return ret;
@@ -132,6 +299,9 @@ inline int consume_float_double_ts2diff_prefix(
 
 }  // namespace ts2diff_java_detail
 
+// ============================================================================
+// TS2DIFFDecoder template
+// ============================================================================
 template <typename T>
 class TS2DIFFDecoder : public Decoder {
    public:
@@ -148,12 +318,14 @@ class TS2DIFFDecoder : public Decoder {
         previous_value_ = 0;
         bit_width_ = 0;
         current_index_ = 0;
+        header_peeked_ = false;
     }
 
     FORCE_INLINE bool has_remaining(const common::ByteStream& buffer) override {
         if (buffer.has_remaining()) return true;
-        return bits_left_ != 0 || (current_index_ <= write_index_ &&
-                                   write_index_ != -1 && current_index_ != 0);
+        return header_peeked_ || bits_left_ != 0 ||
+               (current_index_ <= write_index_ && write_index_ != -1 &&
+                current_index_ != 0);
     }
 
     void read_header(common::ByteStream& in) {
@@ -208,6 +380,18 @@ class TS2DIFFDecoder : public Decoder {
     int read_String(common::String& ret_value, common::PageArena& pa,
                     common::ByteStream& in) override;
 
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int read_batch_int64(int64_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override;
+    int skip_int64(int count, int& skipped, common::ByteStream& in) override;
+
+    bool peek_next_block_range_int64(common::ByteStream& in, int64_t& block_min,
+                                     int64_t& block_max,
+                                     int& block_count) override;
+    int skip_peeked_block_int64(common::ByteStream& in, int& skipped) override;
+
    public:
     T first_value_;
     T previous_value_;
@@ -218,8 +402,13 @@ class TS2DIFFDecoder : public Decoder {
     int bit_width_;
     int write_index_;
     int current_index_;
+    bool header_peeked_;
 };
 
+// ============================================================================
+// Per-value decode (unchanged)
+// ============================================================================
+
 template <>
 inline int32_t TS2DIFFDecoder<int32_t>::decode(common::ByteStream& in) {
     int32_t ret_value = stored_value_;
@@ -274,6 +463,436 @@ inline int64_t TS2DIFFDecoder<int64_t>::decode(common::ByteStream& in) {
     return ret_value;
 }
 
+// ============================================================================
+// Batch decode: INT32
+// Decodes one full block (up to 129 values) per call using SIMD when enabled.
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    actual = 0;
+
+    while (actual < capacity && has_remaining(in)) {
+        // If we are mid-block (current_index_ != 0), finish it per-value.
+        if (current_index_ != 0) {
+            while (actual < capacity && current_index_ != 0 &&
+                   has_remaining(in)) {
+                out[actual++] = decode(in);
+            }
+            continue;
+        }
+
+        // Start of a new block — read header
+        read_header(in);
+        common::SerializationUtil::read_i32(delta_min_, in);
+        common::SerializationUtil::read_i32(first_value_, in);
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        // Output first_value
+        if (actual >= capacity) {
+            // Must consume first_value next time; set state for per-value path
+            current_index_ = 0;
+            // We already consumed the header; push first_value as stored
+            // and let the next call to decode() handle it.
+            // Actually, we need to handle this: rewind is not possible.
+            // So we output first_value and accept going 1 over capacity.
+        }
+        out[actual++] = first_value_;
+
+        if (write_index_ == 0) {
+            // Block has only first_value, no deltas
+            current_index_ = 0;
+            continue;
+        }
+
+        int32_t remaining = write_index_;
+        if (actual + remaining > capacity) {
+            // Block won't fit in output. Fall back to per-value decode.
+            // Stream is at packed data start; bits_left_/buffer_ are reset.
+            current_index_ = 1;
+            continue;
+        }
+        if (!in.is_wrapped()) {
+            // SIMD/scalar block decode below requires a contiguous wrapped
+            // buffer.  For a paged ByteStream, drop down to per-value
+            // decode the same way the doesn't-fit branch does.
+            current_index_ = 1;
+            continue;
+        }
+
+        // Full block decode. Validate against corrupt headers before
+        // advancing the read position — a bogus bit_width_ or write_index_
+        // could compute a block_bytes that overflows the int32_t multiply
+        // or runs past the wrapped buffer.
+        if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 32)) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int64_t block_bytes_64 =
+            (static_cast<int64_t>(write_index_) * bit_width_ + 7) / 8;
+        if (UNLIKELY(block_bytes_64 > in.remaining_size())) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int32_t block_bytes = static_cast<int32_t>(block_bytes_64);
+        const uint8_t* blk_ptr =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(block_bytes));
+
+        int32_t prev = first_value_;
+        int32_t i = 0;
+
+#ifdef ENABLE_SIMD
+        // SIMD path: decode 8 values at a time (2 groups of 4)
+        for (; i + 7 < remaining; i += 8) {
+            int32_t need_bytes = ((i + 7) * bit_width_ + bit_width_ + 7) / 8 +
+                                 (bit_width_ > 16 ? 8 : 4);
+            if (need_bytes > block_bytes) break;
+
+            int32_t grp_out[8];
+            prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i, prev,
+                                     grp_out);
+            prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i + 4,
+                                     prev, grp_out + 4);
+
+            memcpy(out + actual, grp_out, 8 * sizeof(int32_t));
+            actual += 8;
+        }
+#endif
+
+        // Scalar tail
+        int32_t bit_pos = i * bit_width_;
+        for (; i < remaining; ++i) {
+            int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_);
+            bit_pos += bit_width_;
+            int32_t val = (int32_t)delta + prev + delta_min_;
+            prev = val;
+            out[actual++] = val;
+        }
+
+        // Block done, reset state
+        first_value_ = prev;
+        current_index_ = 0;
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Batch decode: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    actual = 0;
+
+    while (actual < capacity && has_remaining(in)) {
+        // If mid-block, finish per-value
+        if (current_index_ != 0) {
+            while (actual < capacity && current_index_ != 0 &&
+                   has_remaining(in)) {
+                out[actual++] = decode(in);
+            }
+            continue;
+        }
+
+        // Start of a new block
+        if (!header_peeked_) {
+            read_header(in);
+            common::SerializationUtil::read_i64(delta_min_, in);
+            common::SerializationUtil::read_i64(first_value_, in);
+            bits_left_ = 0;
+            buffer_ = 0;
+        }
+        header_peeked_ = false;
+
+        out[actual++] = first_value_;
+
+        if (write_index_ == 0) {
+            current_index_ = 0;
+            continue;
+        }
+
+        int32_t remaining = write_index_;
+        if (actual + remaining > capacity) {
+            // Block won't fit in output. Fall back to per-value decode.
+            // Stream is at packed data start; bits_left_/buffer_ are reset.
+            current_index_ = 1;
+            continue;
+        }
+        if (!in.is_wrapped()) {
+            // SIMD/scalar block decode below requires a contiguous wrapped
+            // buffer.  Page-backed ByteStreams must use the per-value path.
+            current_index_ = 1;
+            continue;
+        }
+
+        // Validate against corrupt headers (see int32 path).
+        if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 64)) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int64_t block_bytes_64 =
+            (static_cast<int64_t>(write_index_) * bit_width_ + 7) / 8;
+        if (UNLIKELY(block_bytes_64 > in.remaining_size())) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int32_t block_bytes = static_cast<int32_t>(block_bytes_64);
+        // Direct pointer into the wrapped ByteStream buffer.
+        const uint8_t* blk_ptr =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(block_bytes));
+
+        int64_t prev = first_value_;
+        int32_t i = 0;
+
+#ifdef ENABLE_SIMD
+        // SIMD path: decode 4 INT64 values at a time
+        for (; i + 3 < remaining; i += 4) {
+            int32_t need_bytes =
+                ((i + 3) * bit_width_ + bit_width_ + 7) / 8 + 8;
+            if (need_bytes > block_bytes) break;
+
+            int64_t grp_out[4];
+            prev = simd_decode_4_i64(blk_ptr, bit_width_, delta_min_, i, prev,
+                                     grp_out);
+            memcpy(out + actual, grp_out, 4 * sizeof(int64_t));
+            actual += 4;
+        }
+#endif
+
+        // Scalar tail
+        int32_t bit_pos = i * bit_width_;
+        for (; i < remaining; ++i) {
+            int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_);
+            bit_pos += bit_width_;
+            int64_t val = delta + prev + delta_min_;
+            prev = val;
+            out[actual++] = val;
+        }
+
+        first_value_ = prev;
+        current_index_ = 0;
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Skip: INT32 — read header only, jump over packed data
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    skipped = 0;
+
+    // If mid-block, finish current block per-value
+    while (skipped < count && current_index_ != 0 && has_remaining(in)) {
+        decode(in);
+        ++skipped;
+    }
+
+    while (skipped < count && has_remaining(in)) {
+        int32_t wi, bw, dm, fv;
+        common::SerializationUtil::read_i32(wi, in);
+        common::SerializationUtil::read_i32(bw, in);
+        common::SerializationUtil::read_i32(dm, in);
+        common::SerializationUtil::read_i32(fv, in);
+
+        int32_t block_vals = wi + 1;
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        if (count - skipped >= block_vals) {
+            // Whole-block fast path: jump over packed body.
+            int32_t skip_bytes = (wi * bw + 7) / 8;
+            in.wrapped_buf_advance_read_pos(skip_bytes);
+            skipped += block_vals;
+            current_index_ = 0;
+            write_index_ = -1;
+        } else {
+            // Partial block: reinstate decoder state as if we'd just
+            // emitted first_value_ from decode(), bump skipped by 1,
+            // then per-value decode the remaining count, leaving the
+            // rest of the block intact for the next decode() call.
+            write_index_ = wi;
+            bit_width_ = bw;
+            delta_min_ = dm;
+            first_value_ = fv;
+            current_index_ = (wi == 0) ? 0 : 1;
+            ++skipped;
+            while (skipped < count && current_index_ != 0 &&
+                   has_remaining(in)) {
+                decode(in);
+                ++skipped;
+            }
+        }
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Skip: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    skipped = 0;
+
+    while (skipped < count && current_index_ != 0 && has_remaining(in)) {
+        decode(in);
+        ++skipped;
+    }
+
+    while (skipped < count && has_remaining(in)) {
+        int32_t wi, bw;
+        int64_t dm, fv;
+        common::SerializationUtil::read_i32(wi, in);
+        common::SerializationUtil::read_i32(bw, in);
+        common::SerializationUtil::read_i64(dm, in);
+        common::SerializationUtil::read_i64(fv, in);
+
+        int32_t block_vals = wi + 1;
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        if (count - skipped >= block_vals) {
+            int32_t skip_bytes = (wi * bw + 7) / 8;
+            in.wrapped_buf_advance_read_pos(skip_bytes);
+            skipped += block_vals;
+            current_index_ = 0;
+            write_index_ = -1;
+        } else {
+            write_index_ = wi;
+            bit_width_ = bw;
+            delta_min_ = dm;
+            first_value_ = fv;
+            current_index_ = (wi == 0) ? 0 : 1;
+            ++skipped;
+            while (skipped < count && current_index_ != 0 &&
+                   has_remaining(in)) {
+                decode(in);
+                ++skipped;
+            }
+        }
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Block-level filter check: peek header and compute value range
+// ============================================================================
+
+template <>
+inline bool TS2DIFFDecoder<int64_t>::peek_next_block_range_int64(
+    common::ByteStream& in, int64_t& block_min, int64_t& block_max,
+    int& block_count) {
+    if (current_index_ != 0 || !has_remaining(in)) return false;
+
+    read_header(in);
+    common::SerializationUtil::read_i64(delta_min_, in);
+    common::SerializationUtil::read_i64(first_value_, in);
+    bits_left_ = 0;
+    buffer_ = 0;
+
+    block_min = first_value_;
+    block_count = write_index_ + 1;
+
+    // Look-ahead: since timestamps are monotonically increasing, the true
+    // block_max is the last timestamp, which equals next block's first_value_.
+    // The next block header starts at read_pos + packed_bytes. first_value_ is
+    // at offset 16 within the header
+    // (write_index_(4)+bit_width_(4)+delta_min_(8)). We read it via raw pointer
+    // so the stream position is not consumed.
+    int32_t packed_bytes = (write_index_ * bit_width_ + 7) / 8;
+    if (in.remaining_size() >= (uint32_t)packed_bytes + 24) {
+        char* next_fv_ptr =
+            in.get_wrapped_buf() + in.read_pos() + packed_bytes + 16;
+        block_max = (int64_t)common::SerializationUtil::read_ui64(next_fv_ptr);
+    } else {
+        // Last block in page: fall back to conservative estimate.
+        if (write_index_ == 0 || bit_width_ == 0) {
+            block_max = first_value_ + (int64_t)write_index_ * delta_min_;
+        } else if (bit_width_ >= 63) {
+            block_max = INT64_MAX;
+        } else {
+            int64_t max_delta = delta_min_ + ((1LL << bit_width_) - 1);
+            block_max = first_value_ + (int64_t)write_index_ * max_delta;
+        }
+    }
+
+    header_peeked_ = true;
+    return true;
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_peeked_block_int64(
+    common::ByteStream& in, int& skipped) {
+    skipped = write_index_ + 1;
+    int32_t skip_bytes = (write_index_ * bit_width_ + 7) / 8;
+    in.wrapped_buf_advance_read_pos(skip_bytes);
+    header_peeked_ = false;
+    bits_left_ = 0;
+    buffer_ = 0;
+    current_index_ = 0;
+    write_index_ = -1;
+    return common::E_OK;
+}
+
+// INT32 specialization: not applicable (timestamps are always INT64)
+template <>
+inline bool TS2DIFFDecoder<int32_t>::peek_next_block_range_int64(
+    common::ByteStream& in, int64_t& block_min, int64_t& block_max,
+    int& block_count) {
+    return false;
+}
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_peeked_block_int64(
+    common::ByteStream& in, int& skipped) {
+    return common::E_NOT_SUPPORT;
+}
+
+// ============================================================================
+// Default (unsupported type) batch/skip — fall back to base class
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return Decoder::read_batch_int64(out, capacity, actual, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return Decoder::skip_int64(count, skipped, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return Decoder::read_batch_int32(out, capacity, actual, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return Decoder::skip_int32(count, skipped, in);
+}
+
+// ============================================================================
+// Float / Double wrapper decoders (unchanged)
+// ============================================================================
+
 class FloatTS2DIFFDecoder : public TS2DIFFDecoder<int32_t> {
    public:
     FloatTS2DIFFDecoder() = default;
@@ -282,11 +901,24 @@ class FloatTS2DIFFDecoder : public TS2DIFFDecoder<int32_t> {
         return common::int_to_float(value_int);
     }
 
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
+
+    int read_batch_float(float* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        // Reuse SIMD batch decode for int32, then bit-cast to float
+        int32_t* buf = reinterpret_cast<int32_t*>(out);
+        int ret = TS2DIFFDecoder<int32_t>::read_batch_int32(buf, capacity,
+                                                            actual, in);
+        if (ret != common::E_OK) return ret;
+        for (int i = 0; i < actual; ++i) {
+            out[i] = common::int_to_float(buf[i]);
+        }
+        return common::E_OK;
+    }
 
    private:
     bool is_legacy_raw_{false};
@@ -306,11 +938,24 @@ class DoubleTS2DIFFDecoder : public TS2DIFFDecoder<int64_t> {
         return common::long_to_double(value_long);
     }
 
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
+
+    int read_batch_double(double* out, int capacity, int& actual,
+                          common::ByteStream& in) override {
+        // Reuse SIMD batch decode for int64, then bit-cast to double
+        int64_t* buf = reinterpret_cast<int64_t*>(out);
+        int ret = TS2DIFFDecoder<int64_t>::read_batch_int64(buf, capacity,
+                                                            actual, in);
+        if (ret != common::E_OK) return ret;
+        for (int i = 0; i < actual; ++i) {
+            out[i] = common::long_to_double(buf[i]);
+        }
+        return common::E_OK;
+    }
 
    private:
     bool is_legacy_raw_{false};
diff --git a/cpp/src/encoding/ts2diff_encoder.h b/cpp/src/encoding/ts2diff_encoder.h
index d1ab43bfd..fc494581a 100644
--- a/cpp/src/encoding/ts2diff_encoder.h
+++ b/cpp/src/encoding/ts2diff_encoder.h
@@ -29,12 +29,9 @@
 #include "common/allocator/alloc_base.h"
 #include "common/allocator/byte_stream.h"
 #include "encoder.h"
-#if defined(__SSE4_2__)
-#include <smmintrin.h>
-#define USE_SSE 1
-#elif defined(__AVX2__)
-#include <immintrin.h>
-#define USE_AVX2 1
+
+#ifdef ENABLE_SIMD
+#include "simde/x86/avx2.h"
 #endif
 
 namespace storage {
@@ -44,15 +41,16 @@ struct SIMDOps;
 
 template <>
 struct SIMDOps<int32_t> {
-#ifdef USE_SSE
+#ifdef ENABLE_SIMD
     static void rebase(int32_t* arr, int32_t min_val, size_t size) {
-        const __m128i min_vec = _mm_set1_epi32(min_val);
+        const simde__m128i min_vec = simde_mm_set1_epi32(min_val);
         size_t i = 0;
         for (; i + 3 < size; i += 4) {
-            __m128i vec =
-                _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
-            vec = _mm_sub_epi32(vec, min_vec);
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(arr + i), vec);
+            simde__m128i vec = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(arr + i));
+            vec = simde_mm_sub_epi32(vec, min_vec);
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(arr + i),
+                                  vec);
         }
         for (; i < size; ++i) {
             arr[i] -= min_val;
@@ -69,15 +67,16 @@ struct SIMDOps<int32_t> {
 
 template <>
 struct SIMDOps<int64_t> {
-#ifdef USE_AVX2
+#ifdef ENABLE_SIMD
     static void rebase(int64_t* arr, int64_t min_val, size_t size) {
-        const __m256i min_vec = _mm256_set1_epi64x(min_val);
+        const simde__m256i min_vec = simde_mm256_set1_epi64x(min_val);
         size_t i = 0;
         for (; i + 3 < size; i += 4) {
-            __m256i vec =
-                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + i));
-            vec = _mm256_sub_epi64(vec, min_vec);
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(arr + i), vec);
+            simde__m256i vec = simde_mm256_loadu_si256(
+                reinterpret_cast<const simde__m256i*>(arr + i));
+            vec = simde_mm256_sub_epi64(vec, min_vec);
+            simde_mm256_storeu_si256(reinterpret_cast<simde__m256i*>(arr + i),
+                                     vec);
         }
         for (; i < size; ++i) {
             arr[i] -= min_val;
@@ -99,7 +98,7 @@ class TS2DIFFEncoder : public Encoder {
 
     ~TS2DIFFEncoder() { destroy(); }
 
-    void reset() { write_index_ = -1; }
+    void reset() override { write_index_ = -1; }
 
     void init() {
         block_size_ = 128;
@@ -115,7 +114,7 @@ class TS2DIFFEncoder : public Encoder {
         previous_value_ = 0;
     }
 
-    void destroy() {
+    void destroy() override {
         if (delta_arr_ != nullptr) {
             common::mem_free(delta_arr_);
             delta_arr_ = nullptr;
@@ -167,17 +166,71 @@ class TS2DIFFEncoder : public Encoder {
         return bit_width;
     }
 
+    // Batch bit-pack `count` values (each `bit_width` bits, MSB-first within
+    // byte) into a single contiguous buffer and write it to out_stream in one
+    // call. Avoids the per-byte write_buf overhead of the scalar write_bits
+    // loop.
+    //
+    // Result codes:
+    //   E_OK  → written successfully.
+    //   -1    → caller must fall back to write_bits + flush_remaining because
+    //           bit_width exceeds the safe accumulator width.
+    //   any other non-zero value → real write_buf error; the caller must
+    //           propagate it instead of treating the flush as successful.
+    template <typename U>
+    static int pack_bits_msb(const U* values, int count, int bit_width,
+                             common::ByteStream& out_stream) {
+        if (count <= 0 || bit_width <= 0) return common::E_OK;
+        if (bit_width > 56) return -1;  // fall back
+
+        size_t total_bytes = ((size_t)count * (size_t)bit_width + 7) / 8;
+        std::vector<uint8_t> buf(total_bytes, 0);
+
+        uint64_t accum = 0;
+        int bits_in_accum = 0;
+        size_t pos = 0;
+        const uint64_t mask = (1ULL << bit_width) - 1;
+
+        for (int i = 0; i < count; i++) {
+            uint64_t v = static_cast<uint64_t>(values[i]) & mask;
+            accum = (accum << bit_width) | v;
+            bits_in_accum += bit_width;
+            while (bits_in_accum >= 8) {
+                buf[pos++] = static_cast<uint8_t>(accum >> (bits_in_accum - 8));
+                bits_in_accum -= 8;
+            }
+            if (bits_in_accum > 0) {
+                accum &= ((1ULL << bits_in_accum) - 1);
+            } else {
+                accum = 0;
+            }
+        }
+        if (bits_in_accum > 0) {
+            buf[pos++] = static_cast<uint8_t>(accum << (8 - bits_in_accum));
+        }
+        // Surface write failures.  Previously the return code was dropped on
+        // the floor and flush() returned E_OK, then reset() wiped the
+        // encoder state — the on-disk page ended up missing its delta block
+        // but the caller thought the data was safe.
+        return out_stream.write_buf(buf.data(), pos);
+    }
+
     int do_encode(T value, common::ByteStream& out_stream);
-    int encode(bool value, common::ByteStream& out_stream);
-    int encode(int32_t value, common::ByteStream& out_stream);
-    int encode(int64_t value, common::ByteStream& out_stream);
-    int encode(float value, common::ByteStream& out_stream);
-    int encode(double value, common::ByteStream& out_stream);
-    int encode(common::String value, common::ByteStream& out_stream);
+    int encode(bool value, common::ByteStream& out_stream) override;
+    int encode(int32_t value, common::ByteStream& out_stream) override;
+    int encode(int64_t value, common::ByteStream& out_stream) override;
+    int encode(float value, common::ByteStream& out_stream) override;
+    int encode(double value, common::ByteStream& out_stream) override;
+    int encode(common::String value, common::ByteStream& out_stream) override;
+
+    int encode_batch(const int32_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override;
+    int encode_batch(const int64_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override;
 
-    int flush(common::ByteStream& out_stream);
+    int flush(common::ByteStream& out_stream) override;
 
-    int get_max_byte_size() {
+    int get_max_byte_size() override {
         // The meaning of 24 is: index(4)+width(4)+minDeltaBase(8)+firstValue(8)
         return 24 + write_index_ * 8;
     }
@@ -235,16 +288,39 @@ inline int TS2DIFFEncoder<int32_t>::flush(common::ByteStream& out_stream) {
     SIMDOps<int32_t>::rebase(delta_arr_, delta_arr_min_, write_index_);
     // Calculate the bit length of each value to writer
     int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_);
-    // writer header
-    common::SerializationUtil::write_ui32(write_index_, out_stream);
-    common::SerializationUtil::write_ui32(bit_width, out_stream);
-    common::SerializationUtil::write_ui32(delta_arr_min_, out_stream);
-    common::SerializationUtil::write_ui32(first_value_, out_stream);
-    // writer data
-    for (int i = 0; i < write_index_; i++) {
-        write_bits(delta_arr_[i], bit_width, out_stream);
+    // Header writes can fail too (back-pressure / OOM on the underlying
+    // stream); a half-written header followed by reset() leaves the page
+    // corrupted but the caller thinking the data was flushed.
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(write_index_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(bit_width, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(common::SerializationUtil::write_ui32(delta_arr_min_,
+                                                       out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(first_value_, out_stream))) {
+        return ret;
+    }
+    // writer data — batched bit-pack + single write_buf for the common case;
+    // fall back to per-bit path for the rare wide bit_width.
+    const int pack_ret =
+        pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream);
+    if (pack_ret == -1) {
+        for (int i = 0; i < write_index_; i++) {
+            write_bits(delta_arr_[i], bit_width, out_stream);
+        }
+        flush_remaining(out_stream);
+    } else if (pack_ret != common::E_OK) {
+        // Real write failure — don't clear encoder state so the higher
+        // layer can detect the page is poisoned.
+        return pack_ret;
     }
-    flush_remaining(out_stream);
     reset();
     return ret;
 }
@@ -259,20 +335,222 @@ inline int TS2DIFFEncoder<int64_t>::flush(common::ByteStream& out_stream) {
     SIMDOps<int64_t>::rebase(delta_arr_, delta_arr_min_, write_index_);
     // Calculate the bit length of each value to writer
     int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_);
-    // writer header
-    common::SerializationUtil::write_i32(write_index_, out_stream);
-    common::SerializationUtil::write_i32(bit_width, out_stream);
-    common::SerializationUtil::write_i64(delta_arr_min_, out_stream);
-    common::SerializationUtil::write_i64(first_value_, out_stream);
-    // writer data
-    for (int i = 0; i < write_index_; i++) {
-        write_bits(delta_arr_[i], bit_width, out_stream);
+    // Header writes can fail too — see int32 specialization for rationale.
+    if (RET_FAIL(
+            common::SerializationUtil::write_i32(write_index_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(common::SerializationUtil::write_i32(bit_width, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_i64(delta_arr_min_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_i64(first_value_, out_stream))) {
+        return ret;
+    }
+    // writer data — batched bit-pack + single write_buf for the common case;
+    // fall back to per-bit path for the rare wide bit_width (>56).
+    const int pack_ret =
+        pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream);
+    if (pack_ret == -1) {
+        for (int i = 0; i < write_index_; i++) {
+            write_bits(delta_arr_[i], bit_width, out_stream);
+        }
+        flush_remaining(out_stream);
+    } else if (pack_ret != common::E_OK) {
+        return pack_ret;
     }
-    flush_remaining(out_stream);
     reset();  // 语义，writeIndex=-1;
     return ret;
 }
 
+// ============================================================================
+// Batch encode: INT32
+// Adjacent-difference removes sequential dependency; SIMD for delta + min/max.
+// ============================================================================
+
+template <>
+inline int TS2DIFFEncoder<int32_t>::encode_batch(
+    const int32_t* values, uint32_t count, common::ByteStream& out_stream) {
+    int ret = common::E_OK;
+    uint32_t offset = 0;
+
+    while (offset < count) {
+        // Start of new block: store first_value
+        if (write_index_ == -1) {
+            first_value_ = values[offset];
+            previous_value_ = first_value_;
+            write_index_ = 0;
+            offset++;
+            continue;
+        }
+
+        // How many deltas fit in current block
+        uint32_t space = static_cast<uint32_t>(block_size_) - write_index_;
+        uint32_t batch = std::min(count - offset, space);
+
+        // ── Adjacent difference: delta[i] = values[i] - values[i-1] ──
+        // First delta uses previous_value_
+        delta_arr_[write_index_] = values[offset] - previous_value_;
+
+        uint32_t i = 1;
+#ifdef ENABLE_SIMD
+        // SIMD: 4 adjacent differences at a time
+        for (; i + 3 < batch; i += 4) {
+            simde__m128i cur = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i));
+            simde__m128i prv = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i - 1));
+            simde__m128i diff = simde_mm_sub_epi32(cur, prv);
+            simde_mm_storeu_si128(
+                reinterpret_cast<simde__m128i*>(delta_arr_ + write_index_ + i),
+                diff);
+        }
+#endif
+        for (; i < batch; i++) {
+            delta_arr_[write_index_ + i] =
+                values[offset + i] - values[offset + i - 1];
+        }
+        previous_value_ = values[offset + batch - 1];
+
+        // ── Min/max of new deltas ──
+        int32_t local_min = delta_arr_[write_index_];
+        int32_t local_max = delta_arr_[write_index_];
+
+        uint32_t j = 1;
+#ifdef ENABLE_SIMD
+        if (batch >= 5) {
+            simde__m128i vmin = simde_mm_set1_epi32(local_min);
+            simde__m128i vmax = vmin;
+            for (; j + 3 < batch; j += 4) {
+                simde__m128i v =
+                    simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(
+                        delta_arr_ + write_index_ + j));
+                vmin = simde_mm_min_epi32(vmin, v);
+                vmax = simde_mm_max_epi32(vmax, v);
+            }
+            // Horizontal reduce
+            int32_t tmp[4];
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(tmp), vmin);
+            for (int k = 0; k < 4; k++)
+                if (tmp[k] < local_min) local_min = tmp[k];
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(tmp), vmax);
+            for (int k = 0; k < 4; k++)
+                if (tmp[k] > local_max) local_max = tmp[k];
+        }
+#endif
+        for (; j < batch; j++) {
+            int32_t d = delta_arr_[write_index_ + j];
+            if (d < local_min) local_min = d;
+            if (d > local_max) local_max = d;
+        }
+
+        // Merge with block min/max
+        if (write_index_ == 0) {
+            delta_arr_min_ = local_min;
+            delta_arr_max_ = local_max;
+        } else {
+            if (local_min < delta_arr_min_) delta_arr_min_ = local_min;
+            if (local_max > delta_arr_max_) delta_arr_max_ = local_max;
+        }
+
+        write_index_ += batch;
+        offset += batch;
+
+        if (write_index_ >= block_size_) {
+            if (RET_FAIL(flush(out_stream))) return ret;
+        }
+    }
+    return ret;
+}
+
+// ============================================================================
+// Batch encode: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFEncoder<int64_t>::encode_batch(
+    const int64_t* values, uint32_t count, common::ByteStream& out_stream) {
+    int ret = common::E_OK;
+    uint32_t offset = 0;
+
+    while (offset < count) {
+        if (write_index_ == -1) {
+            first_value_ = values[offset];
+            previous_value_ = first_value_;
+            write_index_ = 0;
+            offset++;
+            continue;
+        }
+
+        uint32_t space = static_cast<uint32_t>(block_size_) - write_index_;
+        uint32_t batch = std::min(count - offset, space);
+
+        // Adjacent difference
+        delta_arr_[write_index_] = values[offset] - previous_value_;
+
+        uint32_t i = 1;
+#ifdef ENABLE_SIMD
+        // SIMD: 2 adjacent differences at a time (128-bit, native NEON)
+        for (; i + 1 < batch; i += 2) {
+            simde__m128i cur = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i));
+            simde__m128i prv = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i - 1));
+            simde__m128i diff = simde_mm_sub_epi64(cur, prv);
+            simde_mm_storeu_si128(
+                reinterpret_cast<simde__m128i*>(delta_arr_ + write_index_ + i),
+                diff);
+        }
+#endif
+        for (; i < batch; i++) {
+            delta_arr_[write_index_ + i] =
+                values[offset + i] - values[offset + i - 1];
+        }
+        previous_value_ = values[offset + batch - 1];
+
+        // Min/max (scalar — no efficient 64-bit SIMD min/max before AVX-512)
+        int64_t local_min = delta_arr_[write_index_];
+        int64_t local_max = delta_arr_[write_index_];
+        for (uint32_t j = 1; j < batch; j++) {
+            int64_t d = delta_arr_[write_index_ + j];
+            if (d < local_min) local_min = d;
+            if (d > local_max) local_max = d;
+        }
+
+        if (write_index_ == 0) {
+            delta_arr_min_ = local_min;
+            delta_arr_max_ = local_max;
+        } else {
+            if (local_min < delta_arr_min_) delta_arr_min_ = local_min;
+            if (local_max > delta_arr_max_) delta_arr_max_ = local_max;
+        }
+
+        write_index_ += batch;
+        offset += batch;
+
+        if (write_index_ >= block_size_) {
+            if (RET_FAIL(flush(out_stream))) return ret;
+        }
+    }
+    return ret;
+}
+
+// Default: unsupported types fall back to base class loop
+template <typename T>
+int TS2DIFFEncoder<T>::encode_batch(const int32_t* values, uint32_t count,
+                                    common::ByteStream& out) {
+    return Encoder::encode_batch(values, count, out);
+}
+template <typename T>
+int TS2DIFFEncoder<T>::encode_batch(const int64_t* values, uint32_t count,
+                                    common::ByteStream& out) {
+    return Encoder::encode_batch(values, count, out);
+}
+
 class FloatTS2DIFFEncoder : public TS2DIFFEncoder<int32_t> {
    public:
     FloatTS2DIFFEncoder() : max_point_number_(2), max_point_value_(100.0) {}
@@ -280,6 +558,14 @@ class FloatTS2DIFFEncoder : public TS2DIFFEncoder<int32_t> {
         int32_t value_int = convert_float_to_int(value);
         return TS2DIFFEncoder<int32_t>::do_encode(value_int, out_stream);
     }
+    // PageWriter resets the encoder between pages without going through a
+    // successful flush() (e.g. when the prior page was aborted).  The base
+    // reset() only clears write_index_; underflow_flags_ would otherwise
+    // leak the prior page's overflow markers into the next page's bitmap.
+    void reset() override {
+        TS2DIFFEncoder<int32_t>::reset();
+        underflow_flags_.clear();
+    }
     int flush(common::ByteStream& out_stream) override;
     int encode(bool value, common::ByteStream& out_stream);
     int encode(int32_t value, common::ByteStream& out_stream);
@@ -332,6 +618,12 @@ class DoubleTS2DIFFEncoder : public TS2DIFFEncoder<int64_t> {
         int64_t value_long = convert_double_to_long(value);
         return TS2DIFFEncoder<int64_t>::do_encode(value_long, out_stream);
     }
+    // See FloatTS2DIFFEncoder::reset for rationale — the prior page's
+    // overflow markers must not bleed into the next.
+    void reset() override {
+        TS2DIFFEncoder<int64_t>::reset();
+        underflow_flags_.clear();
+    }
     int flush(common::ByteStream& out_stream) override;
     int encode(bool value, common::ByteStream& out_stream);
     int encode(int32_t value, common::ByteStream& out_stream);
@@ -518,7 +810,6 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
         write_bits(delta_arr_[i], bit_width, inner);
     }
     flush_remaining(inner);
-    reset();
 
     const bool overflow = has_overflow();
     if (overflow) {
@@ -564,7 +855,12 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
     if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) {
         return ret;
     }
+    // Defer encoder-state wipe until after every write into out_stream has
+    // committed.  An earlier reset() let a mid-flush failure leave
+    // write_index_ at -1, so the next flush() short-circuited at the top
+    // and the data was silently lost.
     underflow_flags_.clear();
+    TS2DIFFEncoder<int32_t>::reset();
     return ret;
 }
 
@@ -597,7 +893,6 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
         write_bits(delta_arr_[i], bit_width, inner);
     }
     flush_remaining(inner);
-    reset();
 
     const bool overflow = has_overflow();
     if (overflow) {
@@ -643,7 +938,11 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
     if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) {
         return ret;
     }
+    // Same deferred-reset rationale as FloatTS2DIFFEncoder::flush — keeping
+    // write_index_ live until every committed write succeeds avoids the
+    // "next flush returns E_OK on lost data" pattern.
     underflow_flags_.clear();
+    TS2DIFFEncoder<int64_t>::reset();
     return ret;
 }
 
diff --git a/cpp/src/file/read_file.cc b/cpp/src/file/read_file.cc
index d9902ddb9..c6bfd547a 100644
--- a/cpp/src/file/read_file.cc
+++ b/cpp/src/file/read_file.cc
@@ -26,6 +26,7 @@
 #ifdef _WIN32
 #include <io.h>
 #include <windows.h>
+
 ssize_t pread(int fd, void* buf, size_t count, uint64_t offset);
 #else
 #include <unistd.h>
diff --git a/cpp/src/file/restorable_tsfile_io_writer.cc b/cpp/src/file/restorable_tsfile_io_writer.cc
index 22a3fb500..a1fc53402 100644
--- a/cpp/src/file/restorable_tsfile_io_writer.cc
+++ b/cpp/src/file/restorable_tsfile_io_writer.cc
@@ -328,12 +328,15 @@ static int recover_chunk_statistic(
     uint32_t value_buf_size = 0;
     std::vector<int64_t> time_decode_buf;
     const std::vector<int64_t>* times = nullptr;
-    std::vector<uint8_t> aligned_value_notnull_bitmap;
+    // For aligned pages, retain the per-row not-null bitmap so the stat-update
+    // loop can skip null positions and bind each decoded value to its real
+    // timestamp.  Without this we'd hand non-null values to times[0..N-1] and
+    // get wrong start/end/first/last stats on sparse columns.
+    const char* aligned_bitmap = nullptr;
     uint32_t aligned_num_values = 0;
-    const bool is_aligned_value_chunk =
-        (time_batch != nullptr && !time_batch->empty());
+    bool is_aligned_page = false;
 
-    if (is_aligned_value_chunk) {
+    if (time_batch != nullptr && !time_batch->empty()) {
         // Aligned value page: uncompressed layout = uint32(num_values) + bitmap
         // + value_buf
         if (uncompressed_size < 4) {
@@ -341,7 +344,7 @@ static int recover_chunk_statistic(
             CompressorFactory::free(compressor);
             return E_OK;
         }
-        aligned_num_values =
+        uint32_t num_values =
             (static_cast<uint32_t>(
                  static_cast<unsigned char>(uncompressed_buf[0]))
              << 24) |
@@ -353,20 +356,19 @@ static int recover_chunk_statistic(
              << 8) |
             (static_cast<uint32_t>(
                 static_cast<unsigned char>(uncompressed_buf[3])));
-        uint32_t bitmap_size = (aligned_num_values + 7) / 8;
+        uint32_t bitmap_size = (num_values + 7) / 8;
         if (uncompressed_size < 4 + bitmap_size) {
             compressor->after_uncompress(uncompressed_buf);
             CompressorFactory::free(compressor);
             return E_OK;
         }
-        aligned_value_notnull_bitmap.resize(bitmap_size);
-        if (bitmap_size > 0) {
-            std::memcpy(aligned_value_notnull_bitmap.data(),
-                        uncompressed_buf + 4, bitmap_size);
-        }
         value_buf = uncompressed_buf + 4 + bitmap_size;
         value_buf_size = uncompressed_size - 4 - bitmap_size;
         times = time_batch;
+        aligned_bitmap = uncompressed_buf + 4;
+        aligned_num_values = std::min<uint32_t>(
+            num_values, static_cast<uint32_t>(time_batch->size()));
+        is_aligned_page = true;
     } else {
         // Non-aligned value page: var_uint(time_buf_size) + time_buf +
         // value_buf
@@ -419,25 +421,25 @@ static int recover_chunk_statistic(
     value_decoder->reset();
     size_t idx = 0;
     const size_t num_times = times->size();
-    while (idx < num_times) {
-        int64_t t = (*times)[idx];
-        bool has_value = true;
-        if (is_aligned_value_chunk) {
-            has_value = false;
-            const uint32_t byte_idx = static_cast<uint32_t>(idx / 8);
-            const uint32_t bit_shift = static_cast<uint32_t>(idx % 8);
-            if (byte_idx < aligned_value_notnull_bitmap.size()) {
-                has_value = ((aligned_value_notnull_bitmap[byte_idx] & 0xFF) &
-                             (0x80 >> bit_shift)) != 0;
-            }
-        }
-        if (!has_value) {
+    // For aligned pages the value stream only stores non-null rows; advance
+    // `idx` past null bitmap entries so each decoded value pairs with the
+    // matching timestamp. Non-aligned pages have no bitmap (every row is
+    // present), so we keep the dense walk.
+    auto bitmap_is_valid = [&](size_t row) -> bool {
+        if (!is_aligned_page) return true;
+        if (row >= aligned_num_values) return false;
+        // Aligned value-page bitmap: MSB-first within each byte, bit set
+        // means the row is NOT null.
+        unsigned char byte =
+            static_cast<unsigned char>(aligned_bitmap[row / 8]);
+        return (byte & static_cast<unsigned char>(0x80 >> (row % 8))) != 0;
+    };
+    while (idx < num_times && value_decoder->has_remaining(value_in)) {
+        if (!bitmap_is_valid(idx)) {
             idx++;
             continue;
         }
-        if (!value_decoder->has_remaining(value_in)) {
-            break;
-        }
+        int64_t t = (*times)[idx];
         switch (chdr.data_type_) {
             case common::BOOLEAN: {
                 bool v;
@@ -518,6 +520,12 @@ void RestorableTsFileIOWriter::close() {
         write_file_ = nullptr;
         write_file_owned_ = false;
     }
+    // Run the base writer's cleanup (frees post-recovery appended chunk
+    // metadata) before tearing down self_check_arena_ that backs the
+    // recovered ChunkGroupMeta entries.  Base destroy() only touches entries
+    // it allocated itself (tracked in appended_chunk_metas_ /
+    // appended_chunk_group_metas_), so it never dereferences self_check
+    // arena memory.
     TsFileIOWriter::destroy();
     for (ChunkGroupMeta* cgm : self_check_recovered_cgm_) {
         cgm->device_id_.reset();
@@ -842,15 +850,13 @@ int RestorableTsFileIOWriter::self_check(bool truncate_corrupted) {
         }
     }
 
-    // --- Attach recovered ChunkGroupMeta to writer; record per-CGM prefix
-    // length so destroy() can free stats appended later. ---
-    recovery_chunk_meta_prefix_.clear();
+    // Attach recovered ChunkGroupMeta entries to the base writer.  These
+    // live in self_check_arena_ and are *not* tracked in
+    // appended_chunk_group_metas_ — base destroy() leaves them alone, and
+    // close() resets their device_id_ refs before tearing down the arena.
     for (ChunkGroupMeta* cgm : recovered_cgm_list) {
-        recovery_chunk_meta_prefix_[cgm] =
-            static_cast<uint32_t>(cgm->chunk_meta_list_.size());
         push_chunk_group_meta(cgm);
     }
-    chunk_group_meta_from_recovery_ = true;
 
     return E_OK;
 }
diff --git a/cpp/src/file/tsfile_io_reader.cc b/cpp/src/file/tsfile_io_reader.cc
index 296556c15..014e78832 100644
--- a/cpp/src/file/tsfile_io_reader.cc
+++ b/cpp/src/file/tsfile_io_reader.cc
@@ -51,6 +51,8 @@ void TsFileIOReader::reset() {
         }
         read_file_ = nullptr;
         tsfile_meta_page_arena_.destroy();
+        device_node_cache_.clear();
+        device_node_cache_pa_.destroy();
         tsfile_meta_ready_ = false;
     }
 }
@@ -61,6 +63,9 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr<IDeviceID> device_id,
                               common::PageArena& pa, Filter* time_filter) {
     int ret = E_OK;
     if (RET_FAIL(load_tsfile_meta_if_necessary())) {
+    } else if (!bloom_filter_contains(device_id->get_device_name(),
+                                      measurement_name)) {
+        return E_NO_MORE_DATA;
     } else {
         ssi = new TsFileSeriesScanIterator;
         ssi->init(device_id, measurement_name, read_file_, time_filter, pa);
@@ -80,6 +85,95 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr<IDeviceID> device_id,
     return ret;
 }
 
+int TsFileIOReader::alloc_multi_ssi(
+    std::shared_ptr<IDeviceID> device_id,
+    const std::vector<std::string>& measurement_names,
+    TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
+    Filter* time_filter) {
+    int ret = E_OK;
+    if (RET_FAIL(load_tsfile_meta_if_necessary())) return ret;
+
+    ssi = new TsFileSeriesScanIterator;
+    ssi->init(device_id, measurement_names.empty() ? "" : measurement_names[0],
+              read_file_, time_filter, pa);
+
+    auto& ssi_pa = ssi->timeseries_index_pa_;
+
+    // Use cached device measurement node (avoids repeated file I/O)
+    CachedDeviceNode cached;
+    if (RET_FAIL(get_cached_device_node(device_id, ssi_pa, cached))) {
+        delete ssi;
+        ssi = nullptr;
+        return ret;
+    }
+    auto top_node = cached.top_node;
+    if (!cached.is_aligned) {
+        delete ssi;
+        ssi = nullptr;
+        return E_NOT_SUPPORT;
+    }
+
+    // Get time column metadata
+    TimeseriesIndex* time_ts_idx = nullptr;
+    if (RET_FAIL(get_time_column_metadata(top_node, time_ts_idx, ssi_pa))) {
+        delete ssi;
+        ssi = nullptr;
+        return ret;
+    }
+
+    // Create MultiAlignedTimeseriesIndex
+    void* multi_buf = ssi_pa.alloc(sizeof(MultiAlignedTimeseriesIndex));
+    if (IS_NULL(multi_buf)) {
+        delete ssi;
+        ssi = nullptr;
+        return E_OOM;
+    }
+    auto* multi_idx = new (multi_buf) MultiAlignedTimeseriesIndex;
+    multi_idx->time_ts_idx_ = time_ts_idx;
+
+    // Load each measurement's TimeseriesIndex
+    for (const auto& meas_name : measurement_names) {
+        std::shared_ptr<IMetaIndexEntry> meas_entry;
+        int64_t meas_end_offset = 0;
+        if (RET_FAIL(load_measurement_index_entry(
+                meas_name, top_node, meas_entry, meas_end_offset))) {
+            // Measurement not found — abort multi path
+            delete ssi;
+            ssi = nullptr;
+            return ret;
+        }
+
+        ITimeseriesIndex* ts_idx = nullptr;
+        if (RET_FAIL(do_load_timeseries_index(
+                meas_name, meas_entry->get_offset(), meas_end_offset, ssi_pa,
+                ts_idx, /*is_aligned=*/true))) {
+            delete ssi;
+            ssi = nullptr;
+            return ret;
+        }
+
+        auto* aligned_idx = dynamic_cast<AlignedTimeseriesIndex*>(ts_idx);
+        if (aligned_idx && aligned_idx->value_ts_idx_) {
+            multi_idx->value_ts_idxs_.push_back(aligned_idx->value_ts_idx_);
+        } else {
+            delete ssi;
+            ssi = nullptr;
+            return E_NOT_EXIST;
+        }
+    }
+
+    ssi->itimeseries_index_ = multi_idx;
+
+    // Skip global statistic filter for multi — per-chunk filtering still works.
+
+    if (RET_FAIL(ssi->init_chunk_reader())) {
+        ssi->destroy();
+        delete ssi;
+        ssi = nullptr;
+    }
+    return ret;
+}
+
 void TsFileIOReader::revert_ssi(TsFileSeriesScanIterator* ssi) {
     if (ssi != nullptr) {
         ssi->destroy();
@@ -96,61 +190,14 @@ int TsFileIOReader::get_device_timeseries_meta_without_chunk_meta(
     int64_t end_offset;
     std::vector<std::pair<std::shared_ptr<IMetaIndexEntry>, int64_t>>
         meta_index_entry_list;
-    std::shared_ptr<MetaIndexNode> top_node;
-    bool is_aligned = false;
-    TimeseriesIndex* time_timeseries_index = nullptr;
     if (RET_FAIL(load_device_index_entry(
             std::make_shared<DeviceIDComparable>(device_id), meta_index_entry,
             end_offset))) {
-    } else {
-        int64_t start_offset = meta_index_entry->get_offset();
-        ASSERT(start_offset < end_offset);
-        const int32_t read_size = end_offset - start_offset;
-        int32_t ret_read_len = 0;
-        char* data_buf = (char*)pa.alloc(read_size);
-        void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode));
-        if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) {
-            return E_OOM;
-        }
-        auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa);
-        top_node = std::shared_ptr<MetaIndexNode>(top_node_ptr,
-                                                  MetaIndexNode::self_deleter);
-        if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size,
-                                      ret_read_len))) {
-        } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) {
-        } else {
-            is_aligned = is_aligned_device(top_node);
-            if (is_aligned) {
-                if (RET_FAIL(get_time_column_metadata(
-                        top_node, time_timeseries_index, pa))) {
-                    return ret;
-                }
-            }
-        }
-    }
-    if (RET_FAIL(ret)) {
-        return ret;
-    }
-    if (RET_FAIL(load_all_measurement_index_entry(
-            meta_index_entry->get_offset(), end_offset, pa,
-            meta_index_entry_list))) {
+    } else if (RET_FAIL(load_all_measurement_index_entry(
+                   meta_index_entry->get_offset(), end_offset, pa,
+                   meta_index_entry_list))) {
     } else if (RET_FAIL(do_load_all_timeseries_index(meta_index_entry_list, pa,
                                                      timeseries_indexs))) {
-    } else if (is_aligned && time_timeseries_index != nullptr) {
-        for (size_t i = 0; i < timeseries_indexs.size(); i++) {
-            void* buf = pa.alloc(sizeof(AlignedTimeseriesIndex));
-            if (IS_NULL(buf)) {
-                return E_OOM;
-            }
-            auto* aligned_ts_idx = new (buf) AlignedTimeseriesIndex;
-            aligned_ts_idx->time_ts_idx_ = time_timeseries_index;
-            aligned_ts_idx->value_ts_idx_ =
-                dynamic_cast<TimeseriesIndex*>(timeseries_indexs[i]);
-            if (aligned_ts_idx->value_ts_idx_ == nullptr) {
-                return E_TYPE_NOT_MATCH;
-            }
-            timeseries_indexs[i] = aligned_ts_idx;
-        }
     }
     return ret;
 }
@@ -225,6 +272,20 @@ bool TsFileIOReader::filter_stasify(ITimeseriesIndex* ts_index,
     return time_filter->satisfy(ts_index->get_statistic());
 }
 
+bool TsFileIOReader::bloom_filter_contains(
+    const std::string& device_name, const std::string& measurement_name) {
+    BloomFilter* bf = tsfile_meta_.bloom_filter_;
+    if (bf == nullptr || bf->is_empty()) {
+        return true;  // no bloom filter — assume present
+    }
+    common::String dev_str, meas_str;
+    dev_str.buf_ = const_cast<char*>(device_name.c_str());
+    dev_str.len_ = static_cast<uint32_t>(device_name.size());
+    meas_str.buf_ = const_cast<char*>(measurement_name.c_str());
+    meas_str.len_ = static_cast<uint32_t>(measurement_name.size());
+    return bf->contains(dev_str, meas_str);
+}
+
 int TsFileIOReader::load_tsfile_meta_if_necessary() {
     int ret = E_OK;
     if (!tsfile_meta_ready_) {
@@ -323,44 +384,111 @@ int TsFileIOReader::load_tsfile_meta() {
     return ret;
 }
 
-int TsFileIOReader::load_timeseries_index_for_ssi(
-    std::shared_ptr<IDeviceID> device_id, const std::string& measurement_name,
-    TsFileSeriesScanIterator*& ssi) {
+int TsFileIOReader::get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
+                                           common::PageArena& pa,
+                                           CachedDeviceNode& out) {
+    std::string dev_name = device_id->get_device_name();
+
+    {
+        std::lock_guard<std::mutex> lk(device_node_cache_mu_);
+        auto it = device_node_cache_.find(dev_name);
+        if (it != device_node_cache_.end()) {
+            out = it->second;
+            return E_OK;
+        }
+    }
+
+    // Read the device meta index outside the lock — load_device_index_entry()
+    // and the file read can block on I/O, and we don't want to serialize all
+    // concurrent first-time lookups behind one slow disk fetch.  Two callers
+    // racing on the same missing device may both do the read; that's wasted
+    // work but not corruption — the second insert is dropped below.
     int ret = E_OK;
     std::shared_ptr<IMetaIndexEntry> device_index_entry;
     int64_t device_ie_end_offset = 0;
-    std::shared_ptr<IMetaIndexEntry> measurement_index_entry;
-    int64_t measurement_ie_end_offset = 0;
-    // bool is_aligned = false;
     if (RET_FAIL(load_device_index_entry(
             std::make_shared<DeviceIDComparable>(device_id), device_index_entry,
             device_ie_end_offset))) {
         return ret;
     }
-    auto& pa = ssi->timeseries_index_pa_;
 
     int64_t start_offset = device_index_entry->get_offset(),
             end_offset = device_ie_end_offset;
     ASSERT(start_offset < end_offset);
-    const int32_t read_size = end_offset - start_offset;
+    const int64_t read_size_i64 = end_offset - start_offset;
+    // read_file_->read() takes int32_t; a meta index node larger than 2 GiB
+    // is implausible but explicitly reject it instead of silently truncating
+    // the read length and corrupting the parse.
+    if (read_size_i64 <= 0 || read_size_i64 > INT32_MAX) {
+        return E_TSFILE_CORRUPTED;
+    }
+    const int32_t read_size = static_cast<int32_t>(read_size_i64);
     int32_t ret_read_len = 0;
-    char* data_buf = (char*)pa.alloc(read_size);
-    void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode));
-    if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) {
+
+    // Read into a heap-owned buffer outside the lock.  The previous
+    // implementation allocated data_buf inside device_node_cache_pa_ before
+    // the read happened — every failed read or parse left that allocation
+    // pinned forever in the shared arena, and repeated disk errors on the
+    // same device let a long-lived reader grow it without bound.  Using a
+    // unique_ptr here means the read buffer is released on every failure
+    // path, and only the small MetaIndexNode allocations inside the lock
+    // share the arena.
+    std::unique_ptr<char[]> data_buf(new (std::nothrow) char[read_size]);
+    if (data_buf == nullptr) {
         return E_OOM;
     }
-    auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa);
-    auto top_node = std::shared_ptr<MetaIndexNode>(top_node_ptr,
-                                                   MetaIndexNode::self_deleter);
-
-    if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size,
+    if (RET_FAIL(read_file_->read(start_offset, data_buf.get(), read_size,
                                   ret_read_len))) {
         return ret;
-    } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) {
+    }
+
+    CachedDeviceNode cached;
+    {
+        // Allocations into device_node_cache_pa_ and the map insert must be
+        // serialized — PageArena is not thread-safe, and unordered_map's
+        // rehash invalidates concurrent lookups.
+        std::lock_guard<std::mutex> lk(device_node_cache_mu_);
+        // Re-check: another thread may have populated the entry while we
+        // were doing I/O.
+        auto it = device_node_cache_.find(dev_name);
+        if (it != device_node_cache_.end()) {
+            out = it->second;
+            return E_OK;
+        }
+
+        void* m_idx_node_buf =
+            device_node_cache_pa_.alloc(sizeof(MetaIndexNode));
+        if (IS_NULL(m_idx_node_buf)) {
+            return E_OOM;
+        }
+        auto* top_node_ptr =
+            new (m_idx_node_buf) MetaIndexNode(&device_node_cache_pa_);
+        auto top_node = std::shared_ptr<MetaIndexNode>(
+            top_node_ptr, MetaIndexNode::self_deleter);
+        if (RET_FAIL(top_node->deserialize_from(data_buf.get(), read_size))) {
+            return ret;
+        }
+        cached.top_node = top_node;
+        cached.is_aligned = is_aligned_device(top_node);
+        device_node_cache_.emplace(std::move(dev_name), cached);
+    }
+    out = cached;
+    return E_OK;
+}
+
+int TsFileIOReader::load_timeseries_index_for_ssi(
+    std::shared_ptr<IDeviceID> device_id, const std::string& measurement_name,
+    TsFileSeriesScanIterator*& ssi) {
+    int ret = E_OK;
+    auto& pa = ssi->timeseries_index_pa_;
+
+    CachedDeviceNode cached;
+    if (RET_FAIL(get_cached_device_node(device_id, pa, cached))) {
         return ret;
     }
+    auto top_node = cached.top_node;
+    bool is_aligned = cached.is_aligned;
 
-    bool is_aligned = is_aligned_device(top_node);
     TimeseriesIndex* timeseries_index = nullptr;
     if (is_aligned) {
         if (RET_FAIL(
@@ -369,6 +497,8 @@ int TsFileIOReader::load_timeseries_index_for_ssi(
         }
     }
 
+    std::shared_ptr<IMetaIndexEntry> measurement_index_entry;
+    int64_t measurement_ie_end_offset = 0;
     if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node,
                                               measurement_index_entry,
                                               measurement_ie_end_offset))) {
@@ -570,16 +700,30 @@ int TsFileIOReader::get_timeseries_indexes(
 
     int64_t idx = 0;
     for (const auto& measurement_name : measurement_names) {
-        if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node,
-                                                  measurement_index_entry,
-                                                  measurement_ie_end_offset))) {
-        } else if (do_load_timeseries_index(
-                       measurement_name, measurement_index_entry->get_offset(),
-                       measurement_ie_end_offset, pa, timeseries_indexs[idx],
-                       is_aligned) == E_NOT_EXIST) {
+        timeseries_indexs[idx] = nullptr;
+        ret = load_measurement_index_entry(measurement_name, top_node,
+                                           measurement_index_entry,
+                                           measurement_ie_end_offset);
+        if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_NOT_EXIST) {
+            ret = E_OK;
             idx++;
             continue;
         }
+        if (RET_FAIL(ret)) {
+            return ret;
+        }
+
+        ret = do_load_timeseries_index(
+            measurement_name, measurement_index_entry->get_offset(),
+            measurement_ie_end_offset, pa, timeseries_indexs[idx], is_aligned);
+        if (ret == E_NOT_EXIST) {
+            ret = E_OK;
+            idx++;
+            continue;
+        }
+        if (RET_FAIL(ret)) {
+            return ret;
+        }
         if (is_aligned) {
             AlignedTimeseriesIndex* aligned_timeseries_index =
                 dynamic_cast<AlignedTimeseriesIndex*>(timeseries_indexs[idx]);
@@ -677,6 +821,9 @@ int TsFileIOReader::search_from_internal_node(
 
 bool TsFileIOReader::is_aligned_device(
     std::shared_ptr<MetaIndexNode> measurement_node) {
+    if (measurement_node->children_.empty()) {
+        return false;
+    }
     auto entry = measurement_node->children_[0];
     return entry->get_name().is_null() ||
            entry->get_name().to_std_string() == "";
diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h
index 85443326f..db3030419 100644
--- a/cpp/src/file/tsfile_io_reader.h
+++ b/cpp/src/file/tsfile_io_reader.h
@@ -20,6 +20,8 @@
 #ifndef FILE_TSFILE_IO_REAER_H
 #define FILE_TSFILE_IO_REAER_H
 
+#include <mutex>
+#include <unordered_map>
 #include <unordered_set>
 
 #include "common/tsblock/tsblock.h"
@@ -46,6 +48,7 @@ class TsFileIOReader {
           tsfile_meta_ready_(false),
           read_file_created_(false) {
         tsfile_meta_page_arena_.init(512, common::MOD_TSFILE_READER);
+        device_node_cache_pa_.init(512, common::MOD_TSFILE_READER);
     }
 
     int init(const std::string& file_path);
@@ -59,6 +62,11 @@ class TsFileIOReader {
                   TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
                   Filter* time_filter = nullptr);
 
+    int alloc_multi_ssi(std::shared_ptr<IDeviceID> device_id,
+                        const std::vector<std::string>& measurement_names,
+                        TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
+                        Filter* time_filter = nullptr);
+
     void revert_ssi(TsFileSeriesScanIterator* ssi);
 
     std::string get_file_path() const { return read_file_->file_path(); }
@@ -147,17 +155,40 @@ class TsFileIOReader {
 
     bool filter_stasify(ITimeseriesIndex* ts_index, Filter* time_filter);
 
+    bool bloom_filter_contains(const std::string& device_name,
+                               const std::string& measurement_name);
+
     int get_all_leaf(
         std::shared_ptr<MetaIndexNode> index_node,
         std::vector<std::pair<std::shared_ptr<IMetaIndexEntry>, int64_t>>&
             index_node_entry_list);
 
+    struct CachedDeviceNode {
+        std::shared_ptr<MetaIndexNode> top_node;
+        bool is_aligned;
+    };
+
+    // Returns E_OK on hit (out is filled), or an error code on miss / load
+    // failure (E_DEVICE_NOT_EXIST when the device is absent, the propagated
+    // error otherwise).  Copying into out keeps the caller safe from rehash /
+    // concurrent eviction of the cache map.
+    int get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
+                               common::PageArena& pa, CachedDeviceNode& out);
+
    private:
     ReadFile* read_file_;
     common::PageArena tsfile_meta_page_arena_;
     TsFileMeta tsfile_meta_;
     bool tsfile_meta_ready_;
     bool read_file_created_;
+    // Cache: device_name → deserialized measurement MetaIndexNode.
+    // Guarded by device_node_cache_mu_ — multiple SSIs and Result Sets can
+    // hit the cache concurrently on the same reader, and an unsynchronized
+    // unordered_map insert would race with a parallel lookup (rehash,
+    // bucket-list rewrite) and with the underlying PageArena allocation.
+    common::PageArena device_node_cache_pa_;
+    std::unordered_map<std::string, CachedDeviceNode> device_node_cache_;
+    mutable std::mutex device_node_cache_mu_;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/file/tsfile_io_writer.cc b/cpp/src/file/tsfile_io_writer.cc
index 42d99feda..71bb08a7e 100644
--- a/cpp/src/file/tsfile_io_writer.cc
+++ b/cpp/src/file/tsfile_io_writer.cc
@@ -21,6 +21,8 @@
 
 #include <fcntl.h>
 
+#include <chrono>
+#include <iomanip>
 #include <memory>
 
 #include "common/device_id.h"
@@ -40,14 +42,20 @@ namespace storage {
 #define OFFSET_DEBUG(msg) void(msg)
 #endif
 
+int64_t TsFileIOWriter::get_meta_size() const {
+    return meta_allocator_.get_total_used_bytes();
+}
+
 int TsFileIOWriter::init(WriteFile* write_file) {
     int ret = E_OK;
     const uint32_t page_size = 1024;
     meta_allocator_.init(page_size, MOD_TSFILE_WRITER_META);
     chunk_meta_count_ = 0;
-    recovery_chunk_meta_prefix_.clear();
-    destroyed_ = false;
     file_ = write_file;
+    // Re-arm destroy() for the new lifecycle.  Without this, a writer that
+    // was destroy()'d and then init()'d again would leak the fresh
+    // meta_allocator_/write_stream_/file_ on its next destroy().
+    destroyed_ = false;
     return ret;
 }
 
@@ -55,48 +63,37 @@ void TsFileIOWriter::destroy() {
     if (destroyed_) {
         return;
     }
-    // Recovery attaches a prefix of ChunkGroupMeta; device_id and chunk stats
-    // in that snapshot live in reader/recovery memory. After open, new chunks
-    // may be pushed into the same ChunkGroupMeta (same device); only those
-    // appended ChunkMeta need statistic_->destroy() (see
-    // recovery_chunk_meta_prefix_).
-    for (auto iter = chunk_group_meta_list_.begin();
-         iter != chunk_group_meta_list_.end(); iter++) {
-        ChunkGroupMeta* cgm = iter.get();
-        auto prefix_it = recovery_chunk_meta_prefix_.find(cgm);
-        const bool is_recovery_cgm =
-            chunk_group_meta_from_recovery_ && cgm != nullptr &&
-            prefix_it != recovery_chunk_meta_prefix_.end();
-        uint32_t recovered_cm_count = is_recovery_cgm ? prefix_it->second : 0;
-
-        if (!is_recovery_cgm) {
-            if (cgm != nullptr && cgm->device_id_) {
-                cgm->device_id_.reset();
-            }
-        }
-
-        if (cgm == nullptr) {
-            continue;
-        }
-        uint32_t cm_idx = 0;
-        for (auto chunk_meta = cgm->chunk_meta_list_.begin();
-             chunk_meta != cgm->chunk_meta_list_.end();
-             chunk_meta++, cm_idx++) {
-            if (chunk_meta.get() == nullptr ||
-                chunk_meta.get()->statistic_ == nullptr) {
-                continue;
-            }
-            if (is_recovery_cgm && cm_idx < recovered_cm_count) {
-                continue;
-            }
-            chunk_meta.get()->statistic_->destroy();
+    // Free heap-allocated PageArenas held by each appended statistic and
+    // drop shared_ptr refs on each appended CGM's device_id_.  Recovered
+    // entries from RestorableTsFileIOWriter live in self_check_arena_ and
+    // are not tracked here; the restorable writer cleans those up itself.
+    for (ChunkMeta* cm : appended_chunk_metas_) {
+        if (cm != nullptr && cm->statistic_ != nullptr) {
+            cm->statistic_->destroy();
         }
     }
-
-    if (cur_chunk_meta_ != nullptr && cur_chunk_meta_->statistic_ != nullptr) {
-        cur_chunk_meta_->statistic_->destroy();
-        cur_chunk_meta_ = nullptr;
+    appended_chunk_metas_.clear();
+    for (ChunkGroupMeta* cgm : appended_chunk_group_metas_) {
+        if (cgm != nullptr && cgm->device_id_) {
+            cgm->device_id_.reset();
+        }
     }
+    appended_chunk_group_metas_.clear();
+    // Drop every pointer that referenced meta_allocator_-owned memory before
+    // destroying the arena.  Without this, a reused writer (destroy() + a new
+    // init()) would still see the dangling CGM list/index/cur_* slots from
+    // the previous lifecycle and dereference freed nodes the next time
+    // start_flush_chunk_group() linear-scans the list.
+    chunk_group_meta_list_.clear();
+    chunk_group_meta_index_.clear();
+    cur_chunk_meta_ = nullptr;
+    cur_chunk_group_meta_ = nullptr;
+    cur_device_name_.reset();
+    chunk_meta_count_ = 0;
+    use_prev_alloc_cgm_ = false;
+    is_aligned_ = false;
+    file_base_offset_ = 0;
+    destroyed_ = true;
 
     meta_allocator_.destroy();
     write_stream_.destroy();
@@ -104,7 +101,6 @@ void TsFileIOWriter::destroy() {
         delete file_;
         file_ = nullptr;
     }
-    destroyed_ = true;
 }
 
 int TsFileIOWriter::start_file() {
@@ -145,6 +141,7 @@ int TsFileIOWriter::start_flush_chunk_group(
         } else {
             cur_chunk_group_meta_ = new (buf) ChunkGroupMeta(&meta_allocator_);
             cur_chunk_group_meta_->init(device_name);
+            appended_chunk_group_metas_.push_back(cur_chunk_group_meta_);
         }
     }
     return ret;
@@ -183,6 +180,7 @@ int TsFileIOWriter::start_flush_chunk(common::ByteStream& chunk_data,
         ret = cur_chunk_meta_->init(mname, data_type, cur_file_position(),
                                     chunk_statistic_copy, mask, encoding,
                                     compression, meta_allocator_);
+        appended_chunk_metas_.push_back(cur_chunk_meta_);
     }
 
     // Step 2. serialize chunk header to write_stream_
@@ -258,6 +256,8 @@ int TsFileIOWriter::end_flush_chunk_group(bool is_aligned) {
         cur_chunk_group_meta_ = nullptr;
         return common::E_OK;
     }
+    chunk_group_meta_index_[cur_device_name_->get_device_name()] =
+        cur_chunk_group_meta_;
     int ret = chunk_group_meta_list_.push_back(cur_chunk_group_meta_);
     cur_chunk_group_meta_ = nullptr;
     return ret;
@@ -269,17 +269,19 @@ int TsFileIOWriter::end_file() {
         return E_OK;
     }
     OFFSET_DEBUG("before end file");
+
     if (RET_FAIL(write_log_index_range())) {
         std::cout << "writer range index error, ret =" << ret << std::endl;
     } else if (RET_FAIL(write_file_index())) {
         std::cout << "writer file index error, ret = " << ret << std::endl;
     } else if (RET_FAIL(write_file_footer())) {
         std::cout << "writer file footer error, ret = " << ret << std::endl;
-    } else if (RET_FAIL(sync_file())) {
+    } else if (g_config_value_.sync_on_close_ && RET_FAIL(sync_file())) {
         std::cout << "sync file error, ret = " << ret << std::endl;
     } else if (RET_FAIL(close_file())) {
         std::cout << "close file error, ret = " << ret << std::endl;
     }
+
     return ret;
 }
 
diff --git a/cpp/src/file/tsfile_io_writer.h b/cpp/src/file/tsfile_io_writer.h
index 088e52f56..4904b924a 100644
--- a/cpp/src/file/tsfile_io_writer.h
+++ b/cpp/src/file/tsfile_io_writer.h
@@ -21,6 +21,7 @@
 #define FILE_TSFILE_IO_WRITER_H
 
 #include <map>
+#include <unordered_map>
 #include <vector>
 
 #include "common/allocator/page_arena.h"
@@ -108,6 +109,7 @@ class TsFileIOWriter {
 
     FORCE_INLINE std::string get_file_path() { return file_->get_file_path(); }
     FORCE_INLINE std::shared_ptr<Schema> get_schema() { return schema_; }
+    int64_t get_meta_size() const;
 
    private:
     int write_log_index_range();
@@ -191,13 +193,19 @@ class TsFileIOWriter {
     /** For RestorableTsFileIOWriter: append a recovered ChunkGroupMeta. */
     void push_chunk_group_meta(ChunkGroupMeta* cgm) {
         chunk_group_meta_list_.push_back(cgm);
+        if (cgm->device_id_) {
+            chunk_group_meta_index_[cgm->device_id_->get_device_name()] = cgm;
+        }
     }
-    /** True when chunk_group_meta_list_ has a prefix loaded from recovery;
-     * destroy() must not free device_id_/statistic_ for that prefix only. */
-    bool chunk_group_meta_from_recovery_ = false;
-    /** Recovered ChunkGroupMeta* -> chunk_meta_list_.size() at attach (pointer
-     * keys avoid idx skew). */
-    std::map<ChunkGroupMeta*, uint32_t> recovery_chunk_meta_prefix_;
+    /** Chunks/CGMs allocated from meta_allocator_ via start_flush_chunk*()
+     * (post-recovery for the restorable writer, all chunks for the normal
+     * writer).  destroy() iterates these directly to free the heap-allocated
+     * PageArena owned by each statistic and the shared_ptr<IDeviceID> held
+     * by each new CGM, without touching recovery-owned entries that live in
+     * RestorableTsFileIOWriter::self_check_arena_. */
+    std::vector<ChunkMeta*> appended_chunk_metas_;
+    std::vector<ChunkGroupMeta*> appended_chunk_group_metas_;
+    bool destroyed_ = false;
     /**
      * Recovery only: set file_base_offset_ so that cur_file_position() returns
      * correct absolute offsets.  After recovery the writer behaves as if the
@@ -214,6 +222,9 @@ class TsFileIOWriter {
     ChunkGroupMeta* cur_chunk_group_meta_;
     int32_t chunk_meta_count_;  // for debug
     common::SimpleList<ChunkGroupMeta*> chunk_group_meta_list_;
+    // O(1) lookup for existing ChunkGroupMeta by device name, avoiding the
+    // O(N) linear scan through chunk_group_meta_list_ per device.
+    std::unordered_map<std::string, ChunkGroupMeta*> chunk_group_meta_index_;
     bool use_prev_alloc_cgm_;  // chunk group meta
     std::shared_ptr<IDeviceID> cur_device_name_;
     WriteFile* file_;
@@ -227,10 +238,6 @@ class TsFileIOWriter {
     /** Recovery only: absolute file offset at which write_stream_ logically
      * begins.  Normal (non-recovery) path keeps this at 0. */
     int64_t file_base_offset_ = 0;
-    /** Set after destroy() completes; avoids double cleanup when
-     * RestorableTsFileIOWriter::close() calls destroy() before
-     * self_check_arena_.destroy(), then ~TsFileIOWriter runs again. */
-    bool destroyed_ = false;
 
     friend class RestorableTsFileIOWriter;  // uses push_chunk_group_meta
 };
diff --git a/cpp/src/reader/aligned_chunk_reader.cc b/cpp/src/reader/aligned_chunk_reader.cc
index 49c469547..f130b524d 100644
--- a/cpp/src/reader/aligned_chunk_reader.cc
+++ b/cpp/src/reader/aligned_chunk_reader.cc
@@ -19,8 +19,13 @@
 
 #include "aligned_chunk_reader.h"
 
+#include <algorithm>
 #include <limits>
 
+#include "common/global.h"
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
 #include "compress/compressor_factory.h"
 #include "encoding/decoder_factory.h"
 
@@ -56,19 +61,74 @@ void AlignedChunkReader::reset() {
     if (file_data_buf != nullptr) {
         mem_free(file_data_buf);
     }
+    time_in_stream_.clear_wrapped_buf();
     time_in_stream_.reset();
     file_data_buf = value_in_stream_.get_wrapped_buf();
     if (file_data_buf != nullptr) {
         mem_free(file_data_buf);
     }
+    value_in_stream_.clear_wrapped_buf();
     value_in_stream_.reset();
     file_data_time_buf_size_ = 0;
     file_data_value_buf_size_ = 0;
     time_chunk_visit_offset_ = 0;
     value_chunk_visit_offset_ = 0;
+    page_plan_built_ = false;
+    current_page_loaded_ = false;
+    current_page_plan_index_ = 0;
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+
+    // Free leftover uncompressed buffers from the previous chunk.
+    if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) {
+        time_compressor_->after_uncompress(time_uncompressed_buf_);
+        time_uncompressed_buf_ = nullptr;
+    }
+
+    // Multi-value reset
+    for (auto* col : value_columns_) {
+        // Free uncompressed buffer before resetting.
+        if (col->uncompressed_buf != nullptr && col->compressor != nullptr) {
+            col->compressor->after_uncompress(col->uncompressed_buf);
+            col->uncompressed_buf = nullptr;
+        }
+        char* buf = col->in_stream.get_wrapped_buf();
+        if (buf != nullptr) mem_free(buf);
+        col->in_stream.clear_wrapped_buf();
+        col->in_stream.reset();
+        col->in.reset();
+        col->chunk_header.reset();
+        col->cur_page_header.reset();
+        col->file_data_buf_size = 0;
+        col->chunk_visit_offset = 0;
+        col->notnull_bitmap.clear();
+        col->cur_value_index = -1;
+        col->chunk_meta = nullptr;
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        col->pending_decoded_count = 0;
+        col->pending_decoded_cursor = 0;
+        col->pending_decoded = false;
+        // Note: decoder/compressor are NOT freed here — they are reused by
+        // alloc_compressor_and_decoder() in load_by_aligned_meta_multi().
+    }
+    release_current_page_state();
+    chunk_pages_.clear();
+    per_page_times_.clear();
 }
 
 void AlignedChunkReader::destroy() {
+    // .clear() leaves the vector's internal heap buffer allocated, which
+    // mem_free can't reach because we placement-new the reader. swap with
+    // an empty vector to actually release the backing storage so ASan's
+    // LeakSanitizer doesn't flag the (rather large) ChunkPageInfo buffers.
+    std::vector<ChunkPageInfo>{}.swap(chunk_pages_);
+    std::vector<int64_t>{}.swap(page_all_times_);
     if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) {
         time_compressor_->after_uncompress(time_uncompressed_buf_);
         time_uncompressed_buf_ = nullptr;
@@ -112,6 +172,53 @@ void AlignedChunkReader::destroy() {
     }
     cur_value_page_header_.reset();
     chunk_header_.~ChunkHeader();
+
+    // Multi-value destroy
+    for (size_t ci = 0; ci < value_columns_.size(); ci++) {
+        auto* col = value_columns_[ci];
+        if (col->decoder != nullptr) {
+            col->decoder->~Decoder();
+            DecoderFactory::free(col->decoder);
+            col->decoder = nullptr;
+        }
+        if (col->compressor != nullptr) {
+            col->compressor->~Compressor();
+            CompressorFactory::free(col->compressor);
+            col->compressor = nullptr;
+        }
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        buf = col->in_stream.get_wrapped_buf();
+        if (buf != nullptr) {
+            mem_free(buf);
+            col->in_stream.clear_wrapped_buf();
+        }
+        col->cur_page_header.reset();
+        delete col;
+    }
+    value_columns_.clear();
+    release_current_page_state();
+    per_page_times_.clear();
+#ifdef ENABLE_THREADS
+    decode_pool_ = nullptr;  // borrowed, not owned
+    for (auto* d : time_decoder_pool_) {
+        if (d != nullptr) {
+            d->~Decoder();
+            DecoderFactory::free(d);
+        }
+    }
+    time_decoder_pool_.clear();
+    for (auto* c : time_compressor_pool_) {
+        if (c != nullptr) {
+            c->~Compressor();
+            CompressorFactory::free(c);
+        }
+    }
+    time_compressor_pool_.clear();
+#endif
 }
 
 int AlignedChunkReader::load_by_aligned_meta(ChunkMeta* time_chunk_meta,
@@ -218,15 +325,19 @@ int AlignedChunkReader::alloc_compressor_and_decoder(
 
 int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
                                       Filter* oneshoot_filter, PageArena& pa) {
+    if (multi_value_mode_) {
+        return get_next_page_multi(ret_tsblock, oneshoot_filter, pa);
+    }
     int ret = E_OK;
     Filter* filter =
         (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
-    if (prev_time_page_not_finish() && prev_value_page_not_finish()) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_value_page_not_finish();
+    if (pt && pv) {
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
         return ret;
     }
-    if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) {
+    if (!pt && !pv) {
         while (IS_SUCC(ret)) {
             if (RET_FAIL(get_cur_page_header(
                     time_chunk_meta_, time_in_stream_, cur_time_page_header_,
@@ -249,8 +360,7 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         }
     }
     if (IS_SUCC(ret)) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
     }
     return ret;
 }
@@ -259,7 +369,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta,
                                             common::ByteStream& in_stream,
                                             PageHeader& cur_page_header,
                                             uint32_t& chunk_visit_offset,
-                                            ChunkHeader& chunk_header) {
+                                            ChunkHeader& chunk_header,
+                                            int32_t* override_buf_size) {
     int ret = E_OK;
     bool retry = true;
     int cur_page_header_serialized_size = 0;
@@ -282,7 +393,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta,
             retry = false;
             retry_read_want_size += 1024;
             int32_t& file_data_buf_size =
-                chunk_header.data_type_ == common::VECTOR
+                override_buf_size != nullptr ? *override_buf_size
+                : chunk_header.data_type_ == common::VECTOR
                     ? file_data_time_buf_size_
                     : file_data_value_buf_size_;
             // do not shrink buffer for page header, otherwise, the buffer is
@@ -326,9 +438,13 @@ int AlignedChunkReader::read_from_file_and_rewrap(
         (may_shrink && read_size < file_data_buf_size / 10)) {
         file_data_buf = (char*)mem_realloc(file_data_buf, read_size);
         if (IS_NULL(file_data_buf)) {
+            in_stream_.clear_wrapped_buf();
             return E_OOM;
         }
         file_data_buf_size = read_size;
+        // Update stream pointer immediately so it stays valid even if
+        // the subsequent read fails and the caller frees via destroy().
+        in_stream_.wrap_from(file_data_buf, read_size);
     }
     int ret_read_len = 0;
     if (RET_FAIL(
@@ -563,6 +679,7 @@ int AlignedChunkReader::decode_time_value_buf_into_tsblock(
                 row_appender.append_null(1);                                   \
                 continue;                                                      \
             }                                                                  \
+            assert(value_decoder_->has_remaining(value_in));                   \
             if (!value_decoder_->has_remaining(value_in)) {                    \
                 return common::E_DATA_INCONSISTENCY;                           \
             }                                                                  \
@@ -597,19 +714,19 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(
         if (value_page_col_notnull_bitmap_.empty() ||
             ((value_page_col_notnull_bitmap_[cur_value_index / 8] & 0xFF) &
              (mask >> (cur_value_index % 8))) == 0) {
-            if (UNLIKELY(!row_appender.add_row())) {
-                ret = E_OVERFLOW;
-                cur_value_index--;
-                break;
-            }
             ret = time_decoder_->read_int64(time, time_in);
             if (ret != E_OK) {
                 break;
             }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
             row_appender.append(0, (char*)&time, sizeof(time));
             row_appender.append_null(1);
             continue;
         }
+        assert(value_decoder_->has_remaining(value_in));
         if (!value_decoder_->has_remaining(value_in)) {
             return common::E_DATA_INCONSISTENCY;
         }
@@ -632,6 +749,566 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(
     return ret;
 }
 
+int AlignedChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in,
+                                            ByteStream& value_in,
+                                            RowAppender& row_appender,
+                                            Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int32_t values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // skip_* may legitimately fail (truncated page) or
+                        // short-read (corrupt bitmap vs. data); both must
+                        // abort the loop rather than silently desync the
+                        // value decoder.  Same defect the multi-value path
+                        // already guards against.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_int32(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_int32(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_int32(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                int32_t val = values[val_idx++];
+                if (filter != nullptr && !block_all_pass &&
+                    !filter->satisfy(times[i], (int64_t)val)) {
+                    continue;
+                }
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(int32_t));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in,
+                                            ByteStream& value_in,
+                                            RowAppender& row_appender,
+                                            Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int64_t values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check: skip entire block if out of range
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_int64(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_int64(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_int64(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                int64_t val = values[val_idx++];
+                if (filter != nullptr && !block_all_pass &&
+                    !filter->satisfy(times[i], val)) {
+                    continue;
+                }
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(int64_t));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
+                                              ByteStream& value_in,
+                                              RowAppender& row_appender,
+                                              Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    float values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_float(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_float(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_float(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                float val = values[val_idx++];
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(float));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in,
+                                               ByteStream& value_in,
+                                               RowAppender& row_appender,
+                                               Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    double values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_double(nonnull, sk,
+                                                                 value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_double(nonnull_count, skipped,
+                                                         value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_double(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                double val = values[val_idx++];
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(double));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
 int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype(
     ByteStream& time_in, ByteStream& value_in, TsBlock* ret_tsblock,
     Filter* filter, common::PageArena* pa) {
@@ -644,23 +1321,24 @@ int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype(
             break;
         case common::DATE:
         case common::INT32:
-            // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_,
-            //                              row_appender);
-            ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_,
-                                                   row_appender, filter);
+            // Batch decode path: read_batch_int{32,64} consumes whole TS_2DIFF
+            // blocks at once (and uses SIMD when ENABLE_SIMD); replaces a
+            // per-value decode() loop that hot-dominated the read flame graph.
+            ret =
+                i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::TIMESTAMP:
         case common::INT64:
-            DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_,
-                                         row_appender);
+            ret =
+                i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::FLOAT:
-            DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_,
-                                         row_appender);
+            ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                        filter);
             break;
         case common::DOUBLE:
-            DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_,
-                                         row_appender);
+            ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                         filter);
             break;
         case common::STRING:
         case common::BLOB:
@@ -695,6 +1373,7 @@ int AlignedChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK(
         }
 
         if (should_read_data) {
+            assert(value_decoder_->has_remaining(value_in));
             if (!value_decoder_->has_remaining(value_in)) {
                 return E_DATA_INCONSISTENCY;
             }
@@ -740,21 +1419,15 @@ bool AlignedChunkReader::should_skip_page_by_offset(int& row_offset) {
     if (row_offset <= 0) {
         return false;
     }
-    // Aligned TV pages: only skip a whole page by count when both page headers
-    // expose the same positive row count. Using a single side (or min) when
-    // the other is missing or unequal can desynchronize row_offset from
-    // decoded row order vs. the paired time/value stream.
-    Statistic* ts = cur_time_page_header_.statistic_;
-    Statistic* vs = cur_value_page_header_.statistic_;
-    if (ts == nullptr || vs == nullptr) {
-        return false;
+    // Use time page statistic for count.
+    Statistic* stat = cur_time_page_header_.statistic_;
+    if (stat == nullptr) {
+        stat = cur_value_page_header_.statistic_;
     }
-    int32_t tc = ts->count_;
-    int32_t vc = vs->count_;
-    if (tc <= 0 || vc <= 0 || tc != vc) {
+    if (stat == nullptr || stat->count_ == 0) {
         return false;
     }
-    int32_t count = tc;
+    int32_t count = stat->count_;
     if (row_offset >= count) {
         row_offset -= count;
         return true;
@@ -766,6 +1439,19 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
                                       Filter* oneshoot_filter, PageArena& pa,
                                       int64_t min_time_hint, int& row_offset,
                                       int& row_limit) {
+    if (multi_value_mode_) {
+        // Multi-value aligned path doesn't yet honour row_offset / row_limit
+        // / min_time_hint — they get dropped on the floor, which silently
+        // returns full chunk data when the caller asked for a sub-range.
+        // Refuse the combination so the caller sees an actual error instead
+        // of garbage results.  set_row_range(0, -1) keeps the all-rows
+        // contract intact for normal queries.
+        if (row_offset > 0 || row_limit >= 0 ||
+            min_time_hint != std::numeric_limits<int64_t>::min()) {
+            return common::E_NOT_SUPPORT;
+        }
+        return get_next_page_multi(ret_tsblock, oneshoot_filter, pa);
+    }
     int ret = E_OK;
     Filter* filter =
         (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
@@ -774,12 +1460,14 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         return E_NO_MORE_DATA;
     }
 
-    if (prev_time_page_not_finish() && prev_value_page_not_finish()) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_value_page_not_finish();
+
+    if (pt && pv) {
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
         return ret;
     }
-    if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) {
+    if (!pt && !pv) {
         while (IS_SUCC(ret)) {
             if (RET_FAIL(get_cur_page_header(
                     time_chunk_meta_, time_in_stream_, cur_time_page_header_,
@@ -810,10 +1498,1560 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         }
     }
     if (IS_SUCC(ret)) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
+    }
+    return ret;
+}
+
+// ══════════════════════════════════════════════════════════════════════════
+//  Multi-value AlignedChunkReader implementation
+// ══════════════════════════════════════════════════════════════════════════
+
+int AlignedChunkReader::load_by_aligned_meta_multi(
+    ChunkMeta* time_chunk_meta, const std::vector<ChunkMeta*>& value_metas) {
+    int ret = E_OK;
+    multi_value_mode_ = true;
+    time_chunk_meta_ = time_chunk_meta;
+    page_plan_built_ = false;
+    current_page_loaded_ = false;
+    current_page_plan_index_ = 0;
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+
+    // ── Load time chunk header ──
+    file_data_time_buf_size_ = 1024;
+    int32_t ret_read_len = 0;
+    char* time_file_data_buf =
+        (char*)mem_alloc(file_data_time_buf_size_, MOD_CHUNK_READER);
+    if (IS_NULL(time_file_data_buf)) return E_OOM;
+
+    ret = read_file_->read(time_chunk_meta_->offset_of_chunk_header_,
+                           time_file_data_buf, file_data_time_buf_size_,
+                           ret_read_len);
+    if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) {
+        ret = E_TSFILE_CORRUPTED;
+        mem_free(time_file_data_buf);
+        return ret;
+    }
+    if (IS_SUCC(ret)) {
+        time_in_stream_.wrap_from(time_file_data_buf, ret_read_len);
+        if (RET_FAIL(time_chunk_header_.deserialize_from(time_in_stream_))) {
+            return ret;
+        }
+        time_chunk_visit_offset_ = time_in_stream_.read_pos();
+    }
+
+    // Alloc time decoder/compressor
+    if (IS_SUCC(ret)) {
+        if (RET_FAIL(alloc_compressor_and_decoder(
+                time_decoder_, time_compressor_,
+                time_chunk_header_.encoding_type_,
+                time_chunk_header_.data_type_,
+                time_chunk_header_.compression_type_))) {
+            return ret;
+        }
+    }
+
+    // ── Load each value column ──
+    // Reuse existing ValueColumnState objects if count matches (reset() already
+    // cleared their internal state).  Otherwise, recreate.
+    if (value_columns_.size() != value_metas.size()) {
+        for (auto* p : value_columns_) delete p;
+        value_columns_.clear();
+        value_columns_.reserve(value_metas.size());
+        for (size_t c = 0; c < value_metas.size(); c++) {
+            value_columns_.push_back(new ValueColumnState);
+        }
+    }
+    for (size_t c = 0; c < value_metas.size() && IS_SUCC(ret); c++) {
+        auto* col = value_columns_[c];
+        col->chunk_meta = value_metas[c];
+        col->file_data_buf_size = 1024;
+        ret_read_len = 0;
+        char* vbuf =
+            (char*)mem_alloc(col->file_data_buf_size, MOD_CHUNK_READER);
+        if (IS_NULL(vbuf)) return E_OOM;
+
+        ret = read_file_->read(col->chunk_meta->offset_of_chunk_header_, vbuf,
+                               col->file_data_buf_size, ret_read_len);
+        if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) {
+            ret = E_TSFILE_CORRUPTED;
+            mem_free(vbuf);
+            break;
+        }
+        if (IS_SUCC(ret)) {
+            col->in_stream.wrap_from(vbuf, ret_read_len);
+            if (RET_FAIL(col->chunk_header.deserialize_from(col->in_stream))) {
+                break;
+            }
+            col->chunk_visit_offset = col->in_stream.read_pos();
+            if (RET_FAIL(alloc_compressor_and_decoder(
+                    col->decoder, col->compressor,
+                    col->chunk_header.encoding_type_,
+                    col->chunk_header.data_type_,
+                    col->chunk_header.compression_type_))) {
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+bool AlignedChunkReader::has_more_data_multi() const {
+    if (page_plan_built_) {
+        if (current_page_loaded_) {
+            return page_time_cursor_ < page_time_count_;
+        }
+        return current_page_plan_index_ < chunk_pages_.size();
+    }
+    if (prev_time_page_not_finish() || prev_any_value_page_not_finish_multi()) {
+        return true;
+    }
+    if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ <
+        time_chunk_header_.data_size_) {
+        return true;
+    }
+    for (const auto* col : value_columns_) {
+        if (col->chunk_visit_offset - col->chunk_header.serialized_size_ <
+            col->chunk_header.data_size_) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AlignedChunkReader::prev_any_value_page_not_finish_multi() const {
+    for (const auto* col : value_columns_) {
+        if ((col->decoder && col->decoder->has_remaining(col->in)) ||
+            col->in.has_remaining()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AlignedChunkReader::has_variable_length_value_column() const {
+    for (const auto* col : value_columns_) {
+        if (col->chunk_header.data_type_ == common::STRING ||
+            col->chunk_header.data_type_ == common::TEXT ||
+            col->chunk_header.data_type_ == common::BLOB) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int AlignedChunkReader::count_non_null_prefix(
+    const std::vector<uint8_t>& bitmap, int32_t row_limit) const {
+    if (row_limit <= 0 || bitmap.empty()) {
+        return 0;
+    }
+    const uint32_t mask_base = 1 << 7;
+    int count = 0;
+    for (int32_t i = 0; i < row_limit; i++) {
+        if (((bitmap[i / 8] & 0xFF) & (mask_base >> (i % 8))) != 0) {
+            count++;
+        }
+    }
+    return count;
+}
+
+int AlignedChunkReader::decode_time_page_direct(
+    const ChunkPageInfo& page_info, std::vector<int64_t>& out_times) {
+    return decode_time_page_with(page_info, out_times, time_decoder_,
+                                 time_compressor_);
+}
+
+// Worker-safe variant: uses caller-provided decoder + compressor instead of
+// the shared time_decoder_/time_compressor_ members.  Used by the parallel
+// time-page decode dispatch in decode_all_planned_pages.
+int AlignedChunkReader::decode_time_page_with(const ChunkPageInfo& page_info,
+                                              std::vector<int64_t>& out_times,
+                                              Decoder* decoder,
+                                              Compressor* compressor) {
+    out_times.clear();
+    if (page_info.time_compressed_size == 0) {
+        return E_OK;
+    }
+
+    char stack_buf[4096];
+    char* compressed_buf = stack_buf;
+    bool heap = page_info.time_compressed_size > sizeof(stack_buf);
+    if (heap) {
+        compressed_buf = static_cast<char*>(common::mem_alloc(
+            page_info.time_compressed_size, common::MOD_DEFAULT));
+        if (compressed_buf == nullptr) {
+            return E_OOM;
+        }
+    }
+
+    int32_t read_len = 0;
+    int ret = read_file_->read(page_info.time_file_offset, compressed_buf,
+                               page_info.time_compressed_size, read_len);
+    if (IS_FAIL(ret)) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    // ReadFile::read() returns E_OK + short read_len on EOF; uncompressing
+    // page_info.time_compressed_size from a buffer with uninitialised tail
+    // bytes would feed garbage to the decompressor.
+    if (read_len != static_cast<int32_t>(page_info.time_compressed_size)) {
+        if (heap) common::mem_free(compressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(compressor->reset(false))) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    ret = compressor->uncompress(compressed_buf, page_info.time_compressed_size,
+                                 uncompressed_buf, uncompressed_size);
+    if (heap && compressed_buf != uncompressed_buf) {
+        common::mem_free(compressed_buf);
+    }
+    if (IS_FAIL(ret) || uncompressed_size != page_info.time_uncompressed_size) {
+        if (uncompressed_buf != nullptr) {
+            compressor->after_uncompress(uncompressed_buf);
+        }
+        return E_TSFILE_CORRUPTED;
+    }
+
+    common::ByteStream in;
+    in.wrap_from(uncompressed_buf, uncompressed_size);
+    decoder->reset();
+    const int batch_size = 1024;
+    int64_t batch[batch_size];
+    while (decoder->has_remaining(in)) {
+        int actual = 0;
+        if (RET_FAIL(
+                decoder->read_batch_int64(batch, batch_size, actual, in))) {
+            break;
+        }
+        if (actual == 0) {
+            break;
+        }
+        out_times.insert(out_times.end(), batch, batch + actual);
+    }
+    compressor->after_uncompress(uncompressed_buf);
+    return ret;
+}
+
+int AlignedChunkReader::build_page_plan(Filter* filter) {
+    int ret = E_OK;
+    chunk_pages_.clear();
+    current_page_plan_index_ = 0;
+    current_page_loaded_ = false;
+    page_plan_built_ = false;
+
+    const uint32_t num_cols = value_columns_.size();
+    while (IS_SUCC(ret)) {
+        if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ >=
+            time_chunk_header_.data_size_) {
+            break;
+        }
+
+        if (RET_FAIL(get_cur_page_header(
+                time_chunk_meta_, time_in_stream_, cur_time_page_header_,
+                time_chunk_visit_offset_, time_chunk_header_))) {
+            break;
+        }
+        if (cur_time_page_header_.compressed_size_ == 0 &&
+            cur_time_page_header_.uncompressed_size_ == 0) {
+            break;
+        }
+
+        ChunkPageInfo page_info;
+        page_info.time_file_offset = time_chunk_meta_->offset_of_chunk_header_ +
+                                     time_chunk_visit_offset_;
+        page_info.time_compressed_size = cur_time_page_header_.compressed_size_;
+        page_info.time_uncompressed_size =
+            cur_time_page_header_.uncompressed_size_;
+        page_info.value_file_offsets.resize(num_cols);
+        page_info.value_compressed_sizes.resize(num_cols);
+        page_info.value_uncompressed_sizes.resize(num_cols);
+
+        for (uint32_t c = 0; c < num_cols && IS_SUCC(ret); c++) {
+            auto* col = value_columns_[c];
+            if (RET_FAIL(get_cur_page_header(
+                    col->chunk_meta, col->in_stream, col->cur_page_header,
+                    col->chunk_visit_offset, col->chunk_header,
+                    &col->file_data_buf_size))) {
+                break;
+            }
+            page_info.value_file_offsets[c] =
+                col->chunk_meta->offset_of_chunk_header_ +
+                col->chunk_visit_offset;
+            page_info.value_compressed_sizes[c] =
+                col->cur_page_header.compressed_size_;
+            page_info.value_uncompressed_sizes[c] =
+                col->cur_page_header.uncompressed_size_;
+        }
+        if (IS_FAIL(ret)) {
+            break;
+        }
+
+        Statistic* stat = cur_time_page_header_.statistic_;
+        if (filter == nullptr) {
+            page_info.pass_type = PagePassType::FULL_PASS;
+            page_info.row_begin = 0;
+            page_info.row_end = stat != nullptr ? stat->count_ : 0;
+        } else if (stat != nullptr && !filter->satisfy(stat)) {
+            page_info.pass_type = PagePassType::SKIP;
+        } else if (stat != nullptr && filter->contain_start_end_time(
+                                          stat->start_time_, stat->end_time_)) {
+            page_info.pass_type = PagePassType::FULL_PASS;
+            page_info.row_begin = 0;
+            page_info.row_end = stat->count_;
+        } else {
+            page_info.pass_type = PagePassType::BOUNDARY;
+            std::vector<int64_t> times;
+            if (RET_FAIL(decode_time_page_direct(page_info, times))) {
+                break;
+            }
+            int32_t first = -1;
+            int32_t last = -1;
+            for (int32_t i = 0; i < static_cast<int32_t>(times.size()); i++) {
+                if (filter->satisfy_start_end_time(times[i], times[i])) {
+                    if (first < 0) first = i;
+                    last = i;
+                }
+            }
+            if (first >= 0) {
+                page_info.row_begin = first;
+                page_info.row_end = last + 1;
+            } else {
+                page_info.pass_type = PagePassType::SKIP;
+            }
+        }
+
+        if (page_info.pass_type != PagePassType::SKIP) {
+            if (page_info.row_end == 0) {
+                std::vector<int64_t> times;
+                if (RET_FAIL(decode_time_page_direct(page_info, times))) {
+                    break;
+                }
+                page_info.row_end = static_cast<int32_t>(times.size());
+            }
+            if (page_info.row_begin < page_info.row_end) {
+                chunk_pages_.push_back(std::move(page_info));
+            }
+        }
+
+        time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_;
+        time_in_stream_.wrapped_buf_advance_read_pos(
+            cur_time_page_header_.compressed_size_);
+        for (uint32_t c = 0; c < num_cols; c++) {
+            auto* col = value_columns_[c];
+            col->chunk_visit_offset += col->cur_page_header.compressed_size_;
+            col->in_stream.wrapped_buf_advance_read_pos(
+                col->cur_page_header.compressed_size_);
+        }
+    }
+
+    page_plan_built_ = IS_SUCC(ret);
+
+    if (page_plan_built_) {
+        per_page_times_.assign(chunk_pages_.size(), std::vector<int64_t>{});
+        for (auto* col : value_columns_) {
+            col->per_page_state.clear();
+            col->per_page_state.resize(chunk_pages_.size());
+        }
+    }
+    return ret;
+}
+
+void AlignedChunkReader::release_current_page_state() {
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+    for (auto* col : value_columns_) {
+        if (col->uncompressed_buf != nullptr && col->compressor != nullptr) {
+            col->compressor->after_uncompress(col->uncompressed_buf);
+            col->uncompressed_buf = nullptr;
+        }
+        col->notnull_bitmap.clear();
+        col->cur_value_index = -1;
+        col->in.reset();
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        col->pending_decoded_count = 0;
+        col->pending_decoded_cursor = 0;
+        col->pending_decoded = false;
+    }
+    per_page_times_.clear();
+    current_page_loaded_ = false;
+}
+
+int AlignedChunkReader::decode_value_page_for_slot(uint32_t col_idx,
+                                                   size_t page_idx) {
+    const ChunkPageInfo& page_info = chunk_pages_[page_idx];
+    auto* col = value_columns_[col_idx];
+    auto& pps = col->per_page_state[page_idx];
+
+    pps.notnull_bitmap.clear();
+    pps.predecoded_values.clear();
+    pps.predecoded_strings.clear();
+    pps.predecoded_read_pos = 0;
+    pps.predecoded_count = 0;
+    pps.predecode_pa.destroy();
+
+    if (page_info.value_compressed_sizes[col_idx] == 0) {
+        return E_OK;
+    }
+
+    char stack_buf[4096];
+    char* compressed_buf = stack_buf;
+    bool heap = page_info.value_compressed_sizes[col_idx] > sizeof(stack_buf);
+    if (heap) {
+        compressed_buf = static_cast<char*>(common::mem_alloc(
+            page_info.value_compressed_sizes[col_idx], common::MOD_DEFAULT));
+        if (compressed_buf == nullptr) return E_OOM;
+    }
+
+    int32_t read_len = 0;
+    int ret =
+        read_file_->read(page_info.value_file_offsets[col_idx], compressed_buf,
+                         page_info.value_compressed_sizes[col_idx], read_len);
+    if (IS_FAIL(ret)) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    if (read_len !=
+        static_cast<int32_t>(page_info.value_compressed_sizes[col_idx])) {
+        if (heap) common::mem_free(compressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col->compressor->reset(false))) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    ret = col->compressor->uncompress(compressed_buf,
+                                      page_info.value_compressed_sizes[col_idx],
+                                      uncompressed_buf, uncompressed_size);
+    if (heap && compressed_buf != uncompressed_buf) {
+        common::mem_free(compressed_buf);
+    }
+    if (IS_FAIL(ret) ||
+        uncompressed_size != page_info.value_uncompressed_sizes[col_idx]) {
+        if (uncompressed_buf != nullptr) {
+            col->compressor->after_uncompress(uncompressed_buf);
+        }
+        return E_TSFILE_CORRUPTED;
+    }
+    // The value page begins with a uint32 data_num followed by a bitmap of
+    // ceil(data_num/8) bytes; a corrupt or truncated page that doesn't even
+    // hold the data_num header would let read_ui32() walk past the buffer.
+    if (uncompressed_size < sizeof(uint32_t)) {
+        col->compressor->after_uncompress(uncompressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) {
+        col->compressor->after_uncompress(uncompressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+    pps.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < pps.notnull_bitmap.size(); i++) {
+        pps.notnull_bitmap[i] = *(uncompressed_buf + offset++);
+    }
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    common::ByteStream in;
+    in.wrap_from(value_buf, value_buf_size);
+    col->decoder->reset();
+
+    auto dt = col->chunk_header.data_type_;
+    int nonnull_total = count_non_null_prefix(pps.notnull_bitmap,
+                                              static_cast<int32_t>(data_num));
+    int prefix_nonnull =
+        count_non_null_prefix(pps.notnull_bitmap, page_info.row_begin);
+    pps.predecoded_read_pos = prefix_nonnull;
+
+    auto cleanup = [&]() {
+        col->compressor->after_uncompress(uncompressed_buf);
+    };
+
+    if (dt == common::STRING || dt == common::TEXT || dt == common::BLOB) {
+        pps.predecode_pa.init(512, common::MOD_TSFILE_READER);
+        pps.predecoded_strings.resize(nonnull_total);
+        for (int i = 0; i < nonnull_total; i++) {
+            if (RET_FAIL(col->decoder->read_String(pps.predecoded_strings[i],
+                                                   pps.predecode_pa, in))) {
+                cleanup();
+                return ret;
+            }
+        }
+        pps.predecoded_count = nonnull_total;
+        cleanup();
+        return E_OK;
+    }
+
+    if (nonnull_total == 0) {
+        cleanup();
+        return E_OK;
+    }
+
+    uint32_t elem_size = common::get_data_type_size(dt);
+    pps.predecoded_values.resize(static_cast<size_t>(nonnull_total) *
+                                 elem_size);
+    int actual = 0;
+    switch (dt) {
+        case common::BOOLEAN: {
+            bool* out = reinterpret_cast<bool*>(pps.predecoded_values.data());
+            for (int i = 0; i < nonnull_total; i++) {
+                if (RET_FAIL(col->decoder->read_boolean(out[i], in))) {
+                    cleanup();
+                    return ret;
+                }
+            }
+            actual = nonnull_total;
+            break;
+        }
+        case common::INT32:
+        case common::DATE:
+            if (RET_FAIL(col->decoder->read_batch_int32(
+                    reinterpret_cast<int32_t*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::INT64:
+        case common::TIMESTAMP:
+            if (RET_FAIL(col->decoder->read_batch_int64(
+                    reinterpret_cast<int64_t*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::FLOAT:
+            if (RET_FAIL(col->decoder->read_batch_float(
+                    reinterpret_cast<float*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::DOUBLE:
+            if (RET_FAIL(col->decoder->read_batch_double(
+                    reinterpret_cast<double*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        default:
+            cleanup();
+            return E_NOT_SUPPORT;
+    }
+    pps.predecoded_count = actual;
+    cleanup();
+    return E_OK;
+}
+
+// Multi-thread path: one task per value column, each decoding all non-SKIP
+// pages of that column serially.  Time pages dispatched as worker-bucketed
+// strided tasks using per-worker decoder/compressor (filled from
+// time_decoder_pool_ / time_compressor_pool_) so they don't contend on the
+// shared time_decoder_/time_compressor_.
+//
+// Single-thread: do NOT pre-decode every page upfront — leave per_page_state
+// empty so the scatter loop decodes on demand and releases after each page
+// (see decode_page_lazy() / release_page_slot()).  Bounds memory to one page.
+int AlignedChunkReader::decode_all_planned_pages() {
+    if (chunk_pages_.empty()) return E_OK;
+
+#ifdef ENABLE_THREADS
+    if (decode_pool_ != nullptr && value_columns_.size() > 1) {
+        // Lazily grow the per-worker time decoder/compressor pool.  Both
+        // factories can return nullptr on OOM/unsupported config; without
+        // checking, the worker task below dereferences null when calling
+        // decode_time_page_with().
+        size_t worker_count = decode_pool_->num_threads();
+        if (time_decoder_pool_.size() < worker_count) {
+            time_decoder_pool_.resize(worker_count, nullptr);
+            time_compressor_pool_.resize(worker_count, nullptr);
+            for (size_t w = 0; w < worker_count; w++) {
+                if (time_decoder_pool_[w] == nullptr) {
+                    time_decoder_pool_[w] =
+                        DecoderFactory::alloc_time_decoder();
+                    if (time_decoder_pool_[w] == nullptr) return E_OOM;
+                }
+                if (time_compressor_pool_[w] == nullptr) {
+                    time_compressor_pool_[w] =
+                        CompressorFactory::alloc_compressor(
+                            time_chunk_header_.compression_type_);
+                    if (time_compressor_pool_[w] == nullptr) return E_OOM;
+                }
+            }
+        }
+
+        std::vector<std::future<void>> futures;
+        std::vector<int> col_rets(value_columns_.size(), E_OK);
+        for (uint32_t c = 0; c < value_columns_.size(); c++) {
+            int* col_ret = &col_rets[c];
+            futures.push_back(decode_pool_->submit([this, c, col_ret]() {
+                for (size_t p = 0; p < chunk_pages_.size(); p++) {
+                    int r = decode_value_page_for_slot(c, p);
+                    if (IS_FAIL(r)) {
+                        *col_ret = r;
+                        return;
+                    }
+                }
+            }));
+        }
+        // Time pages dispatched in worker-sized chunks (one task per worker)
+        // to amortize submit/wait overhead.  Stride for load balance.
+        size_t time_task_count = std::min(worker_count, chunk_pages_.size());
+        std::vector<int> time_rets(time_task_count, E_OK);
+        for (size_t k = 0; k < time_task_count; k++) {
+            int* tr = &time_rets[k];
+            futures.push_back(decode_pool_->submit(
+                [this, k, tr, time_task_count, worker_count]() {
+                    size_t wid = common::ThreadPool::current_worker_id();
+                    if (wid >= worker_count) wid = 0;
+                    Decoder* dec = time_decoder_pool_[wid];
+                    Compressor* comp = time_compressor_pool_[wid];
+                    for (size_t p = k; p < chunk_pages_.size();
+                         p += time_task_count) {
+                        int r = decode_time_page_with(
+                            chunk_pages_[p], per_page_times_[p], dec, comp);
+                        if (IS_FAIL(r)) {
+                            *tr = r;
+                            return;
+                        }
+                    }
+                }));
+        }
+        // Wait on each task's own future rather than draining the whole pool:
+        // it is shared process-wide, so wait_all() would also block on
+        // unrelated concurrent operations' tasks still in flight.
+        for (auto& f : futures) f.get();
+        for (auto r : time_rets) {
+            if (IS_FAIL(r)) return r;
+        }
+        for (uint32_t c = 0; c < value_columns_.size(); c++) {
+            if (IS_FAIL(col_rets[c])) return col_rets[c];
+        }
+        return E_OK;
+    }
+#endif
+    // Single-thread: defer decode to scatter time.
+    return E_OK;
+}
+
+// Decode time + all value columns for a single page slot on demand.
+// Used by the single-thread path to keep memory bounded to one page.
+int AlignedChunkReader::decode_page_lazy(size_t page_idx) {
+    int ret = E_OK;
+    if (RET_FAIL(decode_time_page_direct(chunk_pages_[page_idx],
+                                         per_page_times_[page_idx]))) {
+        return ret;
+    }
+    for (uint32_t c = 0; c < value_columns_.size(); c++) {
+        if (RET_FAIL(decode_value_page_for_slot(c, page_idx))) {
+            return ret;
+        }
+    }
+    return E_OK;
+}
+
+// Release the decoded buffers of one page slot so they can be reused by the
+// next page (keeps memory footprint bounded for the single-thread path).
+void AlignedChunkReader::release_page_slot(size_t page_idx) {
+    std::vector<int64_t>{}.swap(per_page_times_[page_idx]);
+    for (auto* col : value_columns_) {
+        if (page_idx >= col->per_page_state.size()) continue;
+        auto& pps = col->per_page_state[page_idx];
+        std::vector<uint8_t>{}.swap(pps.notnull_bitmap);
+        std::vector<char>{}.swap(pps.predecoded_values);
+        std::vector<common::String>{}.swap(pps.predecoded_strings);
+        pps.predecode_pa.destroy();
+        pps.predecoded_count = 0;
+        pps.predecoded_read_pos = 0;
+    }
+}
+
+int AlignedChunkReader::get_next_page_multi(TsBlock* ret_tsblock,
+                                            Filter* oneshoot_filter,
+                                            PageArena& pa) {
+    int ret = E_OK;
+    Filter* filter =
+        (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
+
+    // Dispatch:
+    //   - Multi-column with a thread pool → chunk-level pre-decode: one task
+    //     per value column decodes that column's whole chunk up front, then the
+    //     scatter loop bulk-memcpys.  decode_all_planned_pages() works for any
+    //     column count.  (An earlier cutoff sent >6 columns down the serial
+    //     path because per_page_state — the upfront predecode buffer — grows
+    //     with column count and was feared to thrash cache; it still grows, so
+    //     very wide aligned chunks are the case to watch if reads regress.)
+    //   - Single column, or no thread pool → serial path: decode the current
+    //     page's columns inline (multi_DECODE_TV_BATCH), no thread-pool
+    //     fan-out.
+#ifdef ENABLE_THREADS
+    const bool use_chunk_level =
+        decode_pool_ != nullptr && value_columns_.size() > 1;
+#else
+    const bool use_chunk_level = false;
+#endif
+    if (!use_chunk_level) {
+        return get_next_page_multi_serial(ret_tsblock, filter, pa);
+    }
+
+    if (!page_plan_built_) {
+        if (RET_FAIL(build_page_plan(filter))) {
+            return ret;
+        }
+        if (RET_FAIL(decode_all_planned_pages())) {
+            return ret;
+        }
+    }
+    if (chunk_pages_.empty()) {
+        return E_NO_MORE_DATA;
+    }
+
+    const uint32_t null_mask_base = 1 << 7;
+    const uint32_t num_cols = value_columns_.size();
+    RowAppender row_appender(ret_tsblock);
+    // Detect single-thread lazy mode by whether decode_all_planned_pages left
+    // per_page_times_ empty (it leaves slots empty when there's no pool).
+    const bool single_thread_lazy = per_page_times_[0].empty();
+
+    while (current_page_plan_index_ < chunk_pages_.size()) {
+        const ChunkPageInfo& page_info = chunk_pages_[current_page_plan_index_];
+
+        if (!current_page_loaded_) {
+            if (single_thread_lazy) {
+                if (RET_FAIL(decode_page_lazy(current_page_plan_index_))) {
+                    return ret;
+                }
+            }
+            page_time_cursor_ = page_info.row_begin;
+            page_time_count_ = page_info.row_end;
+            current_page_loaded_ = true;
+        }
+        const std::vector<int64_t>& times =
+            per_page_times_[current_page_plan_index_];
+
+        int32_t remaining_in_page = page_time_count_ - page_time_cursor_;
+        uint32_t budget = row_appender.remaining();
+
+        // Fast path: FULL_PASS page, no nulls in any value column, types
+        // match destination, budget > 0.  Bulk-memcpys up to
+        // min(budget, remaining_in_page) rows from page_time_cursor_; tail
+        // pages of an SSI tsblock still take the memcpy path instead of
+        // falling into the row-by-row scatter loop.
+        bool can_bulk = page_info.pass_type == PagePassType::FULL_PASS &&
+                        remaining_in_page > 0 && budget > 0;
+        if (can_bulk) {
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                auto dt = col->chunk_header.data_type_;
+                if (dt == common::STRING || dt == common::TEXT ||
+                    dt == common::BLOB ||
+                    ret_tsblock->get_vector(c + 1)->get_vector_type() != dt ||
+                    pps.predecoded_count != page_time_count_) {
+                    can_bulk = false;
+                    break;
+                }
+            }
+        }
+
+        if (can_bulk) {
+            uint32_t bulk_count =
+                std::min(budget, static_cast<uint32_t>(remaining_in_page));
+            size_t time_byte_off =
+                static_cast<size_t>(page_time_cursor_) * sizeof(int64_t);
+            // Bulk-append both bytes AND row count for every Vector.
+            // Skipping add_row_nums() would leave each Vector's row_num_
+            // at 0 while the TsBlock-level row_count_ jumped to bulk_count;
+            // fill_trailling_nulls() would then mark every just-written
+            // row as null, and column iterators would report the wrong
+            // length.
+            common::Vector* time_vec = ret_tsblock->get_vector(0);
+            time_vec->get_value_data().append_fixed_value(
+                reinterpret_cast<const char*>(times.data()) + time_byte_off,
+                bulk_count * sizeof(int64_t));
+            time_vec->add_row_nums(bulk_count);
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                uint32_t elem_size =
+                    common::get_data_type_size(col->chunk_header.data_type_);
+                common::Vector* vec = ret_tsblock->get_vector(c + 1);
+                vec->get_value_data().append_fixed_value(
+                    pps.predecoded_values.data() +
+                        static_cast<size_t>(page_time_cursor_) * elem_size,
+                    bulk_count * elem_size);
+                vec->add_row_nums(bulk_count);
+            }
+            row_appender.add_rows(bulk_count);
+            page_time_cursor_ += bulk_count;
+            if (page_time_cursor_ >= page_time_count_) {
+                if (single_thread_lazy) {
+                    release_page_slot(current_page_plan_index_);
+                }
+                current_page_plan_index_++;
+                current_page_loaded_ = false;
+                continue;
+            }
+            // Budget exhausted mid-page; caller will drain and resume.
+            return E_OK;
+        }
+
+        // Slow path: row-by-row.  Handles null bitmap, type promotion,
+        // BOUNDARY pages, and partial-page E_OVERFLOW.
+        // BOUNDARY pages: build_page_plan compressed the page to the
+        // [first-hit, last-hit] range, but timestamps inside that range may
+        // still fail the filter (e.g. TimeIn({2, 8}) leaves 3..7 unmatched).
+        // Re-apply the filter per timestamp here, advancing predecoded
+        // read positions for skipped non-null rows so the cursor stays
+        // aligned with the page's value layout.
+        const bool boundary_filter =
+            page_info.pass_type == PagePassType::BOUNDARY && filter != nullptr;
+        while (page_time_cursor_ < page_time_count_) {
+            if (row_appender.remaining() == 0) {
+                return E_OK;
+            }
+            int64_t ts = times[page_time_cursor_];
+            if (boundary_filter && !filter->satisfy_start_end_time(ts, ts)) {
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    auto* col = value_columns_[c];
+                    auto& pps = col->per_page_state[current_page_plan_index_];
+                    bool is_null = true;
+                    if (!pps.notnull_bitmap.empty()) {
+                        is_null =
+                            ((pps.notnull_bitmap[page_time_cursor_ / 8] &
+                              0xFF) &
+                             (null_mask_base >> (page_time_cursor_ % 8))) == 0;
+                    }
+                    if (!is_null) pps.predecoded_read_pos++;
+                }
+                page_time_cursor_++;
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                return E_OK;
+            }
+            row_appender.append(0, reinterpret_cast<char*>(&ts), sizeof(ts));
+
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                bool is_null = true;
+                if (!pps.notnull_bitmap.empty()) {
+                    is_null =
+                        ((pps.notnull_bitmap[page_time_cursor_ / 8] & 0xFF) &
+                         (null_mask_base >> (page_time_cursor_ % 8))) == 0;
+                }
+                if (is_null) {
+                    row_appender.append_null(c + 1);
+                    continue;
+                }
+                if (col->chunk_header.data_type_ == common::STRING ||
+                    col->chunk_header.data_type_ == common::TEXT ||
+                    col->chunk_header.data_type_ == common::BLOB) {
+                    const common::String& value =
+                        pps.predecoded_strings[pps.predecoded_read_pos++];
+                    row_appender.append(c + 1, value.buf_, value.len_);
+                } else {
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    row_appender.append(
+                        c + 1,
+                        pps.predecoded_values.data() +
+                            static_cast<size_t>(pps.predecoded_read_pos++) *
+                                elem_size,
+                        elem_size);
+                }
+            }
+            page_time_cursor_++;
+        }
+
+        if (single_thread_lazy) {
+            release_page_slot(current_page_plan_index_);
+        }
+        current_page_plan_index_++;
+        current_page_loaded_ = false;
+    }
+    return E_NO_MORE_DATA;
+}
+
+int AlignedChunkReader::get_next_page_multi_serial(TsBlock* ret_tsblock,
+                                                   Filter* filter,
+                                                   PageArena& pa) {
+    int ret = E_OK;
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_any_value_page_not_finish_multi();
+    if (pt && pv) {
+        ret =
+            decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa);
+        return ret;
+    }
+    if (!pt && !pv) {
+        while (IS_SUCC(ret)) {
+            if (RET_FAIL(get_cur_page_header(
+                    time_chunk_meta_, time_in_stream_, cur_time_page_header_,
+                    time_chunk_visit_offset_, time_chunk_header_))) {
+                break;
+            }
+            for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+                auto* col = value_columns_[c];
+                if (RET_FAIL(get_cur_page_header(
+                        col->chunk_meta, col->in_stream, col->cur_page_header,
+                        col->chunk_visit_offset, col->chunk_header,
+                        &col->file_data_buf_size))) {
+                }
+            }
+            if (IS_FAIL(ret)) break;
+            if (cur_page_statisify_filter_multi(filter)) break;
+            if (RET_FAIL(skip_cur_page_multi())) break;
+            if (!has_more_data()) {
+                ret = E_NO_MORE_DATA;
+                break;
+            }
+        }
+        if (IS_SUCC(ret)) {
+            ret = decode_cur_time_page_data();
+            if (IS_SUCC(ret)) ret = decode_cur_value_pages_multi();
+        }
+    }
+    if (IS_SUCC(ret)) {
+        ret =
+            decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa);
+    }
+    return ret;
+}
+
+bool AlignedChunkReader::cur_page_statisify_filter_multi(Filter* filter) {
+    bool time_satisfy = filter == nullptr ||
+                        cur_time_page_header_.statistic_ == nullptr ||
+                        filter->satisfy(cur_time_page_header_.statistic_);
+    return time_satisfy;
+}
+
+int AlignedChunkReader::skip_cur_page_multi() {
+    time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_;
+    time_in_stream_.wrapped_buf_advance_read_pos(
+        cur_time_page_header_.compressed_size_);
+    for (auto* col : value_columns_) {
+        col->chunk_visit_offset += col->cur_page_header.compressed_size_;
+        col->in_stream.wrapped_buf_advance_read_pos(
+            col->cur_page_header.compressed_size_);
+    }
+    return E_OK;
+}
+
+int AlignedChunkReader::decode_cur_value_pages_multi() {
+    int ret = E_OK;
+    // Phase 1: Serial IO — ensure each column's page data is in memory.
+    for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+        ret = ensure_value_page_loaded(*value_columns_[c]);
+    }
+    if (IS_FAIL(ret)) return ret;
+
+    // Phase 2: decompress + parse bitmap + reset decoder for each column's
+    // current page, inline.  This serial path now only runs for single-column
+    // reads or when no thread pool exists — multi-column reads with a pool take
+    // the chunk-level path (decode_all_planned_pages), so there is no per-page
+    // thread-pool fan-out here anymore.  predecode=false lets the scatter loop
+    // (multi_DECODE_TV_BATCH) decode inline, which has better cache locality
+    // when there is no parallelism to amortize an extra predecode buffer write.
+    for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+        ret = decompress_and_parse_value_page(*value_columns_[c], false);
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decode_cur_value_page_data_for(ValueColumnState& col) {
+    int ret = E_OK;
+
+    // Step 1: ensure full page data is loaded
+    if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) {
+        if (RET_FAIL(read_from_file_and_rewrap(
+                col.in_stream, col.chunk_meta, col.chunk_visit_offset,
+                col.file_data_buf_size,
+                col.cur_page_header.compressed_size_))) {
+            return ret;
+        }
+    }
+
+    if (col.cur_page_header.compressed_size_ == 0) {
+        col.in.wrap_from(nullptr, 0);
+        return E_OK;
+    }
+
+    // Step 2: uncompress
+    char* compressed_buf =
+        col.in_stream.get_wrapped_buf() + col.in_stream.read_pos();
+    uint32_t compressed_size = col.cur_page_header.compressed_size_;
+    col.in_stream.wrapped_buf_advance_read_pos(compressed_size);
+    col.chunk_visit_offset += compressed_size;
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col.compressor->reset(false))) {
+        return ret;
+    }
+    if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size,
+                                            uncompressed_buf,
+                                            uncompressed_size))) {
+        return ret;
+    }
+    col.uncompressed_buf = uncompressed_buf;
+
+    if (uncompressed_size != col.cur_page_header.uncompressed_size_) {
+        return E_TSFILE_CORRUPTED;
+    }
+
+    // Step 3: parse bitmap + value data
+    if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED;
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
+    col.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
+        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
+        offset++;
+    }
+    col.cur_value_index = -1;
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    col.decoder->reset();
+    col.in.wrap_from(value_buf, value_buf_size);
+    return ret;
+}
+
+int AlignedChunkReader::ensure_value_page_loaded(ValueColumnState& col) {
+    int ret = E_OK;
+    if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) {
+        if (RET_FAIL(read_from_file_and_rewrap(
+                col.in_stream, col.chunk_meta, col.chunk_visit_offset,
+                col.file_data_buf_size,
+                col.cur_page_header.compressed_size_))) {
+            return ret;
+        }
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decompress_and_parse_value_page(ValueColumnState& col,
+                                                        bool predecode) {
+    int ret = E_OK;
+
+    if (col.cur_page_header.compressed_size_ == 0) {
+        col.in.wrap_from(nullptr, 0);
+        return E_OK;
+    }
+
+    // Decompress
+    char* compressed_buf =
+        col.in_stream.get_wrapped_buf() + col.in_stream.read_pos();
+    uint32_t compressed_size = col.cur_page_header.compressed_size_;
+    col.in_stream.wrapped_buf_advance_read_pos(compressed_size);
+    col.chunk_visit_offset += compressed_size;
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col.compressor->reset(false))) {
+        return ret;
+    }
+    if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size,
+                                            uncompressed_buf,
+                                            uncompressed_size))) {
+        return ret;
+    }
+    col.uncompressed_buf = uncompressed_buf;
+
+    if (uncompressed_size != col.cur_page_header.uncompressed_size_) {
+        return E_TSFILE_CORRUPTED;
+    }
+
+    // Parse bitmap + value data
+    if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED;
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
+    col.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
+        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
+        offset++;
+    }
+    col.cur_value_index = -1;
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    col.decoder->reset();
+    col.in.wrap_from(value_buf, value_buf_size);
+
+    // Pre-decode all non-null values into pending_decoded_values so the
+    // scatter loop (multi_DECODE_TV_BATCH) just memcpys instead of calling
+    // the decoder.  Moves the expensive int64/double decode into the worker
+    // task so it runs in parallel.  Only handles fixed-length types — strings
+    // stay on the inline-decode path.
+    col.pending_decoded = false;
+    col.pending_decoded_count = 0;
+    col.pending_decoded_cursor = 0;
+    auto dt = col.chunk_header.data_type_;
+    if (predecode && dt != common::STRING && dt != common::TEXT &&
+        dt != common::BLOB) {
+        int nonnull_total = 0;
+        for (uint32_t i = 0; i < data_num; i++) {
+            if ((col.notnull_bitmap[i / 8] & (0x80 >> (i % 8))) != 0) {
+                nonnull_total++;
+            }
+        }
+        if (nonnull_total > 0) {
+            uint32_t elem_size = common::get_data_type_size(dt);
+            col.pending_decoded_values.resize(
+                static_cast<size_t>(nonnull_total) * elem_size);
+            int actual = 0;
+            int rret = common::E_OK;
+            switch (dt) {
+                case common::BOOLEAN: {
+                    bool* out = reinterpret_cast<bool*>(
+                        col.pending_decoded_values.data());
+                    for (int i = 0; i < nonnull_total; i++) {
+                        bool v;
+                        if (col.decoder->read_boolean(v, col.in) !=
+                            common::E_OK) {
+                            rret = common::E_OUT_OF_RANGE;
+                            break;
+                        }
+                        out[i] = v;
+                    }
+                    actual = nonnull_total;
+                    break;
+                }
+                case common::INT32:
+                case common::DATE:
+                    rret = col.decoder->read_batch_int32(
+                        reinterpret_cast<int32_t*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::INT64:
+                case common::TIMESTAMP:
+                    rret = col.decoder->read_batch_int64(
+                        reinterpret_cast<int64_t*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::FLOAT:
+                    rret = col.decoder->read_batch_float(
+                        reinterpret_cast<float*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::DOUBLE:
+                    rret = col.decoder->read_batch_double(
+                        reinterpret_cast<double*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                default:
+                    rret = common::E_OUT_OF_RANGE;
+            }
+            if (rret == common::E_OK && actual == nonnull_total) {
+                col.pending_decoded_count = nonnull_total;
+                col.pending_decoded = true;
+            }
+        } else {
+            col.pending_decoded = true;  // empty page is trivially predecoded
+        }
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decode_time_value_buf_into_tsblock_multi(
+    TsBlock*& ret_tsblock, Filter* filter, PageArena* pa) {
+    int ret = E_OK;
+    RowAppender row_appender(ret_tsblock);
+    ret = multi_DECODE_TV_BATCH(ret_tsblock, row_appender, filter, pa);
+
+    // Release uncompressed buffers if pages are done
+    if (ret != E_OVERFLOW) {
+        if (time_uncompressed_buf_ != nullptr) {
+            time_compressor_->after_uncompress(time_uncompressed_buf_);
+            time_uncompressed_buf_ = nullptr;
+        }
+        for (auto* col : value_columns_) {
+            if (col->uncompressed_buf != nullptr) {
+                col->compressor->after_uncompress(col->uncompressed_buf);
+                col->uncompressed_buf = nullptr;
+            }
+            // The time stream and bitmap define the page's row/value count.
+            // Once the page is fully processed, bytes left in an all-null
+            // value stream are only encoder terminators or padding and must
+            // not make has_more_data_multi() treat the page as unfinished.
+            col->in.reset();
+            col->notnull_bitmap.clear();
+            col->notnull_bitmap.shrink_to_fit();
+        }
+        if (!prev_time_page_not_finish()) {
+            time_in_.reset();
+        }
+    } else {
+        ret = E_OK;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::multi_DECODE_TV_BATCH(TsBlock* ret_tsblock,
+                                              RowAppender& row_appender,
+                                              Filter* filter, PageArena* pa) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+    const uint32_t num_cols = value_columns_.size();
+
+    while (time_decoder_->has_remaining(time_in_)) {
+        // Cap each pass to what the appender can still hold; mirrors the fix
+        // in ChunkReader's per-type batch loops.  A blanket "remaining < BATCH
+        // → E_OVERFLOW" made progress impossible whenever the caller handed
+        // us a TsBlock with capacity below BATCH (e.g. small per-block sizes
+        // in multi-chunk queries).
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // ── Phase 1: Decode a batch of timestamps ──
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in_))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        // ── Phase 2: Apply time filter ──
+        bool time_mask[BATCH];
+        bool block_all_pass = (filter == nullptr);
+        int pass_count = time_count;
+        if (!block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        // ── Phase 3: Per-column null check + value decode ──
+        // For each column, compute null flags and decode non-null values.
+        // We store decoded values in column-specific buffers.
+        // Max 8 bytes per value, 129 values per batch.
+        struct ColBatch {
+            bool is_null[BATCH];
+            int nonnull_count;
+            // Value buffer for fixed-width types — up to 129 * 8 bytes
+            char val_buf[BATCH * 8];
+            int val_count;
+            // Variable-length values for STRING/TEXT/BLOB columns.  Only
+            // populated when the column's data_type_ is variable; their
+            // bufs are owned by the caller-provided PageArena.
+            std::vector<common::String> str_vals;
+        };
+        // Allocate on heap if many columns, stack for small counts
+        std::vector<ColBatch> col_batches(num_cols);
+
+        for (uint32_t c = 0; c < num_cols; c++) {
+            auto* col = value_columns_[c];
+            auto& cb = col_batches[c];
+            cb.nonnull_count = 0;
+            cb.val_count = 0;
+            for (int i = 0; i < time_count; i++) {
+                int vi = col->cur_value_index + 1 + i;
+                if (col->notnull_bitmap.empty() ||
+                    ((col->notnull_bitmap[vi / 8] & 0xFF) &
+                     (null_mask_base >> (vi % 8))) == 0) {
+                    cb.is_null[i] = true;
+                } else {
+                    cb.is_null[i] = false;
+                    cb.nonnull_count++;
+                }
+            }
+
+            // Skip values if no rows pass time filter.  Skip/read errors and
+            // short reads (decoder returned fewer values than the bitmap
+            // promised) must abort; otherwise the input stream is left
+            // mid-value and later batches would decode garbage from
+            // misaligned bytes.
+            if (pass_count == 0 && cb.nonnull_count > 0) {
+                int dret = common::E_OK;
+                int sk = 0;
+                switch (col->chunk_header.data_type_) {
+                    case common::BOOLEAN: {
+                        bool dummy;
+                        for (sk = 0; sk < cb.nonnull_count; sk++) {
+                            dret = col->decoder->read_boolean(dummy, col->in);
+                            if (dret != common::E_OK) break;
+                        }
+                        break;
+                    }
+                    case common::INT32:
+                    case common::DATE:
+                        dret = col->decoder->skip_int32(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::INT64:
+                    case common::TIMESTAMP:
+                        dret = col->decoder->skip_int64(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::FLOAT:
+                        dret = col->decoder->skip_float(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::DOUBLE:
+                        dret = col->decoder->skip_double(cb.nonnull_count, sk,
+                                                         col->in);
+                        break;
+                    case common::STRING:
+                    case common::TEXT:
+                    case common::BLOB: {
+                        // The decoder has no fast skip for var-length strings;
+                        // reading + discarding is the only way to advance the
+                        // input stream past the row's payload.
+                        common::String tmp;
+                        for (sk = 0; sk < cb.nonnull_count; sk++) {
+                            dret = col->decoder->read_String(tmp, *pa, col->in);
+                            if (dret != common::E_OK) break;
+                        }
+                        break;
+                    }
+                    default:
+                        ret = E_TSFILE_CORRUPTED;
+                        break;
+                }
+                if (ret != common::E_OK) break;
+                if (dret != common::E_OK) {
+                    ret = dret;
+                    break;
+                }
+                if (sk != cb.nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+                cb.nonnull_count = 0;  // bytes consumed cleanly
+            }
+
+            // Decode non-null values.  Fast path: values were predecoded
+            // into col->pending_decoded_values by the parallel worker — just
+            // memcpy the slice for this batch.  Fallback: call the decoder
+            // inline (used for STRING/TEXT/BLOB and when predecode was
+            // skipped).
+            if (cb.nonnull_count > 0) {
+                if (col->pending_decoded) {
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    memcpy(
+                        cb.val_buf,
+                        col->pending_decoded_values.data() +
+                            static_cast<size_t>(col->pending_decoded_cursor) *
+                                elem_size,
+                        static_cast<size_t>(cb.nonnull_count) * elem_size);
+                    col->pending_decoded_cursor += cb.nonnull_count;
+                    cb.val_count = cb.nonnull_count;
+                } else {
+                    int dret = common::E_OK;
+                    switch (col->chunk_header.data_type_) {
+                        case common::BOOLEAN: {
+                            bool* out = reinterpret_cast<bool*>(cb.val_buf);
+                            cb.val_count = 0;
+                            for (int s = 0; s < cb.nonnull_count; s++) {
+                                bool v;
+                                dret = col->decoder->read_boolean(v, col->in);
+                                if (dret != common::E_OK) break;
+                                out[cb.val_count++] = v;
+                            }
+                            break;
+                        }
+                        case common::INT32:
+                        case common::DATE:
+                            dret = col->decoder->read_batch_int32(
+                                reinterpret_cast<int32_t*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::INT64:
+                        case common::TIMESTAMP:
+                            dret = col->decoder->read_batch_int64(
+                                reinterpret_cast<int64_t*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::FLOAT:
+                            dret = col->decoder->read_batch_float(
+                                reinterpret_cast<float*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::DOUBLE:
+                            dret = col->decoder->read_batch_double(
+                                reinterpret_cast<double*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::STRING:
+                        case common::TEXT:
+                        case common::BLOB: {
+                            // Variable-length payload doesn't fit in
+                            // cb.val_buf; pull each value into str_vals and
+                            // let the scatter loop index by val_count.
+                            cb.str_vals.resize(cb.nonnull_count);
+                            cb.val_count = 0;
+                            for (int s = 0; s < cb.nonnull_count; s++) {
+                                dret = col->decoder->read_String(cb.str_vals[s],
+                                                                 *pa, col->in);
+                                if (dret != common::E_OK) break;
+                                cb.val_count++;
+                            }
+                            break;
+                        }
+                        default:
+                            break;
+                    }
+                    // Any decoder error, or a short decode that produced
+                    // fewer values than the bitmap promised, indicates a
+                    // corrupt page; propagate immediately so the scatter
+                    // loop doesn't read uninitialised cb.val_buf bytes.
+                    if (dret != common::E_OK) {
+                        ret = dret;
+                        break;
+                    }
+                    if (col->chunk_header.data_type_ != common::STRING &&
+                        col->chunk_header.data_type_ != common::TEXT &&
+                        col->chunk_header.data_type_ != common::BLOB &&
+                        cb.val_count != cb.nonnull_count) {
+                        ret = E_TSFILE_CORRUPTED;
+                        break;
+                    }
+                }
+            }
+        }
+        if (ret != E_OK) break;
+
+        // ── Phase 4: Skip if no rows pass ──
+        if (pass_count == 0) {
+            for (uint32_t c = 0; c < num_cols; c++) {
+                value_columns_[c]->cur_value_index += time_count;
+            }
+            continue;
+        }
+
+        // ── Phase 5: Scatter into TsBlock ──
+
+        // Fast path: all rows pass filter AND all columns have no nulls
+        // → batch memcpy directly into Vector buffers.  STRING/TEXT/BLOB
+        // columns have variable-width payload and live in cb.str_vals, not
+        // cb.val_buf, so they must take the slow scatter path.
+        if (pass_count == time_count) {
+            bool all_nonnull = true;
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto dt = value_columns_[c]->chunk_header.data_type_;
+                if (col_batches[c].nonnull_count != time_count ||
+                    dt == common::STRING || dt == common::TEXT ||
+                    dt == common::BLOB) {
+                    all_nonnull = false;
+                    break;
+                }
+            }
+            if (all_nonnull) {
+                // Batch append time column (bytes + row count); see the
+                // chunk-level bulk path above for why add_row_nums() is
+                // required alongside append_fixed_value().
+                common::Vector* time_vec = ret_tsblock->get_vector(0);
+                time_vec->get_value_data().append_fixed_value(
+                    (const char*)times,
+                    static_cast<uint32_t>(time_count) * sizeof(int64_t));
+                time_vec->add_row_nums(static_cast<uint32_t>(time_count));
+                // Batch append each value column
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    auto& cb = col_batches[c];
+                    auto* col = value_columns_[c];
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    common::Vector* vec = ret_tsblock->get_vector(c + 1);
+                    vec->get_value_data().append_fixed_value(
+                        cb.val_buf,
+                        static_cast<uint32_t>(cb.val_count) * elem_size);
+                    vec->add_row_nums(static_cast<uint32_t>(cb.val_count));
+                    col->cur_value_index += time_count;
+                }
+                row_appender.add_rows(static_cast<uint32_t>(time_count));
+                continue;
+            }
+        }
+
+        // Slow path: per-row scatter (has filter or has nulls or strings)
+        std::vector<int> val_idx(num_cols, 0);
+
+        for (int i = 0; i < time_count; i++) {
+            bool passes = block_all_pass || time_mask[i];
+
+            if (!passes) {
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    value_columns_[c]->cur_value_index++;
+                    if (!col_batches[c].is_null[i]) val_idx[c]++;
+                }
+                continue;
+            }
+
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+
+            for (uint32_t c = 0; c < num_cols; c++) {
+                value_columns_[c]->cur_value_index++;
+                auto& cb = col_batches[c];
+                auto* col = value_columns_[c];
+
+                if (cb.is_null[i]) {
+                    row_appender.append_null(c + 1);
+                } else {
+                    auto dt = col->chunk_header.data_type_;
+                    if (dt == common::STRING || dt == common::TEXT ||
+                        dt == common::BLOB) {
+                        const common::String& sv = cb.str_vals[val_idx[c]];
+                        row_appender.append(c + 1, sv.buf_, sv.len_);
+                    } else {
+                        uint32_t elem_size = common::get_data_type_size(dt);
+                        row_appender.append(c + 1,
+                                            cb.val_buf + val_idx[c] * elem_size,
+                                            elem_size);
+                    }
+                    val_idx[c]++;
+                }
+            }
+        }
+        if (ret != E_OK) break;
     }
     return ret;
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/reader/aligned_chunk_reader.h b/cpp/src/reader/aligned_chunk_reader.h
index 91281215e..69ce48f4a 100644
--- a/cpp/src/reader/aligned_chunk_reader.h
+++ b/cpp/src/reader/aligned_chunk_reader.h
@@ -28,8 +28,70 @@
 #include "reader/filter/filter.h"
 #include "reader/ichunk_reader.h"
 
+#ifdef ENABLE_THREADS
+namespace common {
+class ThreadPool;
+}
+#endif
+
 namespace storage {
 
+// Page classification for chunk-level parallel decode.
+enum class PagePassType { SKIP, FULL_PASS, BOUNDARY };
+
+// Metadata collected per page during the chunk scan phase.
+struct ChunkPageInfo {
+    PagePassType pass_type = PagePassType::SKIP;
+    // File offsets of compressed data for time and each value column.
+    int64_t time_file_offset = 0;
+    uint32_t time_compressed_size = 0;
+    uint32_t time_uncompressed_size = 0;
+    int32_t row_begin = 0;  // inclusive
+    int32_t row_end = 0;    // exclusive
+    std::vector<int64_t> value_file_offsets;
+    std::vector<uint32_t> value_compressed_sizes;
+    std::vector<uint32_t> value_uncompressed_sizes;
+};
+
+// Decoded state for one (column, page) slot.  Populated by chunk-level
+// parallel decode; consumed by the scatter loop.
+struct PageDecodedState {
+    std::vector<uint8_t> notnull_bitmap;
+    std::vector<char> predecoded_values;
+    std::vector<common::String> predecoded_strings;
+    common::PageArena predecode_pa;
+    int32_t predecoded_count = 0;
+    int32_t predecoded_read_pos = 0;
+};
+
+// Per-value-column state for multi-value AlignedChunkReader.
+struct ValueColumnState {
+    ChunkMeta* chunk_meta = nullptr;
+    ChunkHeader chunk_header;
+    Decoder* decoder = nullptr;
+    Compressor* compressor = nullptr;
+    common::ByteStream in_stream;  // raw data from file
+    common::ByteStream in;         // decompressed data
+    char* uncompressed_buf = nullptr;
+    int32_t file_data_buf_size = 0;
+    uint32_t chunk_visit_offset = 0;
+    PageHeader cur_page_header;
+    std::vector<uint8_t> notnull_bitmap;
+    int32_t cur_value_index = -1;
+
+    // Per-page decoded state for chunk-level parallel decode.
+    std::vector<PageDecodedState> per_page_state;
+
+    // Pre-decoded value buffer for the CURRENT page, filled by
+    // decompress_and_parse_value_page when the dense-multi path predecodes
+    // values in worker threads.  Consumed by multi_DECODE_TV_BATCH instead of
+    // calling the decoder inline.  Holds nonnull values only.
+    std::vector<char> pending_decoded_values;
+    int32_t pending_decoded_count = 0;
+    int32_t pending_decoded_cursor = 0;
+    bool pending_decoded = false;
+};
+
 class AlignedChunkReader : public IChunkReader {
    public:
     AlignedChunkReader()
@@ -64,11 +126,13 @@ class AlignedChunkReader : public IChunkReader {
     ~AlignedChunkReader() override = default;
 
     bool has_more_data() const override {
-        return prev_value_page_not_finish() ||
+        if (multi_value_mode_) {
+            return has_more_data_multi();
+        }
+        return prev_value_page_not_finish() || prev_time_page_not_finish() ||
                (value_chunk_visit_offset_ -
                     value_chunk_header_.serialized_size_ <
                 value_chunk_header_.data_size_) ||
-               prev_time_page_not_finish() ||
                (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ <
                 time_chunk_header_.data_size_);
     }
@@ -76,13 +140,36 @@ class AlignedChunkReader : public IChunkReader {
     int load_by_aligned_meta(ChunkMeta* time_meta,
                              ChunkMeta* value_meta) override;
 
+    // Multi-value: load one time chunk + N value chunks.
+    int load_by_aligned_meta_multi(ChunkMeta* time_meta,
+                                   const std::vector<ChunkMeta*>& value_metas);
+
     int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter,
                       common::PageArena& pa) override;
-
     int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter,
                       common::PageArena& pa, int64_t min_time_hint,
                       int& row_offset, int& row_limit) override;
 
+    // Multi-value: get the number of value columns.
+    uint32_t get_value_column_count() const {
+        return multi_value_mode_ ? value_columns_.size() : 1;
+    }
+
+    // Multi-value: get chunk header for a specific value column.
+    ChunkHeader& get_value_chunk_header(uint32_t col) {
+        if (multi_value_mode_ && col < value_columns_.size()) {
+            return value_columns_[col]->chunk_header;
+        }
+        return value_chunk_header_;
+    }
+
+    bool is_multi_value_mode() const { return multi_value_mode_; }
+
+#ifdef ENABLE_THREADS
+    // Set external thread pool for parallel decode (not owned).
+    void set_decode_pool(common::ThreadPool* pool) { decode_pool_ = pool; }
+#endif
+
    private:
     bool should_skip_page_by_time(int64_t min_time_hint);
     bool should_skip_page_by_offset(int& row_offset);
@@ -100,7 +187,8 @@ class AlignedChunkReader : public IChunkReader {
                             common::ByteStream& in_stream_,
                             PageHeader& cur_page_header_,
                             uint32_t& chunk_visit_offset,
-                            ChunkHeader& chunk_header);
+                            ChunkHeader& chunk_header,
+                            int32_t* override_buf_size = nullptr);
     int read_from_file_and_rewrap(common::ByteStream& in_stream_,
                                   ChunkMeta*& chunk_meta,
                                   uint32_t& chunk_visit_offset,
@@ -114,6 +202,7 @@ class AlignedChunkReader : public IChunkReader {
                                            Filter* filter,
                                            common::PageArena* pa);
     bool prev_time_page_not_finish() const {
+        if (time_predecoded_) return page_time_cursor_ < page_time_count_;
         return (time_decoder_ && time_decoder_->has_remaining(time_in_)) ||
                time_in_.has_remaining();
     }
@@ -132,58 +221,119 @@ class AlignedChunkReader : public IChunkReader {
                                          common::ByteStream& value_in,
                                          common::RowAppender& row_appender,
                                          Filter* filter);
+    int i32_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int i64_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int float_DECODE_TV_BATCH(common::ByteStream& time_in,
+                              common::ByteStream& value_in,
+                              common::RowAppender& row_appender,
+                              Filter* filter);
+    int double_DECODE_TV_BATCH(common::ByteStream& time_in,
+                               common::ByteStream& value_in,
+                               common::RowAppender& row_appender,
+                               Filter* filter);
     int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in,
                                             common::ByteStream& value_in,
                                             common::RowAppender& row_appender,
                                             common::PageArena& pa,
                                             Filter* filter);
 
+    // ── Multi-value private methods (page-level, serial fallback) ────────
+    bool has_more_data_multi() const;
+    bool prev_any_value_page_not_finish_multi() const;
+    int get_next_page_multi(common::TsBlock* ret_tsblock,
+                            Filter* oneshoot_filter, common::PageArena& pa);
+    int get_next_page_multi_serial(common::TsBlock* ret_tsblock, Filter* filter,
+                                   common::PageArena& pa);
+    int skip_cur_page_multi();
+    bool cur_page_statisify_filter_multi(Filter* filter);
+    int decode_cur_value_pages_multi();
+    int decode_cur_value_page_data_for(ValueColumnState& col);
+    int ensure_value_page_loaded(ValueColumnState& col);
+    static int decompress_and_parse_value_page(ValueColumnState& col,
+                                               bool predecode);
+    void predecode_all_timestamps();
+    int decode_time_value_buf_into_tsblock_multi(common::TsBlock*& ret_tsblock,
+                                                 Filter* filter,
+                                                 common::PageArena* pa);
+    int multi_DECODE_TV_BATCH(common::TsBlock* ret_tsblock,
+                              common::RowAppender& row_appender, Filter* filter,
+                              common::PageArena* pa);
+    int build_page_plan(Filter* filter);
+    int decode_time_page_direct(const ChunkPageInfo& page_info,
+                                std::vector<int64_t>& out_times);
+    int decode_time_page_with(const ChunkPageInfo& page_info,
+                              std::vector<int64_t>& out_times, Decoder* decoder,
+                              Compressor* compressor);
+    int decode_all_planned_pages();
+    int decode_value_page_for_slot(uint32_t col_idx, size_t page_idx);
+    int decode_page_lazy(size_t page_idx);
+    void release_page_slot(size_t page_idx);
+    void release_current_page_state();
+    bool has_variable_length_value_column() const;
+    int count_non_null_prefix(const std::vector<uint8_t>& bitmap,
+                              int32_t row_limit) const;
+
    private:
     ReadFile* read_file_;
+    // ── Single-value mode fields (kept for backward compat) ──────────────
     ChunkMeta* time_chunk_meta_;
     ChunkMeta* value_chunk_meta_;
     common::String measurement_name_;
     ChunkHeader time_chunk_header_;
-    // TODO: support reading more than one measurement in AlignedChunkReader.
     ChunkHeader value_chunk_header_;
     PageHeader cur_time_page_header_;
     PageHeader cur_value_page_header_;
 
-    /*
-     * Data reader from file is stored in @in_stream_, and the size
-     * is stored in @file_data_buf_size_. Note, in_stream_.total_size_
-     * is used to limit deserialization, that is why we still have
-     * @file_data_buf_size_.
-     *
-     * Since we may want keep data of current page (and page header
-     * of next page) in memory, we need a byte-size cursor to tell
-     * us which byte we are processing, so we have @chunk_visit_offset_
-     * it refer to position from the start of chunk_header_,
-     * also refer to offset within the chunk (including chunk header).
-     * It advanced by step of a page header or a page tv data.
-     */
-    common::ByteStream time_in_stream_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_stream_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_stream_;
+    common::ByteStream value_in_stream_;
     int32_t file_data_time_buf_size_;
     int32_t file_data_value_buf_size_;
     uint32_t time_chunk_visit_offset_;
     uint32_t value_chunk_visit_offset_;
 
-    // Statistic *page_statistic_;
     Compressor* time_compressor_;
     Compressor* value_compressor_;
     Filter* time_filter_;
 
     Decoder* time_decoder_;
     Decoder* value_decoder_;
-    common::ByteStream time_in_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_;
+    common::ByteStream value_in_;
     char* time_uncompressed_buf_;
     char* value_uncompressed_buf_;
     std::vector<uint8_t> value_page_col_notnull_bitmap_;
     uint32_t value_page_data_num_;
     int32_t cur_value_index;
+
+    // ── Multi-value mode fields ──────────────────────────────────────────
+    bool multi_value_mode_ = false;
+    std::vector<ValueColumnState*> value_columns_;
+
+    // Pre-decoded timestamps for page-level parallel decode.
+    std::vector<int64_t> page_all_times_;
+    int page_time_count_ = 0;
+    int page_time_cursor_ = 0;
+    bool time_predecoded_ = false;
+
+    // ── Page-plan state ────────────────────────────────────────────────
+    std::vector<ChunkPageInfo> chunk_pages_;
+    std::vector<std::vector<int64_t>> per_page_times_;
+    bool page_plan_built_ = false;
+    bool current_page_loaded_ = false;
+    size_t current_page_plan_index_ = 0;
+
+#ifdef ENABLE_THREADS
+    common::ThreadPool* decode_pool_ = nullptr;  // borrowed, not owned
+    // Per-worker time decoder + compressor pool for parallel time-page decode.
+    // Sized to decode_pool_->num_threads() on first use, owned by this reader.
+    std::vector<Decoder*> time_decoder_pool_;
+    std::vector<Compressor*> time_compressor_pool_;
+#endif
 };
 
 }  // end namespace storage
-#endif  // READER_CHUNK_READER_H
+#endif  // READER_CHUNK_ALIGNED_READER_H
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc
index 93f42efd3..5fb9d80d2 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.cc
+++ b/cpp/src/reader/block/single_device_tsblock_reader.cc
@@ -19,8 +19,18 @@
 
 #include "single_device_tsblock_reader.h"
 
+#include <algorithm>
+#include <iostream>
+#include <set>
+
+#include "common/db_common.h"
+
 namespace storage {
 
+namespace {
+const char* kTimeOnlyContextName = "__time_only_aligned_context__";
+}
+
 SingleDeviceTsBlockReader::SingleDeviceTsBlockReader(
     DeviceQueryTask* device_query_task, uint32_t block_size,
     IMetadataQuerier* metadata_querier, TsFileIOReader* tsfile_io_reader,
@@ -55,6 +65,25 @@ int SingleDeviceTsBlockReader::init(DeviceQueryTask* device_query_task,
 int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
     const std::vector<ITimeseriesIndex*>& ts_indexes) {
     int64_t reference_time_count = -1;
+    // Single-chunk timeseries skip per-chunk statistic serialization
+    // (see TsFileIOWriter / TimeseriesIndex::deserialize_from); when the
+    // chunk-level statistic is null, fall back to the TimeseriesIndex's
+    // top-level statistic, which summarizes that lone chunk.
+    auto chunk_count = [](const common::SimpleList<ChunkMeta*>& list,
+                          Statistic* fallback) -> int64_t {
+        int64_t total = 0;
+        int nchunks = 0;
+        for (auto it = list.begin(); it != list.end(); it++) {
+            nchunks++;
+            if (it.get()->statistic_) {
+                total += it.get()->statistic_->count_;
+            }
+        }
+        if (total == 0 && nchunks == 1 && fallback != nullptr) {
+            total = fallback->count_;
+        }
+        return total;
+    };
     for (const auto* ts_index : ts_indexes) {
         if (ts_index == nullptr) {
             continue;
@@ -69,27 +98,30 @@ int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
             if (time_list == nullptr || value_list == nullptr) {
                 return -1;
             }
-
-            for (auto it = time_list->begin(); it != time_list->end(); it++) {
-                if (it.get()->statistic_) {
-                    time_count += it.get()->statistic_->count_;
-                }
-            }
-            for (auto it = value_list->begin(); it != value_list->end(); it++) {
-                if (it.get()->statistic_) {
-                    value_count += it.get()->statistic_->count_;
-                }
+            // Use the time-side and value-side top stats independently:
+            // the value-side count_ excludes nulls, so reusing it for the
+            // time chunk would misclassify sparse data as dense.
+            const auto* aligned_ti =
+                dynamic_cast<const AlignedTimeseriesIndex*>(ts_index);
+            if (aligned_ti == nullptr) {
+                return -1;
             }
+            Statistic* time_top_stat =
+                aligned_ti->time_ts_idx_ != nullptr
+                    ? aligned_ti->time_ts_idx_->get_statistic()
+                    : nullptr;
+            Statistic* value_top_stat =
+                aligned_ti->value_ts_idx_ != nullptr
+                    ? aligned_ti->value_ts_idx_->get_statistic()
+                    : nullptr;
+            time_count = chunk_count(*time_list, time_top_stat);
+            value_count = chunk_count(*value_list, value_top_stat);
         } else {
             auto* list = ts_index->get_chunk_meta_list();
             if (list == nullptr) {
                 return -1;
             }
-            for (auto it = list->begin(); it != list->end(); it++) {
-                if (it.get()->statistic_) {
-                    time_count += it.get()->statistic_->count_;
-                }
-            }
+            time_count = chunk_count(*list, ts_index->get_statistic());
             value_count = time_count;
         }
 
@@ -149,32 +181,198 @@ int SingleDeviceTsBlockReader::init_internal(DeviceQueryTask* device_query_task,
             time_series_indexs, pa_))) {
         return ret;
     }
-
     dense_row_count_ = compute_dense_row_count(time_series_indexs);
-
-    if (dense_row_count_ >= 0 && remaining_offset_ >= dense_row_count_) {
-        remaining_offset_ -= dense_row_count_;
-        delete current_block_;
-        current_block_ = nullptr;
-        return common::E_OK;
+    // Fast path: when every aligned column is provably dense (same total row
+    // count across time + value chunks), bulk-copy from SSI tsblock to caller
+    // tsblock instead of per-row merging.  compute_dense_row_count() returns
+    // -1 if the device is not provably dense, which gates safety.
+    const bool enable_dense_aligned_fast_path = true;
+    // Early device-level time skip: if time_filter is set and ALL chunks of
+    // this device have statistics that fall outside the filter range, skip the
+    // entire device.  Chunks without statistics are assumed to satisfy.
+    //
+    // Skip the entire shortcut when time_series_indexs is empty (e.g. a
+    // time-only query that selects no value column): there's nothing to
+    // prove outside the filter, and dropping out here would lose the
+    // time-only fallback path that runs below.
+    if (time_filter != nullptr && !time_series_indexs.empty()) {
+        bool examined_any = false;
+        bool all_outside = true;
+        for (const auto* ts_idx : time_series_indexs) {
+            if (ts_idx == nullptr) continue;
+            auto* chunk_list = ts_idx->is_aligned()
+                                   ? ts_idx->get_time_chunk_meta_list()
+                                   : ts_idx->get_chunk_meta_list();
+            if (chunk_list == nullptr) {
+                all_outside = false;
+                break;
+            }
+            examined_any = true;
+            for (auto it = chunk_list->begin(); it != chunk_list->end(); it++) {
+                if (it.get()->statistic_ == nullptr ||
+                    time_filter->satisfy(it.get()->statistic_)) {
+                    all_outside = false;
+                    break;
+                }
+            }
+            if (!all_outside) break;
+        }
+        if (examined_any && all_outside) {
+            // No data in this device matches the time filter.
+            delete current_block_;
+            current_block_ = nullptr;
+            return common::E_OK;
+        }
     }
+    // Try multi-value aligned path: one VectorMeasurementColumnContext (and
+    // the SSI it owns) reads all aligned value columns at once.  This is the
+    // entry point for AlignedChunkReader's per-column parallel decode pool
+    // (created in TsFileSeriesScanIterator::init_chunk_reader_multi when
+    // num_cols > 1 && parallel_read_enabled_); per-column
+    // SingleMeasurementColumnContext siblings would each open their own
+    // single-column SSI and never reach it. Falls back to the per-column path
+    // if ctx->init() fails (e.g. the device mixes aligned and non-aligned
+    // chunks).
+    bool used_multi = false;
+    std::set<std::string> multi_names;
+    {
+        bool can_multi = !time_series_indexs.empty();
+        auto& meas_cols =
+            device_query_task->get_column_mapping()->get_measurement_columns();
+        for (const auto& ts_idx : time_series_indexs) {
+            if (ts_idx == nullptr || !ts_idx->is_aligned()) {
+                can_multi = false;
+                break;
+            }
+        }
+        if (can_multi) {
+            std::vector<std::string> meas_names(meas_cols.begin(),
+                                                meas_cols.end());
+            // Stable order by first appearance in the result schema so the
+            // shared SSI's column buffers line up with the result columns.
+            std::sort(
+                meas_names.begin(), meas_names.end(),
+                [device_query_task](const std::string& lhs,
+                                    const std::string& rhs) {
+                    const auto& lhs_pos =
+                        device_query_task->get_column_mapping()->get_column_pos(
+                            lhs);
+                    const auto& rhs_pos =
+                        device_query_task->get_column_mapping()->get_column_pos(
+                            rhs);
+                    const int lhs_first =
+                        lhs_pos.empty() ? INT32_MAX : lhs_pos.front();
+                    const int rhs_first =
+                        rhs_pos.empty() ? INT32_MAX : rhs_pos.front();
+                    if (lhs_first != rhs_first) {
+                        return lhs_first < rhs_first;
+                    }
+                    return lhs < rhs;
+                });
+            std::vector<std::vector<int32_t>> pos_list;
+            pos_list.reserve(meas_names.size());
+            for (const auto& name : meas_names) {
+                const auto& pos =
+                    device_query_task->get_column_mapping()->get_column_pos(
+                        name);
+                pos_list.push_back(
+                    std::vector<int32_t>(pos.begin(), pos.end()));
+            }
 
-    int ssi_offset = 0;
-    int ssi_limit = -1;
-    if (dense_row_count_ >= 0) {
-        ssi_offset = remaining_offset_;
-        ssi_limit = remaining_limit_;
+            auto* ctx = new VectorMeasurementColumnContext(tsfile_io_reader_);
+            if (common::E_OK == ctx->init(device_query_task_, meas_names,
+                                          time_filter, pos_list, pa_)) {
+                // The shared ctx is referenced from N map entries; close()
+                // and the merge loop dedupe by pointer (already in place).
+                for (const auto& name : meas_names) {
+                    field_column_contexts_.insert(std::make_pair(name, ctx));
+                    multi_names.insert(name);
+                }
+                aligned_col_count_ = meas_names.size();
+                used_multi = true;
+            } else {
+                delete ctx;
+            }
+        }
     }
 
+    // Per-column path for anything not absorbed by the multi-value ctx
+    // (e.g. fallback when init() failed, or a non-aligned column would have
+    // been added before we generalize this for mixed schemas).
     for (const auto& time_series_index : time_series_indexs) {
-        construct_column_context(time_series_index, time_filter, ssi_offset,
-                                 ssi_limit);
+        if (time_series_index == nullptr) {
+            continue;
+        }
+        const std::string measurement_name =
+            time_series_index->get_measurement_name().to_std_string();
+        if (used_multi && multi_names.count(measurement_name) > 0) {
+            continue;
+        }
+        construct_column_context(time_series_index, time_filter, 0, -1);
+    }
+
+    if (field_column_contexts_.empty()) {
+        // If value columns were actually requested but none produced a
+        // context, every one of them read empty under the current filter
+        // (e.g. an empty/inverted time range, or a filter that matches no
+        // rows).  The result is simply empty -- return it directly.  The
+        // time-only fallback below is only for genuine time-only queries (no
+        // value columns); routing an all-empty value query through it would
+        // call alloc_multi_ssi(), which is aligned-only and returns
+        // E_NOT_SUPPORT on non-aligned devices.
+        bool any_value_column_requested = false;
+        for (const auto* ts_idx : time_series_indexs) {
+            if (ts_idx != nullptr) {
+                any_value_column_requested = true;
+                break;
+            }
+        }
+        if (any_value_column_requested) {
+            delete current_block_;
+            current_block_ = nullptr;
+            return common::E_OK;
+        }
+
+        std::vector<std::string> empty_measurements;
+        std::vector<std::vector<int32_t>> empty_positions;
+        auto* time_only_ctx =
+            new VectorMeasurementColumnContext(tsfile_io_reader_);
+        int time_only_ret =
+            time_only_ctx->init(device_query_task_, empty_measurements,
+                                time_filter, empty_positions, pa_);
+        if (common::E_OK == time_only_ret) {
+            field_column_contexts_.insert(
+                std::make_pair(kTimeOnlyContextName, time_only_ctx));
+        } else {
+            delete time_only_ctx;
+            // Only treat "no data" as an acceptable empty result; I/O
+            // errors, OOM, and corruption from the time-only init must
+            // propagate so the caller sees the actual failure instead of
+            // an empty resultset wearing E_OK.
+            if (time_only_ret != common::E_NO_MORE_DATA) {
+                delete current_block_;
+                current_block_ = nullptr;
+                return time_only_ret;
+            }
+        }
     }
 
-    if (dense_row_count_ >= 0 && !field_column_contexts_.empty()) {
-        auto* first_ctx = field_column_contexts_.begin()->second;
-        remaining_offset_ = first_ctx->get_ssi_row_offset();
-        remaining_limit_ = first_ctx->get_ssi_row_limit();
+    // Detect aligned fast path: every field column comes from an aligned chunk.
+    if (!field_column_contexts_.empty() && enable_dense_aligned_fast_path &&
+        dense_row_count_ >= 0 &&
+        aligned_col_count_ == field_column_contexts_.size()) {
+        all_aligned_ = true;
+        aligned_vec_.reserve(field_column_contexts_.size());
+        if (used_multi) {
+            // Single shared VectorMeasurementColumnContext handles all
+            // columns — push it once, otherwise we'd schedule the same
+            // bulk_copy_into N times.
+            aligned_vec_.push_back(field_column_contexts_.begin()->second);
+        } else {
+            for (auto& kv : field_column_contexts_) {
+                aligned_vec_.push_back(kv.second);
+            }
+        }
     }
 
     if (field_column_contexts_.empty()) {
@@ -218,18 +416,25 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) {
 
     current_block_->reset();
 
-    uint32_t effective_block_size = block_size_;
-    if (remaining_limit_ > 0) {
-        effective_block_size =
-            std::min(block_size_, static_cast<uint32_t>(remaining_limit_));
+    if (all_aligned_) {
+        return has_next_aligned(has_next);
     }
 
     bool next_time_set = false;
     next_time_ = -1;
 
     std::vector<MeasurementColumnContext*> min_time_columns;
-    while (current_block_->get_row_count() < effective_block_size) {
+    while (current_block_->get_row_count() < block_size_) {
+        if (remaining_limit_ > 0 &&
+            current_block_->get_row_count() >=
+                static_cast<uint32_t>(remaining_limit_)) {
+            break;
+        }
+        std::set<MeasurementColumnContext*> visited_contexts;
         for (auto& column_context : field_column_contexts_) {
+            if (!visited_contexts.insert(column_context.second).second) {
+                continue;
+            }
             int64_t time;
             if (IS_FAIL(column_context.second->get_current_time(time))) {
                 continue;
@@ -293,6 +498,114 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) {
     return ret;
 }
 
+int SingleDeviceTsBlockReader::has_next_aligned(bool& result_has_next) {
+    int ret = common::E_OK;
+    int time_in_query_index = tuple_desc_.get_time_column_index();
+
+    while (current_block_->get_row_count() < block_size_) {
+        if (aligned_vec_.empty()) break;
+
+        if (remaining_limit_ == 0) break;
+
+        // Check if first column has data.
+        uint32_t avail = aligned_vec_[0]->available_rows();
+        if (avail == 0) {
+            for (auto* ctx : aligned_vec_) {
+                ctx->remove_from(field_column_contexts_);
+            }
+            aligned_vec_.clear();
+            break;
+        }
+
+        // Find the batch size: min of output capacity and all SSI
+        // availabilities.
+        uint32_t batch = block_size_ - current_block_->get_row_count();
+        for (auto* ctx : aligned_vec_) {
+            uint32_t ctx_avail = ctx->available_rows();
+            if (ctx_avail == 0) {
+                batch = 0;
+                break;
+            }
+            if (ctx_avail < batch) batch = ctx_avail;
+        }
+        if (batch == 0) {
+            for (auto* ctx : aligned_vec_) {
+                ctx->remove_from(field_column_contexts_);
+            }
+            aligned_vec_.clear();
+            break;
+        }
+
+        // Handle offset: skip rows before copying.
+        if (remaining_offset_ > 0) {
+            uint32_t skip = std::min(batch, (uint32_t)remaining_offset_);
+            for (auto* ctx : aligned_vec_) {
+                int sr = ctx->skip_rows(skip);
+                if (sr != common::E_OK) return sr;
+            }
+            remaining_offset_ -= skip;
+            continue;
+        }
+
+        // Handle limit: cap the batch size.
+        if (remaining_limit_ > 0) {
+            batch = std::min(batch, (uint32_t)remaining_limit_);
+        }
+
+        // First SSI: bulk copy time + values + row_count.
+        int copy_ret = aligned_vec_[0]->bulk_copy_into(
+            col_appenders_, col_appenders_[time_column_index_], row_appender_,
+            batch);
+        // E_NO_MORE_DATA is the normal end-of-stream signal; any other
+        // error (I/O, decode, corruption) must propagate to the caller
+        // instead of silently truncating the result with E_OK.
+        if (copy_ret != common::E_OK && copy_ret != common::E_NO_MORE_DATA) {
+            return copy_ret;
+        }
+
+        // Also copy time to explicit time column if requested.
+        if (time_in_query_index != -1) {
+            common::Vector* time_vec =
+                current_block_->get_vector(time_column_index_);
+            char* time_src =
+                time_vec->get_value_data().get_data() +
+                (current_block_->get_row_count() - batch) * sizeof(int64_t);
+            col_appenders_[time_in_query_index]->bulk_append_fixed(
+                time_src, batch, sizeof(int64_t));
+        }
+
+        // Other SSIs: bulk copy values only (no time, no row_count). Any
+        // hard error from these columns also has to propagate; otherwise a
+        // truncated/corrupt value column would silently emit nulls.
+        for (size_t i = 1; i < aligned_vec_.size(); i++) {
+            int other_ret = aligned_vec_[i]->bulk_copy_into(
+                col_appenders_, nullptr, nullptr, batch);
+            if (other_ret != common::E_OK &&
+                other_ret != common::E_NO_MORE_DATA) {
+                return other_ret;
+            }
+        }
+
+        // Decrement limit for data already copied.
+        if (remaining_limit_ > 0) {
+            remaining_limit_ -= batch;
+        }
+
+        // If first SSI signaled no-more-data, stop after accounting.
+        if (copy_ret == common::E_NO_MORE_DATA) break;
+    }
+
+    if (current_block_->get_row_count() > 0) {
+        if (RET_FAIL(fill_ids())) return ret;
+        current_block_->fill_trailling_nulls();
+        last_block_returned_ = false;
+        result_has_next = true;
+    } else {
+        result_has_next = false;
+    }
+    return ret;
+}
+
 int SingleDeviceTsBlockReader::fill_measurements(
     std::vector<MeasurementColumnContext*>& column_contexts) {
     int ret = common::E_OK;
@@ -400,8 +713,15 @@ int SingleDeviceTsBlockReader::next(common::TsBlock*& ret_block) {
 }
 
 void SingleDeviceTsBlockReader::close() {
+    aligned_vec_.clear();  // non-owning; owned by field_column_contexts_
+    // De-duplicate pointers before deleting: VectorMeasurementColumnContext
+    // has multiple map entries pointing to the same object.
+    std::set<MeasurementColumnContext*> unique_contexts;
     for (auto& column_context : field_column_contexts_) {
-        delete column_context.second;
+        unique_contexts.insert(column_context.second);
+    }
+    for (auto* ctx : unique_contexts) {
+        delete ctx;
     }
     for (auto& col_appender : col_appenders_) {
         if (col_appender) {
@@ -413,9 +733,7 @@ void SingleDeviceTsBlockReader::close() {
         delete row_appender_;
         row_appender_ = nullptr;
     }
-    if (device_query_task_) {
-        device_query_task_->~DeviceQueryTask();
-    }
+    device_query_task_ = nullptr;  // owned by the task iterator arena
     if (current_block_) {
         delete current_block_;
         current_block_ = nullptr;
@@ -430,24 +748,34 @@ int SingleDeviceTsBlockReader::construct_column_context(
         (!time_series_index->is_aligned() &&
          time_series_index->get_chunk_meta_list()->empty())) {
     } else if (time_series_index->is_aligned()) {
+        const int effective_ssi_offset = dense_row_count_ >= 0 ? ssi_offset : 0;
+        const int effective_ssi_limit = dense_row_count_ >= 0 ? ssi_limit : -1;
         const AlignedTimeseriesIndex* aligned_time_series_index =
             dynamic_cast<const AlignedTimeseriesIndex*>(time_series_index);
         if (aligned_time_series_index == nullptr) {
             assert(false);
         }
+        if (aligned_time_series_index->value_ts_idx_ != nullptr &&
+            aligned_time_series_index->value_ts_idx_->get_statistic() !=
+                nullptr &&
+            aligned_time_series_index->value_ts_idx_->get_statistic()->count_ ==
+                0) {
+            return ret;
+        }
         SingleMeasurementColumnContext* column_context =
             new SingleMeasurementColumnContext(tsfile_io_reader_);
         if (RET_FAIL(column_context->init(
                 device_query_task_, time_series_index, time_filter,
                 device_query_task_->get_column_mapping()->get_column_pos(
                     time_series_index->get_measurement_name().to_std_string()),
-                pa_, ssi_offset, ssi_limit))) {
+                pa_, effective_ssi_offset, effective_ssi_limit))) {
             delete column_context;
             return ret;
         }
         field_column_contexts_.insert(std::make_pair(
             time_series_index->get_measurement_name().to_std_string(),
             column_context));
+        aligned_col_count_++;
     } else {
         SingleMeasurementColumnContext* column_context =
             new SingleMeasurementColumnContext(tsfile_io_reader_);
@@ -568,4 +896,342 @@ void SingleMeasurementColumnContext::fill_into(
     }
 }
 
+uint32_t SingleMeasurementColumnContext::available_rows() const {
+    if (!time_iter_ || time_iter_->end()) return 0;
+    return time_iter_->remaining();
+}
+
+int SingleMeasurementColumnContext::bulk_copy_into(
+    std::vector<common::ColAppender*>& col_appenders,
+    common::ColAppender* time_appender, common::RowAppender* row_appender,
+    uint32_t count) {
+    int ret = common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    auto dt = value_iter_->get_data_type();
+    bool is_varlen =
+        (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+
+    // Bulk copy time column (only first SSI does this).
+    if (time_appender) {
+        time_appender->bulk_append_fixed(time_iter_->data_ptr(), count,
+                                         time_elem_size);
+    }
+
+    // Advance output row count (only first SSI does this).
+    if (row_appender) {
+        row_appender->add_rows(count);
+    }
+
+    if (is_varlen || value_iter_->has_null()) {
+        for (uint32_t r = 0; r < count; r++) {
+            uint32_t len = 0;
+            bool is_null = false;
+            char* val = value_iter_->read(&len, &is_null);
+            for (int32_t pos : pos_in_result_) {
+                auto* appender = col_appenders[pos + 1];
+                appender->add_row();
+                if (is_null) {
+                    appender->append_null();
+                } else {
+                    appender->append(val, len);
+                }
+            }
+            value_iter_->next();
+        }
+    } else {
+        const uint32_t val_elem_size = common::get_data_type_size(dt);
+        char* val_ptr = value_iter_->data_ptr();
+        for (int32_t pos : pos_in_result_) {
+            col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count,
+                                                      val_elem_size);
+        }
+        value_iter_->advance(count, val_elem_size);
+    }
+
+    // Advance source iterators.
+    time_iter_->advance(count, time_elem_size);
+
+    // If source TsBlock exhausted, load next.
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) {
+            return ret;
+        }
+    }
+    return ret;
+}
+
+int SingleMeasurementColumnContext::skip_rows(uint32_t count) {
+    if (!time_iter_ || time_iter_->end()) return common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    auto dt = value_iter_->get_data_type();
+    bool is_varlen =
+        (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+    uint32_t to_skip = std::min(count, time_iter_->remaining());
+    time_iter_->advance(to_skip, time_elem_size);
+    if (is_varlen || value_iter_->has_null()) {
+        for (uint32_t r = 0; r < to_skip; r++) {
+            value_iter_->next();
+        }
+    } else {
+        const uint32_t val_elem_size = common::get_data_type_size(dt);
+        value_iter_->advance(to_skip, val_elem_size);
+    }
+    if (time_iter_->end()) {
+        // Propagate hard errors from the next-tsblock load; E_NO_MORE_DATA
+        // is the legitimate end-of-stream signal and gets squashed back to
+        // E_OK so the caller's outer loop notices via available_rows()==0.
+        int r = get_next_tsblock(false);
+        if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r;
+    }
+    return common::E_OK;
+}
+
+// ── VectorMeasurementColumnContext implementation ───────────────────────
+
+VectorMeasurementColumnContext::~VectorMeasurementColumnContext() {
+    if (time_iter_) {
+        delete time_iter_;
+        time_iter_ = nullptr;
+    }
+    for (auto* vi : value_iters_) {
+        if (vi) delete vi;
+    }
+    value_iters_.clear();
+    if (ssi_) {
+        ssi_->revert_tsblock();
+    }
+    tsfile_io_reader_->revert_ssi(ssi_);
+    ssi_ = nullptr;
+}
+
+int VectorMeasurementColumnContext::init(
+    DeviceQueryTask* device_query_task,
+    const std::vector<std::string>& measurement_names, Filter* time_filter,
+    std::vector<std::vector<int32_t>>& pos_in_result, common::PageArena& pa) {
+    int ret = common::E_OK;
+    pos_in_result_ = pos_in_result;
+    column_names_ = measurement_names;
+    if (RET_FAIL(tsfile_io_reader_->alloc_multi_ssi(
+            device_query_task->get_device_id(), measurement_names, ssi_, pa,
+            time_filter))) {
+        return ret;
+    }
+    if (RET_FAIL(get_next_tsblock(true))) {
+        return ret;
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::get_next_tsblock(bool alloc_mem) {
+    int ret = common::E_OK;
+    if (tsblock_ != nullptr) {
+        if (time_iter_) {
+            delete time_iter_;
+            time_iter_ = nullptr;
+        }
+        for (auto* vi : value_iters_) {
+            if (vi) delete vi;
+        }
+        value_iters_.clear();
+        tsblock_->reset();
+    }
+    if (RET_FAIL(ssi_->get_next(tsblock_, alloc_mem))) {
+        if (time_iter_) {
+            delete time_iter_;
+            time_iter_ = nullptr;
+        }
+        for (auto* vi : value_iters_) {
+            if (vi) delete vi;
+        }
+        value_iters_.clear();
+        if (tsblock_) {
+            ssi_->destroy();
+            tsblock_ = nullptr;
+        }
+    } else {
+        time_iter_ = new common::ColIterator(0, tsblock_);
+        uint32_t num_value_cols = tsblock_->get_column_count() - 1;
+        value_iters_.reserve(num_value_cols);
+        for (uint32_t c = 0; c < num_value_cols; c++) {
+            value_iters_.push_back(new common::ColIterator(c + 1, tsblock_));
+        }
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::get_current_time(int64_t& time) {
+    if (!time_iter_ || time_iter_->end()) return common::E_NO_MORE_DATA;
+    uint32_t len = 0;
+    time = *(int64_t*)(time_iter_->read(&len));
+    return common::E_OK;
+}
+
+int VectorMeasurementColumnContext::get_current_value(char*& value,
+                                                      uint32_t& len) {
+    if (value_iters_.empty() || value_iters_[0]->end())
+        return common::E_NO_MORE_DATA;
+    bool is_null = false;
+    value = value_iters_[0]->read(&len, &is_null);
+    return common::E_OK;
+}
+
+int VectorMeasurementColumnContext::move_iter() {
+    int ret = common::E_OK;
+    time_iter_->next();
+    for (auto* vi : value_iters_) vi->next();
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) return ret;
+    }
+    return ret;
+}
+
+void VectorMeasurementColumnContext::fill_into(
+    std::vector<common::ColAppender*>& col_appenders) {
+    for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size();
+         c++) {
+        uint32_t len = 0;
+        bool is_null = false;
+        char* val = value_iters_[c]->read(&len, &is_null);
+        for (int32_t pos : pos_in_result_[c]) {
+            col_appenders[pos + 1]->add_row();
+            if (is_null) {
+                col_appenders[pos + 1]->append_null();
+            } else {
+                col_appenders[pos + 1]->append(val, len);
+            }
+        }
+    }
+}
+
+void VectorMeasurementColumnContext::remove_from(
+    std::map<std::string, MeasurementColumnContext*>& column_context_map) {
+    if (column_names_.empty()) {
+        for (auto it = column_context_map.begin();
+             it != column_context_map.end();) {
+            if (it->second == this) {
+                it = column_context_map.erase(it);
+            } else {
+                ++it;
+            }
+        }
+        delete this;
+        return;
+    }
+    for (const auto& name : column_names_) {
+        column_context_map.erase(name);
+    }
+    delete this;
+}
+
+uint32_t VectorMeasurementColumnContext::available_rows() const {
+    if (!time_iter_ || time_iter_->end()) return 0;
+    return time_iter_->remaining();
+}
+
+int VectorMeasurementColumnContext::bulk_copy_into(
+    std::vector<common::ColAppender*>& col_appenders,
+    common::ColAppender* time_appender, common::RowAppender* row_appender,
+    uint32_t count) {
+    int ret = common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+
+    // Bulk copy time column (only when time_appender is provided).
+    if (time_appender) {
+        time_appender->bulk_append_fixed(time_iter_->data_ptr(), count,
+                                         time_elem_size);
+    }
+
+    // Advance output row count.
+    if (row_appender) {
+        row_appender->add_rows(count);
+    }
+
+    // Bulk copy each value column to its output positions, propagating nulls.
+    for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size();
+         c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        bool src_has_null = value_iters_[c]->has_null();
+
+        if (is_varlen || src_has_null) {
+            // Row-by-row copy for variable-length columns using the
+            // ColIterator next()/read() which properly tracks offsets. Fixed
+            // length columns with nulls also need this path because their
+            // payload buffer only stores non-null values.
+            auto* iter = value_iters_[c];
+            for (uint32_t r = 0; r < count; r++) {
+                uint32_t len = 0;
+                bool is_null = false;
+                char* val = iter->read(&len, &is_null);
+                for (int32_t pos : pos_in_result_[c]) {
+                    auto* appender = col_appenders[pos + 1];
+                    appender->add_row();
+                    if (is_null) {
+                        appender->append_null();
+                    } else {
+                        appender->append(val, len);
+                    }
+                }
+                iter->next();
+            }
+        } else {
+            // Bulk copy for fixed-length columns
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            char* val_ptr = value_iters_[c]->data_ptr();
+            for (int32_t pos : pos_in_result_[c]) {
+                col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count,
+                                                          val_elem_size);
+            }
+        }
+    }
+
+    // Advance all source iterators.
+    time_iter_->advance(count, time_elem_size);
+    for (uint32_t c = 0; c < value_iters_.size(); c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        if (!is_varlen && !value_iters_[c]->has_null()) {
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            value_iters_[c]->advance(count, val_elem_size);
+        }
+        // Variable-length iterators and fixed-length iterators with nulls were
+        // already advanced in the copy loop above.
+    }
+
+    // If source TsBlock exhausted, load next.
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) return ret;
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::skip_rows(uint32_t count) {
+    if (!time_iter_ || time_iter_->end()) return common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    uint32_t to_skip = std::min(count, time_iter_->remaining());
+    time_iter_->advance(to_skip, time_elem_size);
+    for (uint32_t c = 0; c < value_iters_.size(); c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        if (!is_varlen && !value_iters_[c]->has_null()) {
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            value_iters_[c]->advance(to_skip, val_elem_size);
+        } else {
+            // Variable-length and fixed-length-with-null vectors need next()
+            // to keep the payload offset aligned with non-null rows.
+            for (uint32_t r = 0; r < to_skip; r++) {
+                value_iters_[c]->next();
+            }
+        }
+    }
+    if (time_iter_->end()) {
+        int r = get_next_tsblock(false);
+        if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r;
+    }
+    return common::E_OK;
+}
+
 }  // namespace storage
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.h b/cpp/src/reader/block/single_device_tsblock_reader.h
index 07d16860c..e74304baf 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.h
+++ b/cpp/src/reader/block/single_device_tsblock_reader.h
@@ -65,6 +65,9 @@ class SingleDeviceTsBlockReader : public TsBlockReader {
     int advance_column(MeasurementColumnContext* column_context);
     int32_t compute_dense_row_count(
         const std::vector<ITimeseriesIndex*>& ts_indexes);
+    // Fast path for aligned data: all columns share the same timestamps,
+    // so no per-row merge-sort is needed.
+    int has_next_aligned(bool& has_next);
 
     DeviceQueryTask* device_query_task_;
     Filter* field_filter_;
@@ -83,6 +86,11 @@ class SingleDeviceTsBlockReader : public TsBlockReader {
     int remaining_offset_ = 0;
     int remaining_limit_ = -1;
     int32_t dense_row_count_ = -1;
+    // Populated in init() when every field column comes from an aligned chunk.
+    // Provides cache-friendly vector iteration for has_next_aligned().
+    bool all_aligned_ = false;
+    uint32_t aligned_col_count_ = 0;
+    std::vector<MeasurementColumnContext*> aligned_vec_;
 };
 
 class MeasurementColumnContext {
@@ -116,6 +124,13 @@ class MeasurementColumnContext {
         return ssi_ ? ssi_->get_row_limit() : -1;
     }
 
+    virtual uint32_t available_rows() const = 0;
+    virtual int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                               common::ColAppender* time_appender,
+                               common::RowAppender* row_appender,
+                               uint32_t count) = 0;
+    virtual int skip_rows(uint32_t count) = 0;
+
    protected:
     TsFileIOReader* tsfile_io_reader_;
     TsFileSeriesScanIterator* ssi_ = nullptr;
@@ -124,7 +139,7 @@ class MeasurementColumnContext {
     common::ColIterator* value_iter_ = nullptr;
 };
 
-class SingleMeasurementColumnContext final : public MeasurementColumnContext {
+class SingleMeasurementColumnContext : public MeasurementColumnContext {
    public:
     explicit SingleMeasurementColumnContext(TsFileIOReader* tsfile_io_reader)
         : MeasurementColumnContext(tsfile_io_reader) {}
@@ -155,6 +170,12 @@ class SingleMeasurementColumnContext final : public MeasurementColumnContext {
     int get_current_time(int64_t& time) override;
     int get_current_value(char*& value, uint32_t& len) override;
     int move_iter() override;
+    uint32_t available_rows() const override;
+    int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                       common::ColAppender* time_appender,
+                       common::RowAppender* row_appender,
+                       uint32_t count) override;
+    int skip_rows(uint32_t count) override;
 
    private:
     std::string column_name_;
@@ -165,21 +186,31 @@ class VectorMeasurementColumnContext final : public MeasurementColumnContext {
    public:
     explicit VectorMeasurementColumnContext(TsFileIOReader* tsfile_io_reader)
         : MeasurementColumnContext(tsfile_io_reader) {}
+    ~VectorMeasurementColumnContext() override;
 
     void fill_into(std::vector<common::ColAppender*>& col_appenders) override;
     void remove_from(std::map<std::string, MeasurementColumnContext*>&
                          column_context_map) override;
     int init(DeviceQueryTask* device_query_task,
-             const ITimeseriesIndex* time_series_index, Filter* time_filter,
+             const std::vector<std::string>& measurement_names,
+             Filter* time_filter,
              std::vector<std::vector<int32_t>>& pos_in_result,
              common::PageArena& pa);
     int get_next_tsblock(bool alloc_mem) override;
     int get_current_time(int64_t& time) override;
     int get_current_value(char*& value, uint32_t& len) override;
     int move_iter() override;
+    uint32_t available_rows() const override;
+    int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                       common::ColAppender* time_appender,
+                       common::RowAppender* row_appender,
+                       uint32_t count) override;
+    int skip_rows(uint32_t count) override;
 
    private:
+    std::vector<std::string> column_names_;
     std::vector<std::vector<int32_t>> pos_in_result_;
+    std::vector<common::ColIterator*> value_iters_;
 };
 
 class IdColumnContext {
diff --git a/cpp/src/reader/bloom_filter.cc b/cpp/src/reader/bloom_filter.cc
index 068c96e27..4aff4ecd3 100644
--- a/cpp/src/reader/bloom_filter.cc
+++ b/cpp/src/reader/bloom_filter.cc
@@ -208,6 +208,26 @@ int BloomFilter::add_path_entry(const String& device_name,
     return E_OK;
 }
 
+bool BloomFilter::contains(const String& device_name,
+                           const String& measurement_name) {
+    if (size_ == 0) {
+        return true;  // empty filter — assume present
+    }
+    String entry = get_entry_string(device_name, measurement_name);
+    if (IS_NULL(entry.buf_)) {
+        return true;  // OOM — conservatively assume present
+    }
+    for (uint32_t i = 0; i < hash_func_count_; i++) {
+        int32_t hv = hash_func_arr_[i].hash(entry);
+        if (!bitset_.get(hv)) {
+            free_entry_buf(entry.buf_);
+            return false;  // definitely not present
+        }
+    }
+    free_entry_buf(entry.buf_);
+    return true;  // probably present
+}
+
 int BloomFilter::serialize_to(ByteStream& out) {
     int ret = E_OK;
     uint8_t* filter_data_bytes = nullptr;
diff --git a/cpp/src/reader/bloom_filter.h b/cpp/src/reader/bloom_filter.h
index b00de4a84..323cfa8a4 100644
--- a/cpp/src/reader/bloom_filter.h
+++ b/cpp/src/reader/bloom_filter.h
@@ -74,6 +74,11 @@ class BitSet {
         int32_t word_offset = pos % 64;
         words_[word_idx] |= (1ull << word_offset);
     }
+    bool get(int32_t pos) const {
+        int32_t word_idx = pos / 64;
+        int32_t word_offset = pos % 64;
+        return (words_[word_idx] & (1ull << word_offset)) != 0;
+    }
     int32_t get_words_in_use() const {
         for (int32_t i = word_count_ - 1; i >= 0; i--) {
             if (words_[i] != 0) {
@@ -107,8 +112,11 @@ class BloomFilter {
     void destroy() { bitset_.destroy(); }
     int add_path_entry(const common::String& device_name,
                        const common::String& measurement_name);
+    bool contains(const common::String& device_name,
+                  const common::String& measurement_name);
     int serialize_to(common::ByteStream& out);
     int deserialize_from(common::ByteStream& in);
+    bool is_empty() const { return size_ == 0; }
     BitSet* get_bit_set() { return &bitset_; }
 
    private:
diff --git a/cpp/src/reader/chunk_reader.cc b/cpp/src/reader/chunk_reader.cc
index b150f7851..7c36ea07f 100644
--- a/cpp/src/reader/chunk_reader.cc
+++ b/cpp/src/reader/chunk_reader.cc
@@ -422,8 +422,6 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
                 row_appender.backoff_add_row();
                 continue;
             } else {
-                /*std::cout << "decoder: time=" << time << ", value=" << value
-                 * << std::endl;*/
                 row_appender.append(0, (char*)&time, sizeof(time));
                 row_appender.append(1, (char*)&value, sizeof(value));
             }
@@ -432,6 +430,350 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
     return ret;
 }
 
+int ChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in,
+                                     RowAppender& row_appender,
+                                     Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int32_t values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        // Cap each pass to what the appender can still hold; the old
+        // "remaining < BATCH → OVERFLOW" check made progress impossible on
+        // TsBlocks with capacity below BATCH.
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_int32(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_int32(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_int32(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        // Time and value chunks are written in lock-step; any discrepancy
+        // means the file is truncated or corrupted.  Reading uninitialised
+        // values[i] would silently surface garbage as decoded rows.
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (filter != nullptr && !block_all_pass &&
+                !filter->satisfy(times[i], (int64_t)values[i])) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(int32_t));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in,
+                                     RowAppender& row_appender,
+                                     Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int64_t values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_int64(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_int64(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_int64(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (filter != nullptr && !block_all_pass &&
+                !filter->satisfy(times[i], values[i])) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(int64_t));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
+                                       ByteStream& value_in,
+                                       RowAppender& row_appender,
+                                       Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    float values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_float(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_float(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_float(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(float));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in,
+                                        ByteStream& value_in,
+                                        RowAppender& row_appender,
+                                        Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    double values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_double(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_double(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_double(
+                values, time_count, value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(double));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
 int ChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
                                                      ByteStream& value_in,
                                                      RowAppender& row_appender,
@@ -472,23 +814,21 @@ int ChunkReader::decode_tv_buf_into_tsblock_by_datatype(ByteStream& time_in,
             break;
         case common::DATE:
         case common::INT32:
-            // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_,
-            // row_appender);
-            ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_,
-                                                   row_appender, filter);
+            ret =
+                i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case TIMESTAMP:
         case common::INT64:
-            DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_,
-                                         row_appender);
+            ret =
+                i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::FLOAT:
-            DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_,
-                                         row_appender);
+            ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                        filter);
             break;
         case common::DOUBLE:
-            DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_,
-                                         row_appender);
+            ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                         filter);
             break;
         case common::TEXT:
         case common::BLOB:
diff --git a/cpp/src/reader/chunk_reader.h b/cpp/src/reader/chunk_reader.h
index 3acd9c3cf..a1196c330 100644
--- a/cpp/src/reader/chunk_reader.h
+++ b/cpp/src/reader/chunk_reader.h
@@ -105,6 +105,20 @@ class ChunkReader : public IChunkReader {
                                          common::ByteStream& value_in,
                                          common::RowAppender& row_appender,
                                          Filter* filter);
+    int i32_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int i64_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int float_DECODE_TV_BATCH(common::ByteStream& time_in,
+                              common::ByteStream& value_in,
+                              common::RowAppender& row_appender,
+                              Filter* filter);
+    int double_DECODE_TV_BATCH(common::ByteStream& time_in,
+                               common::ByteStream& value_in,
+                               common::RowAppender& row_appender,
+                               Filter* filter);
     int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in,
                                             common::ByteStream& value_in,
                                             common::RowAppender& row_appender,
@@ -131,7 +145,7 @@ class ChunkReader : public IChunkReader {
      * also refer to offset within the chunk (including chunk header).
      * It advanced by step of a page header or a page tv data.
      */
-    common::ByteStream in_stream_{common::MOD_CHUNK_READER};
+    common::ByteStream in_stream_;
     int32_t file_data_buf_size_;
     uint32_t chunk_visit_offset_;
 
@@ -141,8 +155,8 @@ class ChunkReader : public IChunkReader {
 
     Decoder* time_decoder_;
     Decoder* value_decoder_;
-    common::ByteStream time_in_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_;
+    common::ByteStream value_in_;
     char* uncompressed_buf_;
 };
 
diff --git a/cpp/src/reader/device_meta_iterator.cc b/cpp/src/reader/device_meta_iterator.cc
index bf01b23a5..955965624 100644
--- a/cpp/src/reader/device_meta_iterator.cc
+++ b/cpp/src/reader/device_meta_iterator.cc
@@ -186,7 +186,17 @@ int DeviceMetaIterator::load_results_direct() {
     ret = io_reader_->load_device_index_entry(device_comparable,
                                               device_index_entry, end_offset);
 
-    if (ret != common::E_OK || device_index_entry == nullptr) {
+    // "Device not present in this file" is the only ret value we should
+    // suppress.  Read failures and corrupt index entries used to be folded
+    // into "no matches"; the caller then couldn't distinguish a clean miss
+    // from a partial read that silently dropped real data.  Surface them.
+    if (ret == common::E_DEVICE_NOT_EXIST || ret == common::E_NOT_EXIST) {
+        return common::E_OK;
+    }
+    if (ret != common::E_OK) {
+        return ret;
+    }
+    if (device_index_entry == nullptr) {
         return common::E_OK;
     }
 
diff --git a/cpp/src/reader/filter/and_filter.h b/cpp/src/reader/filter/and_filter.h
index b324a3f81..289115baf 100644
--- a/cpp/src/reader/filter/and_filter.h
+++ b/cpp/src/reader/filter/and_filter.h
@@ -19,6 +19,8 @@
 #ifndef READER_FILTER_OPERATOR_AND_FILTER_H
 #define READER_FILTER_OPERATOR_AND_FILTER_H
 
+#include <memory>
+
 #include "binary_filter.h"
 // #include "storage/storage_utils.h"
 
@@ -48,6 +50,27 @@ class AndFilter : public BinaryFilter {
                right_->contain_start_end_time(start_time, end_time);
     }
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+        // Inline buffer covers the common per-page BATCH=129 callers; only
+        // out-of-spec larger counts fall back to a heap allocation.
+        constexpr int kInlineCap = 256;
+        bool inline_buf[kInlineCap];
+        std::unique_ptr<bool[]> heap_buf;
+        bool* mask_right = inline_buf;
+        if (count > kInlineCap) {
+            heap_buf.reset(new bool[count]);
+            mask_right = heap_buf.get();
+        }
+        left_->satisfy_batch_time(times, count, mask);
+        right_->satisfy_batch_time(times, count, mask_right);
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = mask[i] && mask_right[i];
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
+
     std::vector<TimeRange*>* get_time_ranges() {
         std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
         std::vector<TimeRange*>* left_time_ranges = left_->get_time_ranges();
diff --git a/cpp/src/reader/filter/filter.h b/cpp/src/reader/filter/filter.h
index f39dddbae..e53992308 100644
--- a/cpp/src/reader/filter/filter.h
+++ b/cpp/src/reader/filter/filter.h
@@ -63,6 +63,20 @@ class Filter {
         ASSERT(false);
         return nullptr;
     }
+
+    // Batch time filter: evaluate time filter on an array of timestamps.
+    // Writes true/false into @mask for each element.
+    // Returns the number of elements that passed (mask[i] == true).
+    // Default: scalar fallback using satisfy_start_end_time.
+    virtual int satisfy_batch_time(const int64_t* times, int count,
+                                   bool* mask) {
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = satisfy_start_end_time(times[i], times[i]);
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/filter/or_filter.h b/cpp/src/reader/filter/or_filter.h
index fc8d4a2cf..518308982 100644
--- a/cpp/src/reader/filter/or_filter.h
+++ b/cpp/src/reader/filter/or_filter.h
@@ -19,6 +19,8 @@
 #ifndef READER_FILTER_OPERATOR_OR_FILTER_H
 #define READER_FILTER_OPERATOR_OR_FILTER_H
 
+#include <memory>
+
 #include "binary_filter.h"
 // #include "storage/storage_utils.h"
 
@@ -48,6 +50,27 @@ class OrFilter : public BinaryFilter {
                right_->contain_start_end_time(start_time, end_time);
     }
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+        // Inline buffer covers the common per-page BATCH=129 callers; only
+        // out-of-spec larger counts fall back to a heap allocation.
+        constexpr int kInlineCap = 256;
+        bool inline_buf[kInlineCap];
+        std::unique_ptr<bool[]> heap_buf;
+        bool* mask_right = inline_buf;
+        if (count > kInlineCap) {
+            heap_buf.reset(new bool[count]);
+            mask_right = heap_buf.get();
+        }
+        left_->satisfy_batch_time(times, count, mask);
+        right_->satisfy_batch_time(times, count, mask_right);
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = mask[i] || mask_right[i];
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
+
     std::vector<TimeRange*>* get_time_ranges() {
         std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
         std::vector<TimeRange*>* left_time_ranges = left_->get_time_ranges();
diff --git a/cpp/src/reader/filter/time_operator.cc b/cpp/src/reader/filter/time_operator.cc
index 19f33b599..0bb12e4ec 100644
--- a/cpp/src/reader/filter/time_operator.cc
+++ b/cpp/src/reader/filter/time_operator.cc
@@ -18,9 +18,17 @@
  */
 #include "time_operator.h"
 
+#include <cstring>
+
 #include "common/statistic.h"
 #include "utils/storage_utils.h"
 
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#elif defined(ENABLE_SIMD)
+#include "simde/x86/avx2.h"
+#endif
+
 namespace storage {
 
 TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between)
@@ -29,6 +37,15 @@ TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between)
 TimeBetween::~TimeBetween() {}
 
 bool TimeBetween::satisfy(Statistic* statistic) {
+    // An empty inner interval (value1_ > value2_) is unsatisfiable for BETWEEN
+    // (matches nothing) and trivially true for NOT BETWEEN (matches
+    // everything) -- i.e. the answer is exactly not_.  Without this guard the
+    // overlap test below wrongly reports "maybe" for an empty range,
+    // disagreeing with the row-level satisfy() and letting empty/inverted
+    // ranges slip past statistic-level pruning.
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return statistic->end_time_ < value1_ ||
                statistic->start_time_ > value2_;
@@ -47,6 +64,10 @@ bool TimeBetween::satisfy(int64_t time, common::String value) {
 }
 
 bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
+    // Empty inner interval: see satisfy(Statistic*).
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return start_time < value1_ || end_time > value2_;
     } else {
@@ -55,6 +76,10 @@ bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
 }
 
 bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) {
+    // Empty inner interval: see satisfy(Statistic*).
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return end_time < value1_ || start_time > value2_;
     } else {
@@ -64,6 +89,16 @@ bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) {
 
 std::vector<TimeRange*>* TimeBetween::get_time_ranges() {
     std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
+    // Empty inner interval (value1_ > value2_): BETWEEN yields no ranges;
+    // NOT BETWEEN covers the whole timeline.
+    if (value1_ > value2_) {
+        if (not_) {
+            result->push_back(
+                new TimeRange(std::numeric_limits<int64_t>::min(),
+                              std::numeric_limits<int64_t>::max()));
+        }
+        return result;
+    }
     if (not_) {
         if (value1_ != std::numeric_limits<int64_t>::min()) {
             result->push_back(new TimeRange(std::numeric_limits<int64_t>::min(),
@@ -102,11 +137,42 @@ bool TimeIn::satisfy(int64_t time, common::String value) {
 }
 
 bool TimeIn::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
-    return true;
+    // "Could any time in [s, e] satisfy the filter?"
+    // IN({v_i}): true iff some v_i lies in [s, e].
+    // NOT IN: true unless the entire range [s, e] is one point and that
+    // point is in values_; for ranges wider than a single integer there is
+    // always at least one time not in values_, so we're conservative.
+    bool any_in_range = false;
+    for (int64_t v : values_) {
+        if (v >= start_time && v <= end_time) {
+            any_in_range = true;
+            break;
+        }
+    }
+    if (not_) {
+        if (start_time == end_time) return !any_in_range;
+        return true;
+    }
+    return any_in_range;
 }
 
 bool TimeIn::contain_start_end_time(int64_t start_time, int64_t end_time) {
-    return true;
+    // "Do ALL times in [s, e] satisfy the filter?"
+    // IN({v_i}): only when [s,e] collapses to a single point that is in
+    // values_; a sparse IN list can't cover a range otherwise.  Returning
+    // true unconditionally would let the batch fast path skip per-row
+    // filtering and emit every row.
+    // NOT IN: true iff no v_i lies in [s, e].
+    bool any_in_range = false;
+    for (int64_t v : values_) {
+        if (v >= start_time && v <= end_time) {
+            any_in_range = true;
+            break;
+        }
+    }
+    if (not_) return !any_in_range;
+    if (start_time == end_time) return any_in_range;
+    return false;
 }
 
 std::vector<TimeRange*>* TimeIn::get_time_ranges() {
@@ -308,4 +374,269 @@ std::vector<TimeRange*>* TimeLtEq::get_time_ranges() {
     return result;
 }
 
+// ============================================================================
+// SIMD batch time filter implementations
+// ============================================================================
+
+// Helper: extract 4-bit movemask from 256-bit comparison result (4 x i64)
+#if !defined(__ARM_NEON) && defined(ENABLE_SIMD)
+static inline int simd_movemask_epi64(simde__m256i v) {
+    // movemask_pd reinterprets as double and checks sign bit = high bit of each
+    // 64-bit lane
+    return simde_mm256_movemask_pd(simde_mm256_castsi256_pd(v));
+}
+#endif
+
+int TimeGt::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcgtq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time > value_ => cmpgt(time, value_)
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ < times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeGtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcgeq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time >= value_ => NOT(cmpgt(value_, time))
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt);
+        simde__m256i ncmp =
+            simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(ncmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ <= times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeLt::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcltq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time < value_ => cmpgt(value_, time)
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ > times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeLtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcleq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time <= value_ => NOT(cmpgt(time, value_))
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval);
+        simde__m256i ncmp =
+            simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(ncmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ >= times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vceqq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        simde__m256i cmp = simde_mm256_cmpeq_epi64(vt, vval);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ == times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeNotEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    uint64x2_t ones = vdupq_n_u64(UINT64_MAX);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = veorq_u64(vceqq_s64(vt, vval), ones);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        simde__m256i eq = simde_mm256_cmpeq_epi64(vt, vval);
+        simde__m256i neq =
+            simde_mm256_xor_si256(eq, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(neq);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ != times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeBetween::satisfy_batch_time(const int64_t* times, int count,
+                                    bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vlo = vdupq_n_s64(value1_);
+    int64x2_t vhi = vdupq_n_s64(value2_);
+    uint64x2_t ones = vdupq_n_u64(UINT64_MAX);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t ge_lo = vcgeq_s64(vt, vlo);
+        uint64x2_t le_hi = vcleq_s64(vt, vhi);
+        uint64x2_t between = vandq_u64(ge_lo, le_hi);
+        uint64x2_t result = not_ ? veorq_u64(between, ones) : between;
+        mask[i] = vgetq_lane_u64(result, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(result, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vlo = simde_mm256_set1_epi64x(value1_);
+    simde__m256i vhi = simde_mm256_set1_epi64x(value2_);
+    simde__m256i ones = simde_mm256_set1_epi64x((int64_t)-1);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time >= lo => NOT(cmpgt(lo, time))
+        simde__m256i ge_lo =
+            simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vlo, vt), ones);
+        // time <= hi => NOT(cmpgt(time, hi))
+        simde__m256i le_hi =
+            simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vt, vhi), ones);
+        simde__m256i between = simde_mm256_and_si256(ge_lo, le_hi);
+        simde__m256i result =
+            not_ ? simde_mm256_xor_si256(between, ones) : between;
+        int bits = simd_movemask_epi64(result);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        bool in_range = (value1_ <= times[i]) && (times[i] <= value2_);
+        mask[i] = not_ ? !in_range : in_range;
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
 }  // namespace storage
diff --git a/cpp/src/reader/filter/time_operator.h b/cpp/src/reader/filter/time_operator.h
index 29930b88a..f972a4259 100644
--- a/cpp/src/reader/filter/time_operator.h
+++ b/cpp/src/reader/filter/time_operator.h
@@ -47,6 +47,9 @@ class TimeBetween : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -99,6 +102,8 @@ class TimeEq : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -122,6 +127,9 @@ class TimeNotEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -146,6 +154,8 @@ class TimeGt : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -169,6 +179,9 @@ class TimeGtEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     void reset_value(int64_t val) { value_ = val; }
     FilterType get_filter_type() { return type_; }
 
@@ -194,6 +207,8 @@ class TimeLt : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -217,6 +232,9 @@ class TimeLtEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
diff --git a/cpp/src/reader/qds_without_timegenerator.cc b/cpp/src/reader/qds_without_timegenerator.cc
index 474e13b77..b612e5dc2 100644
--- a/cpp/src/reader/qds_without_timegenerator.cc
+++ b/cpp/src/reader/qds_without_timegenerator.cc
@@ -68,7 +68,12 @@ int QDSWithoutTimeGenerator::init_internal(TsFileIOReader* io_reader,
         ret = io_reader_->alloc_ssi(paths[i].device_id_, paths[i].measurement_,
                                     ssi, pa_, global_time_filter);
         if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_DEVICE_NOT_EXIST ||
-            ret == E_NOT_EXIST) {
+            ret == E_NOT_EXIST || ret == E_NO_MORE_DATA) {
+            // Java-aligned: silently skip paths whose device or measurement
+            // doesn't exist in this file. The bloom-filter optimization in
+            // alloc_ssi reports a missing series as E_NO_MORE_DATA, so treat
+            // that the same as the not-found codes.
+            ret = E_OK;
             continue;
         }
         if (ret != E_OK) {
diff --git a/cpp/src/reader/result_set.h b/cpp/src/reader/result_set.h
index 1f1653603..0b73595d4 100644
--- a/cpp/src/reader/result_set.h
+++ b/cpp/src/reader/result_set.h
@@ -162,6 +162,35 @@ class ResultSet : std::enable_shared_from_this<ResultSet> {
         return common::E_INVALID_ARG;
     }
 
+    // Typed direct accessors.  Default implementation routes through the
+    // generic RowRecord / Field path so existing subclasses keep working.
+    // Fast subclasses (TableResultSet) override these to read straight from
+    // the underlying columnar buffer, skipping the per-cell Field round-trip
+    // (and the eager materialization in next()).
+    virtual bool get_bool_at(uint32_t column_index) {
+        return get_row_record()->get_field(column_index - 1)->get_value<bool>();
+    }
+    virtual int32_t get_int32_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<int32_t>();
+    }
+    virtual int64_t get_int64_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<int64_t>();
+    }
+    virtual float get_float_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<float>();
+    }
+    virtual double get_double_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<double>();
+    }
+
     /**
      * @brief Get the row record of the result set
      *
@@ -245,6 +274,29 @@ inline std::tm ResultSet::get_value(uint32_t column_index) {
     return row_record->get_field(column_index)->get_date_value();
 }
 
+// Index-based primitive specializations route to the typed virtual
+// accessors so TableResultSet can serve them without materializing a Field.
+template <>
+inline bool ResultSet::get_value(uint32_t column_index) {
+    return get_bool_at(column_index);
+}
+template <>
+inline int32_t ResultSet::get_value(uint32_t column_index) {
+    return get_int32_at(column_index);
+}
+template <>
+inline int64_t ResultSet::get_value(uint32_t column_index) {
+    return get_int64_at(column_index);
+}
+template <>
+inline float ResultSet::get_value(uint32_t column_index) {
+    return get_float_at(column_index);
+}
+template <>
+inline double ResultSet::get_value(uint32_t column_index) {
+    return get_double_at(column_index);
+}
+
 /**
  * @brief Simple iterator for ResultSet with smart pointers
  */
@@ -306,7 +358,7 @@ inline ResultSetIterator ResultSet::iterator() {
     return ResultSetIterator(this);
 }
 
-static MAYBE_UNUSED void print_table_result_set(
+MAYBE_UNUSED static void print_table_result_set(
     storage::ResultSet* table_result_set) {
     if (table_result_set == nullptr) {
         std::cout << "TableResultSet is nullptr" << std::endl;
diff --git a/cpp/src/reader/table_result_set.cc b/cpp/src/reader/table_result_set.cc
index 81b58ce68..6de093d24 100644
--- a/cpp/src/reader/table_result_set.cc
+++ b/cpp/src/reader/table_result_set.cc
@@ -43,6 +43,16 @@ int TableResultSet::next(bool& has_next) {
 
     int ret = common::E_OK;
 
+    // Advance past the row yielded by the previous next() call, if any.
+    // Row iterator's next() advances all per-column offsets, so on the next
+    // read the vectors point to the new row's data.
+    if (row_ready_) {
+        row_iterator_->next();
+        row_ready_ = false;
+        row_materialized_ = false;
+    }
+
+    // Find the next non-empty TsBlock.
     while (row_iterator_ == nullptr || !row_iterator_->has_next()) {
         if (RET_FAIL(tsblock_reader_->has_next(has_next))) {
             return ret;
@@ -68,23 +78,29 @@ int TableResultSet::next(bool& has_next) {
     }
     if (row_iterator_ == nullptr || !row_iterator_->has_next()) {
         has_next = false;
+        return ret;
     }
 
-    if (has_next && IS_SUCC(ret)) {
-        uint32_t len = 0;
-        bool null = false;
-        row_record_->reset();
-        for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) {
-            const auto value = row_iterator_->read(i, &len, &null);
-            if (!null) {
-                row_record_->get_field(i)->set_value(
-                    row_iterator_->get_data_type(i), value, len, pa_);
-                row_iterator_->next(i);
-            }
+    // A row is now available at row_iterator_'s current row_id_; the per-
+    // column vector offsets are pointing at that row's data.  We do NOT
+    // populate row_record_ here — typed accessors read straight from the
+    // vectors, and get_row_record() lazily materializes on demand.
+    has_next = true;
+    row_ready_ = true;
+    return ret;
+}
+
+void TableResultSet::materialize_current_row() {
+    uint32_t len = 0;
+    bool null = false;
+    row_record_->reset();
+    for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) {
+        const auto value = row_iterator_->read(i, &len, &null);
+        if (!null) {
+            row_record_->get_field(i)->set_value(
+                row_iterator_->get_data_type(i), value, len, pa_);
         }
-        row_iterator_->update_row_id();
     }
-    return ret;
 }
 
 bool TableResultSet::is_null(const std::string& column_name) {
@@ -98,11 +114,57 @@ bool TableResultSet::is_null(const std::string& column_name) {
 
 bool TableResultSet::is_null(uint32_t column_index) {
     ASSERT(1 <= column_index && column_index <= row_record_->get_col_num());
-    return row_record_->get_field(column_index - 1) == nullptr ||
-           row_record_->get_field(column_index - 1)->is_type(common::NULL_TYPE);
+    if (!row_ready_) return true;
+    return row_iterator_->is_null_at(column_index - 1);
+}
+
+// Direct buffer access — skips Vector::read's virtual dispatch.  Caller is
+// expected to have checked is_null() (we still null-guard for safety).
+// For fixed-width primitives the vector keeps its value buffer in
+// values_ and tracks the current row's byte offset in offset_; the
+// element at the active row is simply *(T*)(values_.get_data() + offset_).
+// The ASSERT enforces strict typed access: the requested C++ type must match
+// the column's physical storage width (DATE is int32, not int64).  On a
+// mismatch it fires in debug instead of silently splicing the adjacent cell's
+// bytes into the result.
+#define TSFILE_FAST_PRIMITIVE_READ(TYPE, DFLT)                         \
+    if (!row_ready_) return DFLT;                                      \
+    common::Vector* vec = row_iterator_->get_vector(column_index - 1); \
+    ASSERT(common::TypeMatch<TYPE>(vec->get_vector_type()));           \
+    if (vec->has_null() && vec->is_null(row_iterator_->get_row_id()))  \
+        return DFLT;                                                   \
+    return *reinterpret_cast<TYPE*>(vec->get_value_data().get_data() + \
+                                    vec->get_offset())
+
+bool TableResultSet::get_bool_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(bool, false);
 }
 
-RowRecord* TableResultSet::get_row_record() { return row_record_; }
+int32_t TableResultSet::get_int32_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(int32_t, 0);
+}
+
+int64_t TableResultSet::get_int64_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(int64_t, 0);
+}
+
+float TableResultSet::get_float_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(float, 0.0f);
+}
+
+double TableResultSet::get_double_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(double, 0.0);
+}
+
+#undef TSFILE_FAST_PRIMITIVE_READ
+
+RowRecord* TableResultSet::get_row_record() {
+    if (row_ready_ && !row_materialized_) {
+        materialize_current_row();
+        row_materialized_ = true;
+    }
+    return row_record_;
+}
 
 std::shared_ptr<ResultSetMetadata> TableResultSet::get_metadata() {
     return result_set_metadata_;
@@ -138,7 +200,13 @@ int TableResultSet::get_next_tsblock(common::TsBlock*& block) {
 }
 
 void TableResultSet::close() {
-    tsblock_reader_->close();
+    if (closed_) {
+        return;
+    }
+    closed_ = true;
+    if (tsblock_reader_) {
+        tsblock_reader_->close();
+    }
     pa_.destroy();
     if (row_record_) {
         delete row_record_;
@@ -150,4 +218,4 @@ void TableResultSet::close() {
     }
 }
 
-}  // namespace storage
\ No newline at end of file
+}  // namespace storage
diff --git a/cpp/src/reader/table_result_set.h b/cpp/src/reader/table_result_set.h
index 072a63f6f..d92072934 100644
--- a/cpp/src/reader/table_result_set.h
+++ b/cpp/src/reader/table_result_set.h
@@ -48,8 +48,23 @@ class TableResultSet : public ResultSet {
     void close() override;
     int get_next_tsblock(common::TsBlock*& block) override;
 
+    // Fast typed accessors: read straight from the current TsBlock vector
+    // without going through RowRecord/Field.  Caller is expected to have
+    // checked is_null() — when the cell is null the underlying buffer pointer
+    // is nullptr and these return a default (0 / 0.0 / false) without
+    // dereferencing it.
+    bool get_bool_at(uint32_t column_index) override;
+    int32_t get_int32_at(uint32_t column_index) override;
+    int64_t get_int64_at(uint32_t column_index) override;
+    float get_float_at(uint32_t column_index) override;
+    double get_double_at(uint32_t column_index) override;
+
    private:
     void init();
+    // Lazy materialization: fill row_record_ from the current row when a
+    // caller actually requests the RowRecord (or a non-fast accessor).
+    void materialize_current_row();
+
     std::unique_ptr<TsBlockReader> tsblock_reader_;
     common::RowIterator* row_iterator_ = nullptr;
     common::TsBlock* tsblock_ = nullptr;
@@ -58,6 +73,11 @@ class TableResultSet : public ResultSet {
     std::vector<std::string> column_names_;
     std::vector<common::TSDataType> data_types_;
     const int return_mode_;
+    bool closed_ = false;
+    // True when row_iterator_ points at a row that hasn't been consumed yet.
+    bool row_ready_ = false;
+    // True when row_record_ has been populated for the current row.
+    bool row_materialized_ = false;
 };
 }  // namespace storage
-#endif  // TABLE_RESULT_SET_H
\ No newline at end of file
+#endif  // TABLE_RESULT_SET_H
diff --git a/cpp/src/reader/task/device_query_task.cc b/cpp/src/reader/task/device_query_task.cc
index c7e7091ff..6345c93fa 100644
--- a/cpp/src/reader/task/device_query_task.cc
+++ b/cpp/src/reader/task/device_query_task.cc
@@ -19,6 +19,8 @@
 
 #include "reader/task/device_query_task.h"
 
+#include "common/tsfile_common.h"
+
 namespace storage {
 DeviceQueryTask* DeviceQueryTask::create_device_query_task(
     std::shared_ptr<IDeviceID> device_id, std::vector<std::string> column_names,
@@ -34,8 +36,14 @@ DeviceQueryTask* DeviceQueryTask::create_device_query_task(
 }
 
 DeviceQueryTask::~DeviceQueryTask() {
-    if (index_root_) {
+    // index_root_ was placement-new'd into DeviceMetaIterator's PageArena and
+    // ownership transferred here via DeviceMetaIterator::next; the arena only
+    // frees raw bytes, so we must invoke the destructor explicitly to release
+    // the heap-allocated children_ vector and its nested shared_ptr graph
+    // (DeviceMetaIndexEntry -> StringArrayDeviceID).
+    if (index_root_ != nullptr) {
         index_root_->~MetaIndexNode();
+        index_root_ = nullptr;
     }
 }
 
diff --git a/cpp/src/reader/task/device_task_iterator.cc b/cpp/src/reader/task/device_task_iterator.cc
index dbe763303..e22fefb06 100644
--- a/cpp/src/reader/task/device_task_iterator.cc
+++ b/cpp/src/reader/task/device_task_iterator.cc
@@ -37,6 +37,9 @@ int DeviceTaskIterator::next(DeviceQueryTask*& task) {
         task = DeviceQueryTask::create_device_query_task(
             device_meta_pair.first, column_names_, column_mapping_,
             device_meta_pair.second, table_schema_, pa_);
+        if (task != nullptr) {
+            created_tasks_.push_back(task);
+        }
     }
     return ret;
 }
diff --git a/cpp/src/reader/task/device_task_iterator.h b/cpp/src/reader/task/device_task_iterator.h
index 061711c17..cc5a75562 100644
--- a/cpp/src/reader/task/device_task_iterator.h
+++ b/cpp/src/reader/task/device_task_iterator.h
@@ -58,7 +58,17 @@ class DeviceTaskIterator {
         pa_.init(512, common::MOD_DEVICE_TASK_ITER);
     }
 
-    ~DeviceTaskIterator() { pa_.destroy(); }
+    ~DeviceTaskIterator() {
+        // The tasks are placement-new'd into pa_ memory; pa_.destroy() only
+        // releases the raw bytes, so we must call their destructors here to
+        // release the heap-allocated members (std::vector<std::string>,
+        // shared_ptr's, etc.) they own.
+        for (DeviceQueryTask* t : created_tasks_) {
+            t->~DeviceQueryTask();
+        }
+        created_tasks_.clear();
+        pa_.destroy();
+    }
 
     void flush_remaining_device_meta_cache();
 
@@ -72,6 +82,7 @@ class DeviceTaskIterator {
     std::unique_ptr<DeviceMetaIterator> device_meta_iterator_;
     std::shared_ptr<TableSchema> table_schema_;
     common::PageArena pa_;
+    std::vector<DeviceQueryTask*> created_tasks_;
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/tsfile_reader.cc b/cpp/src/reader/tsfile_reader.cc
index 8d9d9b5dc..540674f33 100644
--- a/cpp/src/reader/tsfile_reader.cc
+++ b/cpp/src/reader/tsfile_reader.cc
@@ -94,8 +94,7 @@ namespace storage {
 TsFileReader::TsFileReader()
     : read_file_(nullptr),
       tsfile_executor_(nullptr),
-      table_query_executor_(nullptr),
-      table_query_executor_batch_size_(0) {
+      table_query_executor_(nullptr) {
     tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
 }
 
@@ -113,6 +112,22 @@ int TsFileReader::open(const std::string& file_path) {
     return ret;
 }
 
+int TsFileReader::ensure_table_query_executor(int batch_size) {
+    if (table_query_executor_ != nullptr &&
+        table_query_executor_batch_size_ == batch_size) {
+        return E_OK;
+    }
+
+    if (table_query_executor_ != nullptr) {
+        delete table_query_executor_;
+        table_query_executor_ = nullptr;
+    }
+
+    table_query_executor_ = new TableQueryExecutor(read_file_, batch_size);
+    table_query_executor_batch_size_ = batch_size;
+    return E_OK;
+}
+
 int TsFileReader::close() {
     int ret = E_OK;
     if (tsfile_executor_ != nullptr) {
@@ -123,7 +138,6 @@ int TsFileReader::close() {
         delete table_query_executor_;
         table_query_executor_ = nullptr;
     }
-    table_query_executor_batch_size_ = 0;
     if (read_file_ != nullptr) {
         read_file_->close();
         delete read_file_;
@@ -132,22 +146,6 @@ int TsFileReader::close() {
     return ret;
 }
 
-int TsFileReader::ensure_table_query_executor(int batch_size) {
-    if (table_query_executor_ != nullptr &&
-        table_query_executor_batch_size_ == batch_size) {
-        return E_OK;
-    }
-
-    if (table_query_executor_ != nullptr) {
-        delete table_query_executor_;
-        table_query_executor_ = nullptr;
-    }
-
-    table_query_executor_ = new TableQueryExecutor(read_file_, batch_size);
-    table_query_executor_batch_size_ = batch_size;
-    return E_OK;
-}
-
 int TsFileReader::query(QueryExpression* qe, ResultSet*& ret_qds) {
     return tsfile_executor_->execute(qe, ret_qds);
 }
@@ -411,16 +409,21 @@ int TsFileReader::get_timeseries_schema(
                          device_id, timeseries_indexs, pa))) {
     } else {
         for (auto timeseries_index : timeseries_indexs) {
-            auto* aligned_timeseries_index =
-                dynamic_cast<AlignedTimeseriesIndex*>(timeseries_index);
-            auto data_type =
-                aligned_timeseries_index != nullptr &&
-                        aligned_timeseries_index->value_ts_idx_ != nullptr
-                    ? aligned_timeseries_index->value_ts_idx_->get_data_type()
-                    : timeseries_index->get_data_type();
+            // AlignedTimeseriesIndex::get_data_type() returns the time
+            // column type (VECTOR) so the aligned/non-aligned dispatch in
+            // SSI can keep using the existing accessor.  For schema
+            // exposure we need the actual value column type — without this
+            // unwrap, INT32/FLOAT/... would all surface as VECTOR.
+            common::TSDataType dt = timeseries_index->get_data_type();
+            if (dt == common::VECTOR) {
+                auto* aligned =
+                    dynamic_cast<AlignedTimeseriesIndex*>(timeseries_index);
+                if (aligned != nullptr && aligned->value_ts_idx_ != nullptr) {
+                    dt = aligned->value_ts_idx_->get_data_type();
+                }
+            }
             MeasurementSchema ms(
-                timeseries_index->get_measurement_name().to_std_string(),
-                data_type);
+                timeseries_index->get_measurement_name().to_std_string(), dt);
             result.push_back(ms);
         }
     }
@@ -448,6 +451,15 @@ int TsFileReader::get_timeseries_metadata_impl(
 
 DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata(
     const std::vector<std::shared_ptr<IDeviceID>>& device_ids) {
+    // Reset the shared meta arena up front: every call writes fresh
+    // timeseries-index metadata into it via _impl(), and the previous
+    // implementation only ever appended.  A long-lived reader that repeats
+    // this query would grow tsfile_reader_meta_pa_ without bound (each call
+    // duplicates the per-device payload).  Callers that need to retain prior
+    // results past this call must copy them out before invoking again — the
+    // shared_ptrs handed back use a noop deleter pointing into this arena.
+    tsfile_reader_meta_pa_.destroy();
+    tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
     DeviceTimeseriesMetadataMap result;
     for (const auto& device_id : device_ids) {
         std::vector<std::shared_ptr<ITimeseriesIndex>> list;
@@ -466,6 +478,10 @@ DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata() {
         return result;
     }
 
+    // Same arena-reset rationale as the device_ids overload above.
+    tsfile_reader_meta_pa_.destroy();
+    tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
+
     PageArena pa;
     pa.init(512, MOD_TSFILE_READER);
     std::vector<DeviceMetaEntry> entries;
diff --git a/cpp/src/reader/tsfile_reader.h b/cpp/src/reader/tsfile_reader.h
index 19d83ec61..e2f9f3496 100644
--- a/cpp/src/reader/tsfile_reader.h
+++ b/cpp/src/reader/tsfile_reader.h
@@ -143,7 +143,6 @@ class TsFileReader {
      * @param offset         Number of leading rows to skip (>= 0).
      * @param limit          Maximum rows to return. < 0 means unlimited.
      * @param[out] result_set  The result set containing query results.
-     * @param tag_filter     Optional tag filter for filtering by tag columns.
      * @return Returns 0 on success, or a non-zero error code on failure.
      */
     int queryByRow(const std::string& table_name,
@@ -243,8 +242,10 @@ class TsFileReader {
     storage::ReadFile* read_file_;
     storage::TsFileExecutor* tsfile_executor_;
     storage::TableQueryExecutor* table_query_executor_;
-    int table_query_executor_batch_size_;
+    int table_query_executor_batch_size_ = -1;
     common::PageArena tsfile_reader_meta_pa_;
+    // Test-only hook for the unbounded-arena-growth regression check.
+    friend class TsFileReaderMetaArenaTest;
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc
index 1d666bfc0..eb41c1f40 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.cc
+++ b/cpp/src/reader/tsfile_series_scan_iterator.cc
@@ -19,6 +19,13 @@
 
 #include "reader/tsfile_series_scan_iterator.h"
 
+#include <iostream>
+
+#include "common/global.h"
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
+
 using namespace common;
 
 namespace storage {
@@ -26,6 +33,11 @@ namespace storage {
 void TsFileSeriesScanIterator::destroy() {
     timeseries_index_pa_.destroy();
     if (chunk_reader_ != nullptr) {
+        // destroy() already runs manual destructors on internal members
+        // (chunk_header_, decoders, compressor, ...), so calling
+        // chunk_reader_->~IChunkReader() here would double-destruct them.
+        // The vector-buffer leaks (e.g. chunk_pages_) are released inside
+        // AlignedChunkReader::destroy() via vector<>{}.swap().
         chunk_reader_->destroy();
         common::mem_free(chunk_reader_);
         chunk_reader_ = nullptr;
@@ -65,20 +77,24 @@ bool TsFileSeriesScanIterator::should_skip_aligned_chunk_by_offset(
     if (row_offset_ <= 0) {
         return false;
     }
-    if (time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) {
+    // Aligned value chunks' statistic_->count_ only counts non-null rows,
+    // not total rows.  Using value_cm alone could skip an entire 100-row
+    // chunk for an offset of 10 just because it has 10 non-null values.
+    // Only apply the whole-chunk shortcut when time and value statistics
+    // agree on the row count (i.e. no sparse nulls in this chunk); fall
+    // through to per-page/per-row handling otherwise so the offset is
+    // applied against the real row stream.
+    if (time_cm == nullptr || value_cm == nullptr ||
+        time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) {
         return false;
     }
     int32_t tc = time_cm->statistic_->count_;
     int32_t vc = value_cm->statistic_->count_;
-    if (tc <= 0 || vc <= 0) {
-        return false;
-    }
-    if (tc != vc) {
+    if (tc <= 0 || vc <= 0 || tc != vc) {
         return false;
     }
-    int32_t count = tc;
-    if (row_offset_ >= count) {
-        row_offset_ -= count;
+    if (row_offset_ >= tc) {
+        row_offset_ -= tc;
         return true;
     }
     return false;
@@ -91,74 +107,104 @@ int TsFileSeriesScanIterator::get_next(TsBlock*& ret_tsblock, bool alloc,
     Filter* filter =
         (oneshoot_filter != nullptr) ? oneshoot_filter : time_filter_;
 
+    // When get_next_page() reports E_NO_MORE_DATA but the chunk reader
+    // still claims has_more_data() (an aligned-chunk artifact where time
+    // and value pages report state differently), a bare `continue` would
+    // retry the exhausted chunk forever.  Force the next iteration to
+    // advance to the next chunk-meta cursor instead.
     bool force_load_next_chunk = false;
     while (true) {
-        // When get_next_page() reports no more data for the current chunk but
-        // metadata still lists more chunks, we must load the next chunk. A
-        // bare continue would retry the exhausted reader forever if
-        // has_more_data() still returns true (e.g. aligned chunk state).
         if (!chunk_reader_->has_more_data() || force_load_next_chunk) {
             force_load_next_chunk = false;
             while (true) {
                 if (!has_next_chunk()) {
                     return E_NO_MORE_DATA;
+                } else if (is_multi_value_) {
+                    // Multi-value aligned path
+                    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+                    std::vector<ChunkMeta*> value_cms;
+                    value_cms.reserve(value_chunk_meta_cursors_.size());
+                    for (auto& cur : value_chunk_meta_cursors_) {
+                        value_cms.push_back(cur.get());
+                    }
+                    advance_to_next_chunk();
+                    // Skip chunk by time filter using time chunk statistics.
+                    if (filter != nullptr && time_cm->statistic_ != nullptr &&
+                        !filter->satisfy(time_cm->statistic_)) {
+                        continue;
+                    }
+                    if (should_skip_chunk_by_time(time_cm, min_time_hint)) {
+                        continue;
+                    }
+                    chunk_reader_->reset();
+                    auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+                    if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm,
+                                                                 value_cms))) {
+                    }
+                    break;
+                } else if (!is_aligned_) {
+                    ChunkMeta* cm = get_current_chunk_meta();
+                    advance_to_next_chunk();
+                    if (filter != nullptr && cm->statistic_ != nullptr &&
+                        !filter->satisfy(cm->statistic_)) {
+                        continue;
+                    }
+                    // Skip by min_time_hint (merge cursor).
+                    if (should_skip_chunk_by_time(cm, min_time_hint)) {
+                        continue;
+                    }
+                    // Single-path: skip entire chunk by offset using count.
+                    if (should_skip_chunk_by_offset(cm)) {
+                        continue;
+                    }
+                    chunk_reader_->reset();
+                    if (RET_FAIL(chunk_reader_->load_by_meta(cm))) {
+                    }
+                    break;
                 } else {
-                    if (!is_aligned_) {
-                        ChunkMeta* cm = get_current_chunk_meta();
-                        advance_to_next_chunk();
-                        // Skip by time filter.
-                        if (filter != nullptr && cm->statistic_ != nullptr &&
-                            !filter->satisfy(cm->statistic_)) {
-                            continue;
-                        }
-                        // Skip by min_time_hint (merge cursor).
-                        if (should_skip_chunk_by_time(cm, min_time_hint)) {
-                            continue;
-                        }
-                        // Single-path: skip entire chunk by offset using count.
-                        if (should_skip_chunk_by_offset(cm)) {
-                            continue;
-                        }
-                        chunk_reader_->reset();
-                        if (RET_FAIL(chunk_reader_->load_by_meta(cm))) {
-                        }
-                        break;
-                    } else {
-                        ChunkMeta* value_cm = value_chunk_meta_cursor_.get();
-                        ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
-                        advance_to_next_chunk();
-                        if (filter != nullptr &&
-                            value_cm->statistic_ != nullptr &&
-                            !filter->satisfy(value_cm->statistic_)) {
-                            continue;
-                        }
-                        if (should_skip_chunk_by_time(value_cm,
-                                                      min_time_hint)) {
-                            continue;
-                        }
-                        if (should_skip_aligned_chunk_by_offset(time_cm,
-                                                                value_cm)) {
-                            continue;
-                        }
-                        chunk_reader_->reset();
-                        if (RET_FAIL(chunk_reader_->load_by_aligned_meta(
-                                time_cm, value_cm))) {
-                        }
-                        break;
+                    ChunkMeta* value_cm = value_chunk_meta_cursor_.get();
+                    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+                    advance_to_next_chunk();
+                    // Use time chunk statistics for time-based filtering.
+                    ChunkMeta* filter_cm =
+                        (time_cm->statistic_ != nullptr) ? time_cm : value_cm;
+                    if (filter != nullptr && filter_cm->statistic_ != nullptr &&
+                        !filter->satisfy(filter_cm->statistic_)) {
+                        continue;
+                    }
+                    if (should_skip_chunk_by_time(filter_cm, min_time_hint)) {
+                        continue;
+                    }
+                    if (should_skip_aligned_chunk_by_offset(time_cm,
+                                                            value_cm)) {
+                        continue;
                     }
+                    chunk_reader_->reset();
+                    if (RET_FAIL(chunk_reader_->load_by_aligned_meta(
+                            time_cm, value_cm))) {
+                    }
+                    break;
                 }
             }
         }
         if (IS_SUCC(ret)) {
             if (alloc && ret_tsblock == nullptr) {
-                ret_tsblock = alloc_tsblock();
+                ret_tsblock =
+                    is_multi_value_ ? alloc_tsblock_multi() : alloc_tsblock();
             }
             ret = chunk_reader_->get_next_page(ret_tsblock, filter, *data_pa_,
                                                min_time_hint, row_offset_,
                                                row_limit_);
         }
+        if (ret == common::E_NO_MORE_DATA && ret_tsblock != nullptr &&
+            ret_tsblock->get_row_count() > 0) {
+            return E_OK;
+        }
         // When current chunk is exhausted (e.g. all pages skipped by offset)
-        // but there are more chunks, load next chunk and retry.
+        // but there are more chunks, load next chunk and retry.  Set the
+        // force flag so the next iteration bypasses has_more_data() (which
+        // can still report true on an aligned chunk that has actually
+        // yielded all its rows).
         if (ret == common::E_NO_MORE_DATA && has_next_chunk()) {
             ret = E_OK;
             force_load_next_chunk = true;
@@ -179,9 +225,19 @@ void TsFileSeriesScanIterator::revert_tsblock() {
 int TsFileSeriesScanIterator::init_chunk_reader() {
     int ret = E_OK;
     is_aligned_ = itimeseries_index_->is_aligned();
+
+    // Check if this is a multi-value aligned index. alloc_multi_ssi() creates
+    // MultiAlignedTimeseriesIndex even when the query selects one value column,
+    // so keep that path consistent with wider aligned reads.
+    if (is_aligned_ && dynamic_cast<MultiAlignedTimeseriesIndex*>(
+                           itimeseries_index_) != nullptr) {
+        return init_chunk_reader_multi();
+    }
+
     if (!is_aligned_) {
         void* buf =
             common::mem_alloc(sizeof(ChunkReader), common::MOD_CHUNK_READER);
+        if (IS_NULL(buf)) return E_OOM;
         chunk_reader_ = new (buf) ChunkReader;
         chunk_meta_cursor_ = itimeseries_index_->get_chunk_meta_list()->begin();
         if (RET_FAIL(chunk_reader_->init(
@@ -191,6 +247,7 @@ int TsFileSeriesScanIterator::init_chunk_reader() {
     } else {
         void* buf = common::mem_alloc(sizeof(AlignedChunkReader),
                                       common::MOD_CHUNK_READER);
+        if (IS_NULL(buf)) return E_OOM;
         chunk_reader_ = new (buf) AlignedChunkReader;
         time_chunk_meta_cursor_ =
             itimeseries_index_->get_time_chunk_meta_list()->begin();
@@ -205,6 +262,96 @@ int TsFileSeriesScanIterator::init_chunk_reader() {
     return ret;
 }
 
+int TsFileSeriesScanIterator::init_chunk_reader_multi() {
+    int ret = E_OK;
+    is_multi_value_ = true;
+
+    void* buf =
+        common::mem_alloc(sizeof(AlignedChunkReader), common::MOD_CHUNK_READER);
+    if (IS_NULL(buf)) {
+        // The single-value path (init_chunk_reader) silently dereferenced
+        // the null pointer on OOM; this path is new in the multi-value
+        // reader work and would do the same via placement-new(nullptr) →
+        // undefined behavior the moment any AlignedChunkReader field is
+        // touched.  Surface E_OOM instead.
+        is_multi_value_ = false;
+        return E_OOM;
+    }
+    auto* acr = new (buf) AlignedChunkReader;
+    chunk_reader_ = acr;
+
+    uint32_t num_cols = itimeseries_index_->get_value_column_count();
+#ifdef ENABLE_THREADS
+    // Borrow the single process-wide worker pool (created in init_common()) for
+    // multi-column decode.  Null when libtsfile_init() hasn't run; combined
+    // with parallel_read_enabled_ this gates the parallel decode path — the
+    // reader falls back to serial decode otherwise.
+    if (num_cols > 1 && common::g_config_value_.parallel_read_enabled_ &&
+        common::g_thread_pool_ != nullptr) {
+        acr->set_decode_pool(common::g_thread_pool_);
+    }
+#endif
+
+    // Per-column chunk lists must align 1:1 with the time chunk list:
+    // load_by_aligned_meta_multi pairs them by index and the downstream
+    // reader has no notion of a "missing" value chunk for a CGM.  If a
+    // file evolved its schema and some column has fewer (or more) chunks
+    // than the time column, naive index pairing would mate chunks from
+    // different chunk groups, returning garbage and dereferencing past
+    // end() once the shorter list ran out.  Refuse upfront with a clear
+    // error rather than producing wrong data.
+    uint32_t time_chunk_count =
+        itimeseries_index_->get_time_chunk_meta_list()->size();
+    for (uint32_t c = 0; c < num_cols; c++) {
+        if (itimeseries_index_->get_value_chunk_meta_list(c)->size() !=
+            time_chunk_count) {
+            return E_NOT_SUPPORT;
+        }
+    }
+
+    // Init time cursor
+    time_chunk_meta_cursor_ =
+        itimeseries_index_->get_time_chunk_meta_list()->begin();
+
+    // Init all value cursors
+    value_chunk_meta_cursors_.resize(num_cols);
+    for (uint32_t c = 0; c < num_cols; c++) {
+        value_chunk_meta_cursors_[c] =
+            itimeseries_index_->get_value_chunk_meta_list(c)->begin();
+    }
+
+    // Init chunk reader
+    if (RET_FAIL(
+            acr->init(read_file_, itimeseries_index_->get_measurement_name(),
+                      itimeseries_index_->get_data_type(), time_filter_))) {
+        return ret;
+    }
+
+    // No chunks → nothing to load; iteration short-circuits via
+    // has_next_chunk() returning false.
+    if (time_chunk_count == 0) {
+        return ret;
+    }
+
+    // Load first chunk set
+    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+    std::vector<ChunkMeta*> value_cms;
+    value_cms.reserve(num_cols);
+    for (uint32_t c = 0; c < num_cols; c++) {
+        value_cms.push_back(value_chunk_meta_cursors_[c].get());
+    }
+
+    if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm, value_cms))) {
+        return ret;
+    }
+
+    // Advance cursors
+    time_chunk_meta_cursor_++;
+    for (auto& cur : value_chunk_meta_cursors_) cur++;
+
+    return ret;
+}
+
 TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     ChunkHeader& ch = chunk_reader_->get_chunk_header();
 
@@ -225,4 +372,29 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     return tsblock_;
 }
 
-}  // end namespace storage
\ No newline at end of file
+TsBlock* TsFileSeriesScanIterator::alloc_tsblock_multi() {
+    auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+
+    // Time column
+    ColumnSchema time_cd("time", common::INT64, common::SNAPPY,
+                         common::TS_2DIFF);
+    tuple_desc_.push_back(time_cd);
+
+    // Value columns
+    uint32_t num_cols = acr->get_value_column_count();
+    for (uint32_t c = 0; c < num_cols; c++) {
+        ChunkHeader& ch = acr->get_value_chunk_header(c);
+        ColumnSchema value_cd(ch.measurement_name_, ch.data_type_,
+                              ch.compression_type_, ch.encoding_type_);
+        tuple_desc_.push_back(value_cd);
+    }
+
+    tsblock_ = new TsBlock(&tuple_desc_);
+    if (E_OK != tsblock_->init()) {
+        delete tsblock_;
+        tsblock_ = nullptr;
+    }
+    return tsblock_;
+}
+
+}  // end namespace storage
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.h b/cpp/src/reader/tsfile_series_scan_iterator.h
index 9e790a3d1..77037d8e1 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.h
+++ b/cpp/src/reader/tsfile_series_scan_iterator.h
@@ -50,6 +50,7 @@ class TsFileSeriesScanIterator {
           tsblock_(nullptr),
           time_filter_(nullptr),
           is_aligned_(false),
+          is_multi_value_(false),
           row_offset_(0),
           row_limit_(-1) {}
     ~TsFileSeriesScanIterator() { destroy(); }
@@ -93,11 +94,42 @@ class TsFileSeriesScanIterator {
                  int64_t min_time_hint = std::numeric_limits<int64_t>::min());
     void revert_tsblock();
 
+    // Multi-value: number of value columns in the TsBlock
+    uint32_t get_value_column_count() const {
+        if (is_multi_value_ && chunk_reader_) {
+            auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+            return acr->get_value_column_count();
+        }
+        return 1;
+    }
+
+    bool is_multi_value() const { return is_multi_value_; }
+
     friend class TsFileIOReader;
 
    private:
     int init_chunk_reader();
+    int init_chunk_reader_multi();
     FORCE_INLINE bool has_next_chunk() const {
+        if (is_multi_value_) {
+            // Anchor on the time chunk list and require every value column
+            // to still have a chunk available.  Checking only value[0] used
+            // to read past end() for columns with fewer chunks (e.g. a
+            // column added after some chunk groups had already been
+            // flushed), which dereferenced freed memory and paired the
+            // wrong time/value chunks.
+            if (time_chunk_meta_cursor_ ==
+                itimeseries_index_->get_time_chunk_meta_list()->end()) {
+                return false;
+            }
+            for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) {
+                if (value_chunk_meta_cursors_[c] ==
+                    itimeseries_index_->get_value_chunk_meta_list(c)->end()) {
+                    return false;
+                }
+            }
+            return true;
+        }
         if (is_aligned_) {
             return value_chunk_meta_cursor_ !=
                    itimeseries_index_->get_value_chunk_meta_list()->end();
@@ -107,7 +139,21 @@ class TsFileSeriesScanIterator {
         }
     }
     FORCE_INLINE void advance_to_next_chunk() {
-        if (is_aligned_) {
+        if (is_multi_value_) {
+            // Guard each cursor against advancing past end().  Same defense
+            // as has_next_chunk(): per-column chunk counts can diverge in
+            // files with schema evolution.
+            auto time_end =
+                itimeseries_index_->get_time_chunk_meta_list()->end();
+            if (time_chunk_meta_cursor_ != time_end) time_chunk_meta_cursor_++;
+            for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) {
+                auto end =
+                    itimeseries_index_->get_value_chunk_meta_list(c)->end();
+                if (value_chunk_meta_cursors_[c] != end) {
+                    value_chunk_meta_cursors_[c]++;
+                }
+            }
+        } else if (is_aligned_) {
             time_chunk_meta_cursor_++;
             value_chunk_meta_cursor_++;
         } else {
@@ -119,15 +165,10 @@ class TsFileSeriesScanIterator {
     }
     bool should_skip_chunk_by_time(ChunkMeta* cm, int64_t min_time_hint);
     bool should_skip_chunk_by_offset(ChunkMeta* cm);
-    /**
-     * Aligned (VECTOR): whole-chunk skip by row count is only safe when the
-     * time ChunkMeta and value ChunkMeta agree on statistic count (>0). If
-     * either side lacks count or counts differ, skip is disabled for this
-     * chunk; pages are loaded and page/row-level offset handling applies.
-     */
     bool should_skip_aligned_chunk_by_offset(ChunkMeta* time_cm,
                                              ChunkMeta* value_cm);
     common::TsBlock* alloc_tsblock();
+    common::TsBlock* alloc_tsblock_multi();
 
    private:
     ReadFile* read_file_;
@@ -140,12 +181,16 @@ class TsFileSeriesScanIterator {
     common::SimpleList<ChunkMeta*>::Iterator chunk_meta_cursor_;
     common::SimpleList<ChunkMeta*>::Iterator time_chunk_meta_cursor_;
     common::SimpleList<ChunkMeta*>::Iterator value_chunk_meta_cursor_;
+    // Multi-value: one cursor per value column
+    std::vector<common::SimpleList<ChunkMeta*>::Iterator>
+        value_chunk_meta_cursors_;
     IChunkReader* chunk_reader_;
 
     common::TupleDesc tuple_desc_;
     common::TsBlock* tsblock_;
     Filter* time_filter_;
     bool is_aligned_ = false;
+    bool is_multi_value_ = false;
     int row_offset_;
     int row_limit_;
 };
diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h
index 4ffc4d138..b3cb1943e 100644
--- a/cpp/src/utils/db_utils.h
+++ b/cpp/src/utils/db_utils.h
@@ -195,8 +195,6 @@ struct ColumnSchema {
 };
 
 FORCE_INLINE int64_t get_cur_timestamp() {
-    // Milliseconds since the Unix epoch. Uses the C++11 standard library so it
-    // is portable across platforms (gettimeofday is not available on MSVC).
     return std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::system_clock::now().time_since_epoch())
         .count();
diff --git a/cpp/src/writer/chunk_writer.cc b/cpp/src/writer/chunk_writer.cc
index da1811336..acdb4951d 100644
--- a/cpp/src/writer/chunk_writer.cc
+++ b/cpp/src/writer/chunk_writer.cc
@@ -138,6 +138,9 @@ int ChunkWriter::seal_cur_page(bool end_chunk) {
 void ChunkWriter::save_first_page_data(PageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // See ValueChunkWriter::save_first_page_data: avoid double-free on the
+    // shallow-copied buffer pointers.
+    first_page_writer.release_cur_page_data();
 }
 
 int ChunkWriter::write_first_page_data(ByteStream& pages_data,
diff --git a/cpp/src/writer/chunk_writer.h b/cpp/src/writer/chunk_writer.h
index 6eb3f5418..a65f0537f 100644
--- a/cpp/src/writer/chunk_writer.h
+++ b/cpp/src/writer/chunk_writer.h
@@ -103,6 +103,68 @@ class ChunkWriter {
         CW_DO_WRITE_FOR_TYPE();
     }
 
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = page_writer_.get_point_numer();
+            // Seal whenever cur_points is at or past the cap; the counter is
+            // size_ (rows including the just-written batch) and may exceed
+            // page_cap, so a plain subtraction would underflow uint32_t.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(page_writer_.write_batch(
+                    timestamps + offset, values + offset, batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets, uint32_t start_idx,
+                           uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = page_writer_.get_point_numer();
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(page_writer_.write_string_batch(
+                    timestamps + offset, buffer, offsets, start_idx + offset,
+                    batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
     int end_encode_chunk();
     common::ByteStream& get_chunk_data() { return chunk_data_; }
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
diff --git a/cpp/src/writer/page_writer.cc b/cpp/src/writer/page_writer.cc
index 7766e14c4..eebe5b400 100644
--- a/cpp/src/writer/page_writer.cc
+++ b/cpp/src/writer/page_writer.cc
@@ -126,6 +126,11 @@ void PageWriter::reset() {
     }
     time_out_stream_.reset();
     value_out_stream_.reset();
+    // Without this, a page that was poisoned by a mid-batch encode failure
+    // would stay refused forever even after ChunkWriter calls reset() to
+    // start a fresh page — `partial_failure_` would still be true and
+    // write_to_chunk() would return E_DATA_INCONSISTENCY indefinitely.
+    partial_failure_ = false;
 }
 
 void PageWriter::destroy() {
@@ -156,6 +161,14 @@ int PageWriter::write_to_chunk(ByteStream& pages_data, bool write_header,
               << pages_data.total_size() << " of chunk_data." << std::endl;
 #endif
     int ret = E_OK;
+    // Refuse to seal a page whose time and value streams diverged because of
+    // a mid-batch encode failure (see PageWriter::write_batch).  The higher
+    // layer (TsFileWriter::unrecoverable_) is the authoritative place to
+    // surface this to the caller; this guard prevents a misaligned page from
+    // ever entering the chunk stream.
+    if (UNLIKELY(partial_failure_)) {
+        return common::E_DATA_INCONSISTENCY;
+    }
     if (RET_FAIL(prepare_end_page())) {
         return ret;
     }
diff --git a/cpp/src/writer/page_writer.h b/cpp/src/writer/page_writer.h
index d3966d865..47c958913 100644
--- a/cpp/src/writer/page_writer.h
+++ b/cpp/src/writer/page_writer.h
@@ -150,10 +150,63 @@ class PageWriter {
         PW_DO_WRITE_FOR_TYPE();
     }
 
+    template <typename T>
+    FORCE_INLINE int write_batch(const int64_t* timestamps, const T* values,
+                                 uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY;
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+            // Time stream wasn't advanced (encode_batch is atomic w.r.t. the
+            // stream cursor on failure for these encoders) — leave the page
+            // intact so the caller can retry.
+        } else if (RET_FAIL(value_encoder_->encode_batch(values, count,
+                                                         value_out_stream_))) {
+            // Time stream already advanced; we can't roll it back here.
+            // Mark the page poisoned so write_to_chunk() refuses to seal a
+            // page where time and value rows are out of sync.
+            partial_failure_ = true;
+        } else {
+            statistic_->update_batch(timestamps, values, count);
+        }
+        return ret;
+    }
+
+    // Batch write strings from Arrow-style offset+buffer layout.
+    FORCE_INLINE int write_string_batch(const int64_t* timestamps,
+                                        const char* buffer,
+                                        const uint32_t* offsets,
+                                        uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY;
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+        } else if (RET_FAIL(value_encoder_->encode_string_batch(
+                       buffer, offsets, start_idx, count, value_out_stream_))) {
+            partial_failure_ = true;
+        } else {
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t idx = start_idx + i;
+                uint32_t len = offsets[idx + 1] - offsets[idx];
+                common::String val(buffer + offsets[idx], len);
+                statistic_->update(timestamps[i], val);
+            }
+        }
+        return ret;
+    }
+
+    FORCE_INLINE bool has_partial_failure() const { return partial_failure_; }
+
     FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
     FORCE_INLINE uint32_t get_time_out_stream_size() const {
         return time_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
+    // Memory-pressure accounting should use estimate_max_mem_size() below,
+    // which reflects the real 64 KiB-page footprint of the underlying
+    // ByteStreams.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return time_out_stream_.total_size() + value_out_stream_.total_size();
     }
@@ -162,10 +215,17 @@ class PageWriter {
      * outputStream and value outputStream, because size outputStream is never
      * used until flushing.
      *
+     * Reports the *allocated* stream footprint (sum of backing 64 KiB pages)
+     * rather than the logical bytes written.  Sparse workloads with many
+     * measurements would otherwise look like they hold ~0 memory while
+     * actually pinning a full 64 KiB page per stream, so chunk-group memory
+     * thresholds couldn't keep peak memory under the configured cap.
+     *
      * @return allocated size in time, value and outputStream
      */
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
-        return time_out_stream_.total_size() + value_out_stream_.total_size() +
+        return static_cast<uint32_t>(time_out_stream_.allocated_bytes() +
+                                     value_out_stream_.allocated_bytes()) +
                time_encoder_->get_max_byte_size() +
                value_encoder_->get_max_byte_size();
     }
@@ -179,6 +239,11 @@ class PageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     PageData get_cur_page_data() { return cur_page_data_; }
+    // See ValuePageWriter::release_cur_page_data for rationale.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
 
    private:
@@ -193,7 +258,6 @@ class PageWriter {
                           common::ByteStream& pages_data);
 
    private:
-    // static const uint32_t OUT_STREAM_PAGE_SIZE = 48;
     static const uint32_t OUT_STREAM_PAGE_SIZE = 1024;
 
    private:
@@ -206,6 +270,11 @@ class PageWriter {
     PageData cur_page_data_;
     Compressor* compressor_;
     bool is_inited_;
+    // Set when write_batch advanced the time stream but value encoding
+    // failed.  We can't unwind the partial time write, so refuse further
+    // writes and surface the poisoning to the higher layer via
+    // write_to_chunk().
+    bool partial_failure_ = false;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/time_chunk_writer.cc b/cpp/src/writer/time_chunk_writer.cc
index 0c7e3b212..0a0623686 100644
--- a/cpp/src/writer/time_chunk_writer.cc
+++ b/cpp/src/writer/time_chunk_writer.cc
@@ -144,6 +144,9 @@ int TimeChunkWriter::seal_cur_page(bool end_chunk) {
 void TimeChunkWriter::save_first_page_data(TimePageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // See ValueChunkWriter::save_first_page_data: avoid double-free on the
+    // shallow-copied buffer pointers.
+    first_page_writer.release_cur_page_data();
 }
 
 int TimeChunkWriter::write_first_page_data(ByteStream& pages_data,
@@ -173,9 +176,6 @@ int TimeChunkWriter::end_encode_chunk() {
             chunk_header_.data_size_ = chunk_data_.total_size();
             chunk_header_.num_of_pages_ = num_of_pages_;
         }
-    } else if (num_of_pages_ > 0) {
-        chunk_header_.data_size_ = chunk_data_.total_size();
-        chunk_header_.num_of_pages_ = num_of_pages_;
     }
 #if DEBUG_SE
     std::cout << "end_encode_time_chunk: num_of_pages_=" << num_of_pages_
diff --git a/cpp/src/writer/time_chunk_writer.h b/cpp/src/writer/time_chunk_writer.h
index c67516ba5..e6b2894e2 100644
--- a/cpp/src/writer/time_chunk_writer.h
+++ b/cpp/src/writer/time_chunk_writer.h
@@ -42,8 +42,7 @@ class TimeChunkWriter {
           first_page_data_(),
           first_page_statistic_(nullptr),
           chunk_header_(),
-          num_of_pages_(0),
-          enable_page_seal_if_full_(true) {}
+          num_of_pages_(0) {}
     ~TimeChunkWriter() { destroy(); }
     int init(const common::ColumnSchema& col_schema);
     int init(const std::string& measurement_name, common::TSEncoding encoding,
@@ -58,9 +57,35 @@ class TimeChunkWriter {
         if (RET_FAIL(time_page_writer_.write(timestamp))) {
             return ret;
         }
-        if (UNLIKELY(!enable_page_seal_if_full_)) {
+        if (RET_FAIL(seal_cur_page_if_full())) {
             return ret;
-        } else {
+        }
+        return ret;
+    }
+
+    int write_batch(const int64_t* timestamps, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = time_page_writer_.get_point_numer();
+            // Seal whenever cur_points is at or past the cap; the counter is
+            // size_ (rows including the just-written batch) and may exceed
+            // page_cap, so a plain subtraction would underflow uint32_t.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(time_page_writer_.write_batch(timestamps + offset,
+                                                       batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
             if (RET_FAIL(seal_cur_page_if_full())) {
                 return ret;
             }
@@ -73,29 +98,25 @@ class TimeChunkWriter {
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
     FORCE_INLINE int32_t num_of_pages() const { return num_of_pages_; }
 
+    int64_t estimate_max_series_mem_size();
+
+    bool hasData();
+
     // Current (unsealed) page point count.
     FORCE_INLINE uint32_t get_point_numer() const {
         return time_page_writer_.get_point_numer();
     }
 
-    int64_t estimate_max_series_mem_size();
-
-    bool hasData();
-
     /** True if the current (unsealed) page has at least one point. */
     bool has_current_page_data() const {
         return time_page_writer_.get_point_numer() > 0;
     }
 
-    /**
-     * Force seal the current page (for aligned model: when any aligned page
-     * seals due to memory/point threshold, all pages must seal together).
-     * @return E_OK on success.
-     */
+    /** Force seal the current page. */
     int seal_current_page() { return seal_cur_page(false); }
 
-    // For aligned writer: allow disabling the automatic page-size/point-number
-    // check so the caller can seal pages at chosen boundaries.
+    // Allow disabling the automatic page-size/point-number check so the
+    // caller can seal pages at chosen boundaries.
     FORCE_INLINE void set_enable_page_seal_if_full(bool enable) {
         enable_page_seal_if_full_ = enable;
     }
@@ -109,6 +130,9 @@ class TimeChunkWriter {
                 common::g_config_value_.page_writer_max_memory_bytes_);
     }
     FORCE_INLINE int seal_cur_page_if_full() {
+        if (UNLIKELY(!enable_page_seal_if_full_)) {
+            return common::E_OK;
+        }
         if (UNLIKELY(is_cur_page_full())) {
             return seal_cur_page(false);
         }
@@ -138,8 +162,7 @@ class TimeChunkWriter {
 
     ChunkHeader chunk_header_;
     int32_t num_of_pages_;
-    // If false, write() won't auto-seal when the current page becomes full.
-    bool enable_page_seal_if_full_;
+    bool enable_page_seal_if_full_ = true;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/time_page_writer.h b/cpp/src/writer/time_page_writer.h
index d9dcecff1..bda9a5023 100644
--- a/cpp/src/writer/time_page_writer.h
+++ b/cpp/src/writer/time_page_writer.h
@@ -84,15 +84,40 @@ class TimePageWriter {
         return ret;
     }
 
+    int write_batch(const int64_t* timestamps, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        // Check order: first timestamp vs existing end_time
+        if (statistic_->count_ != 0 && is_inited_ &&
+            timestamps[0] <= statistic_->end_time_) {
+            return common::E_OUT_OF_ORDER;
+        }
+        // Check monotonicity within batch
+        for (uint32_t i = 1; i < count; i++) {
+            if (timestamps[i] <= timestamps[i - 1]) {
+                return common::E_OUT_OF_ORDER;
+            }
+        }
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+        } else {
+            statistic_->update_time_batch(timestamps, count);
+        }
+        return ret;
+    }
+
     FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
     FORCE_INLINE uint32_t get_time_out_stream_size() const {
         return time_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return time_out_stream_.total_size();
     }
+    // Allocated 64 KiB-page footprint — used by chunk-group memory pressure
+    // accounting.  See PageWriter::estimate_max_mem_size.
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
-        return time_out_stream_.total_size() +
+        return static_cast<uint32_t>(time_out_stream_.allocated_bytes()) +
                time_encoder_->get_max_byte_size();
     }
     int write_to_chunk(common::ByteStream& pages_data, bool write_header,
@@ -102,6 +127,11 @@ class TimePageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     TimePageData get_cur_page_data() { return cur_page_data_; }
+    // See ValuePageWriter::release_cur_page_data for rationale.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
 
    private:
diff --git a/cpp/src/writer/tsfile_table_writer.cc b/cpp/src/writer/tsfile_table_writer.cc
index eb0319af8..b1b7911bd 100644
--- a/cpp/src/writer/tsfile_table_writer.cc
+++ b/cpp/src/writer/tsfile_table_writer.cc
@@ -45,7 +45,7 @@ TsFileTableWriter::TsFileTableWriter(
 
 }  // namespace storage
 
-storage::TsFileTableWriter::~TsFileTableWriter() = default;
+storage::TsFileTableWriter::~TsFileTableWriter() { close(); }
 
 int storage::TsFileTableWriter::register_table(
     const std::shared_ptr<TableSchema>& table_schema) {
@@ -66,21 +66,48 @@ int storage::TsFileTableWriter::write_table(storage::Tablet& tablet) const {
                tablet.get_table_name() != exclusive_table_name_) {
         return common::E_TABLE_NOT_EXIST;
     }
+    // Always lowercase the incoming tablet's table / column / schema-map
+    // names: each call may carry a fresh tablet with mixed-case identifiers,
+    // and the underlying engine expects lowercase. Lowering is idempotent so
+    // reusing the same tablet across calls remains cheap.
     tablet.set_table_name(to_lower(tablet.get_table_name()));
     for (size_t i = 0; i < tablet.get_column_count(); i++) {
         tablet.set_column_name(i, to_lower(tablet.get_column_name(i)));
     }
 
     auto schema_map = tablet.get_schema_map();
-    std::map<std::string, int> schema_map_;
+    std::map<std::string, int> new_schema_map;
     for (auto iter = schema_map.begin(); iter != schema_map.end(); iter++) {
-        schema_map_[to_lower(iter->first)] = iter->second;
+        new_schema_map[to_lower(iter->first)] = iter->second;
     }
-    tablet.set_schema_map(schema_map_);
+    tablet.set_schema_map(new_schema_map);
 
     return tsfile_writer_->write_table(tablet);
 }
 
-int storage::TsFileTableWriter::flush() { return tsfile_writer_->flush(); }
+int storage::TsFileTableWriter::flush() {
+    if (closed_) {
+        return common::E_OK;
+    }
+    return tsfile_writer_->flush();
+}
 
-int storage::TsFileTableWriter::close() { return tsfile_writer_->close(); }
+int storage::TsFileTableWriter::close() {
+    if (closed_) {
+        return common::E_OK;
+    }
+    if (!tsfile_writer_) {
+        closed_ = true;
+        return common::E_OK;
+    }
+    // Don't latch closed_ until the underlying writer reports success: a
+    // failed footer write / sync / file close should be retryable, and the
+    // destructor must still be able to drive a final close attempt.  The
+    // previous order returned E_OK on every retry after the first failure,
+    // potentially leaving the file unfinished and leaking the fd.
+    int ret = tsfile_writer_->close();
+    if (ret == common::E_OK) {
+        closed_ = true;
+    }
+    return ret;
+}
diff --git a/cpp/src/writer/tsfile_table_writer.h b/cpp/src/writer/tsfile_table_writer.h
index ce18bc007..a2d2a5fd9 100644
--- a/cpp/src/writer/tsfile_table_writer.h
+++ b/cpp/src/writer/tsfile_table_writer.h
@@ -124,6 +124,8 @@ class TsFileTableWriter {
     // Some errors may not be conveyed during the construction phase, so it's
     // necessary to maintain an internal error code.
     int error_number = common::E_OK;
+
+    bool closed_ = false;
 };
 
 }  // namespace storage
diff --git a/cpp/src/writer/tsfile_writer.cc b/cpp/src/writer/tsfile_writer.cc
index bc3398d98..c469faaec 100644
--- a/cpp/src/writer/tsfile_writer.cc
+++ b/cpp/src/writer/tsfile_writer.cc
@@ -25,8 +25,12 @@
 #include <unistd.h>
 #endif
 
+#include <chrono>
+#include <iomanip>
+
 #include "chunk_writer.h"
 #include "common/config/config.h"
+#include "common/global.h"
 #ifdef ENABLE_THREADS
 #include "common/thread_pool.h"
 #endif
@@ -56,23 +60,19 @@ int libtsfile_init() {
 }
 
 void libtsfile_destroy() {
+    ModStat::get_instance().destroy();
 #ifdef ENABLE_THREADS
-    delete common::g_write_thread_pool_;
-    common::g_write_thread_pool_ = nullptr;
+    delete common::g_thread_pool_;
+    common::g_thread_pool_ = nullptr;
 #endif
-    ModStat::get_instance().destroy();
     libtsfile::g_s_is_inited = false;
 }
 
-void set_page_max_point_count(uint32_t page_max_ponint_count) {
-    config_set_page_max_point_count(page_max_ponint_count);
+int set_page_max_point_count(uint32_t page_max_ponint_count) {
+    return config_set_page_max_point_count(page_max_ponint_count);
 }
-void set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
-    config_set_max_degree_of_index_node(max_degree_of_index_node);
-}
-
-void set_strict_page_size(bool strict_page_size) {
-    config_set_strict_page_size(strict_page_size);
+int set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+    return config_set_max_degree_of_index_node(max_degree_of_index_node);
 }
 
 TsFileWriter::TsFileWriter()
@@ -84,8 +84,7 @@ TsFileWriter::TsFileWriter()
       record_count_for_next_mem_check_(
           g_config_value_.record_count_for_next_mem_check_),
       write_file_created_(false),
-      io_writer_owned_(true),
-      enforce_recovered_last_time_order_(false) {}
+      io_writer_owned_(true) {}
 
 TsFileWriter::~TsFileWriter() { destroy(); }
 
@@ -131,7 +130,19 @@ int TsFileWriter::init(WriteFile* write_file) {
     write_file_ = write_file;
     write_file_created_ = false;
     io_writer_owned_ = true;
+    // Re-arm per-lifecycle state when the writer is reused after a
+    // destroy().  enforce_recovered_last_time_order_ may have been set
+    // true by a previous recovery init; without resetting it we'd refuse
+    // valid writes whose timestamps don't satisfy a long-stale anchor.
+    // unrecoverable_ from a previous partial-write failure would otherwise
+    // make every operation on the new file fail immediately.
+    // start_file_done_ is true after the previous lifecycle's first flush,
+    // so without resetting it flush() would skip the magic/version write on
+    // the new file and produce headerless output.
     enforce_recovered_last_time_order_ = false;
+    unrecoverable_ = false;
+    start_file_done_ = false;
+    record_count_since_last_flush_ = 0;
     io_writer_ = new TsFileIOWriter();
     io_writer_->init(write_file_);
     return E_OK;
@@ -151,6 +162,10 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) {
     write_file_ = rw->get_write_file();
     write_file_created_ = false;
     io_writer_owned_ = false;
+    // Clear any unrecoverable_ latched from a previous lifecycle so the
+    // re-init isn't immediately poisoned.
+    unrecoverable_ = false;
+    // Reject new writes whose timestamps fall back into the recovered range.
     enforce_recovered_last_time_order_ = true;
     io_writer_ = rw;
 
@@ -188,6 +203,8 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) {
             if (cm == nullptr) {
                 continue;
             }
+            // Track the highest end_time across recovered chunks so that
+            // appending writes can refuse out-of-order timestamps.
             if (cm->statistic_ != nullptr && cm->statistic_->count_ > 0) {
                 group->last_time_ =
                     std::max(group->last_time_, cm->statistic_->end_time_);
@@ -682,6 +699,10 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() {
     return mem_total_size;
 }
 
+int64_t TsFileWriter::calculate_meta_mem_size() const {
+    return io_writer_->get_meta_size();
+}
+
 /**
  * check occupied memory size, if it exceeds the chunkGroupSize threshold, flush
  * them to given OutputStream.
@@ -689,7 +710,15 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() {
 int TsFileWriter::check_memory_size_and_may_flush_chunks() {
     int ret = E_OK;
     if (record_count_since_last_flush_ >= record_count_for_next_mem_check_) {
-        int64_t mem_size = calculate_mem_size_for_all_group();
+        // chunk-writer memory drops to ~0 after flush, but chunk metadata
+        // (ChunkMeta / ChunkGroupMeta / per-statistic PageArenas) keeps
+        // accumulating until end_file().  Wide-schema or many-flush
+        // workloads can pile up tens of MB of metadata that the old
+        // threshold check ignored entirely — flush would never fire even
+        // though total writer memory was well past chunk_group_size_threshold_.
+        int64_t chunk_size = calculate_mem_size_for_all_group();
+        int64_t meta_size = calculate_meta_mem_size();
+        int64_t mem_size = chunk_size + meta_size;
         record_count_for_next_mem_check_ =
             record_count_since_last_flush_ *
             common::g_config_value_.chunk_group_size_threshold_ / mem_size;
@@ -701,16 +730,17 @@ int TsFileWriter::check_memory_size_and_may_flush_chunks() {
 }
 
 int TsFileWriter::write_record(const TsRecord& record) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id = std::make_shared<StringArrayDeviceID>(record.device_id_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
-    if (enforce_recovered_last_time_order_ &&
-        record.timestamp_ <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+    // After recovery, refuse writes whose timestamp would land at or before
+    // any already-flushed chunk's end_time for this device.
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            record.timestamp_ <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     // std::vector<ChunkWriter*> chunk_writers;
     SimpleVector<ChunkWriter*> chunk_writers;
@@ -732,24 +762,28 @@ int TsFileWriter::write_record(const TsRecord& record) {
                     record.points_[c]);
     }
 
-    device_schema->last_time_ =
-        std::max(device_schema->last_time_, record.timestamp_);
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_, record.timestamp_);
+        }
+    }
     record_count_since_last_flush_++;
     ret = check_memory_size_and_may_flush_chunks();
     return ret;
 }
 
 int TsFileWriter::write_record_aligned(const TsRecord& record) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id = std::make_shared<StringArrayDeviceID>(record.device_id_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
-    if (enforce_recovered_last_time_order_ &&
-        record.timestamp_ <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            record.timestamp_ <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ValueChunkWriter*> value_chunk_writers;
     SimpleVector<common::TSDataType> data_types;
@@ -763,6 +797,8 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) {
     if (value_chunk_writers.size() != record.points_.size()) {
         return E_INVALID_ARG;
     }
+    // Snapshot page counters before the write so we can detect any column
+    // that crossed a page boundary and seal the rest in lockstep.
     int32_t time_pages_before = time_chunk_writer->num_of_pages();
     std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
@@ -771,22 +807,40 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) {
             value_pages_before[c] = value_chunk_writer->num_of_pages();
         }
     }
-    time_chunk_writer->write(record.timestamp_);
+    // Time first: a rejected timestamp (E_OUT_OF_ORDER, OOM, etc.) must
+    // not silently advance the value writers — that would leave the time
+    // chunk one row behind every value chunk for the rest of the file.
+    if (RET_FAIL(time_chunk_writer->write(record.timestamp_))) {
+        return ret;
+    }
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
         ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
         if (IS_NULL(value_chunk_writer)) {
             continue;
         }
-        write_point_aligned(value_chunk_writer, record.timestamp_,
-                            data_types[c], record.points_[c]);
+        if (RET_FAIL(write_point_aligned(value_chunk_writer, record.timestamp_,
+                                         data_types[c], record.points_[c]))) {
+            // Time wrote the row but at least one value column failed
+            // mid-record; the per-column row counts no longer agree.
+            // Mark the writer unrecoverable so flush/close refuses to
+            // seal a misaligned chunk group.
+            unrecoverable_ = true;
+            return ret;
+        }
     }
     if (RET_FAIL(maybe_seal_aligned_pages_together(
             time_chunk_writer, value_chunk_writers, time_pages_before,
             value_pages_before))) {
+        unrecoverable_ = true;
         return ret;
     }
-    device_schema->last_time_ =
-        std::max(device_schema->last_time_, record.timestamp_);
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_, record.timestamp_);
+        }
+    }
     return ret;
 }
 
@@ -815,39 +869,10 @@ int TsFileWriter::write_point(ChunkWriter* chunk_writer, int64_t timestamp,
     }
 }
 
-int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer,
-                                      int64_t timestamp,
-                                      common::TSDataType data_type,
-                                      const DataPoint& point) {
-    bool isnull = point.isnull;
-    switch (data_type) {
-        case common::BOOLEAN:
-            return value_chunk_writer->write(timestamp, point.u_.bool_val_,
-                                             isnull);
-        case common::INT32:
-        case common::DATE:
-            return value_chunk_writer->write(timestamp, point.u_.i32_val_,
-                                             isnull);
-        case common::TIMESTAMP:
-        case common::INT64:
-            return value_chunk_writer->write(timestamp, point.u_.i64_val_,
-                                             isnull);
-        case common::FLOAT:
-            return value_chunk_writer->write(timestamp, point.u_.float_val_,
-                                             isnull);
-        case common::DOUBLE:
-            return value_chunk_writer->write(timestamp, point.u_.double_val_,
-                                             isnull);
-        case common::BLOB:
-        case common::TEXT:
-        case common::STRING:
-            return value_chunk_writer->write(timestamp, point.text_val_,
-                                             isnull);
-        default:
-            return E_INVALID_DATA_POINT;
-    }
-}
-
+// After writing one record / batch to the time chunk and every value chunk,
+// keep their page boundaries aligned: if any of them autosealed a page on
+// memory pressure, seal the rest of the open pages too so an aligned reader
+// can still pair position N across time + every value column.
 int TsFileWriter::maybe_seal_aligned_pages_together(
     TimeChunkWriter* time_chunk_writer,
     common::SimpleVector<ValueChunkWriter*>& value_chunk_writers,
@@ -883,19 +908,52 @@ int TsFileWriter::maybe_seal_aligned_pages_together(
     return ret;
 }
 
+int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer,
+                                      int64_t timestamp,
+                                      common::TSDataType data_type,
+                                      const DataPoint& point) {
+    bool isnull = point.isnull;
+    switch (data_type) {
+        case common::BOOLEAN:
+            return value_chunk_writer->write(timestamp, point.u_.bool_val_,
+                                             isnull);
+        case common::INT32:
+        case common::DATE:
+            return value_chunk_writer->write(timestamp, point.u_.i32_val_,
+                                             isnull);
+        case common::TIMESTAMP:
+        case common::INT64:
+            return value_chunk_writer->write(timestamp, point.u_.i64_val_,
+                                             isnull);
+        case common::FLOAT:
+            return value_chunk_writer->write(timestamp, point.u_.float_val_,
+                                             isnull);
+        case common::DOUBLE:
+            return value_chunk_writer->write(timestamp, point.u_.double_val_,
+                                             isnull);
+        case common::BLOB:
+        case common::TEXT:
+        case common::STRING:
+            return value_chunk_writer->write(timestamp, point.text_val_,
+                                             isnull);
+        default:
+            return E_INVALID_DATA_POINT;
+    }
+}
+
 int TsFileWriter::write_tablet_aligned(const Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id =
         std::make_shared<StringArrayDeviceID>(tablet.insert_target_name_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
     const uint32_t total_rows = tablet.get_cur_row_size();
     if (enforce_recovered_last_time_order_ && total_rows > 0 &&
-        tablet.timestamps_[0] <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            tablet.timestamps_[0] <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ValueChunkWriter*> value_chunk_writers;
     TimeChunkWriter* time_chunk_writer = nullptr;
@@ -906,247 +964,109 @@ int TsFileWriter::write_tablet_aligned(const Tablet& tablet) {
                                          data_types))) {
         return ret;
     }
-    const bool strict_page_size = common::g_config_value_.strict_page_size_;
-
-    // Decide whether we have string/blob/text columns.
-    bool has_varlen_column = false;
-    for (uint32_t i = 0; i < data_types.size(); i++) {
-        if (data_types[i] == common::STRING || data_types[i] == common::TEXT ||
-            data_types[i] == common::BLOB) {
-            has_varlen_column = true;
-            break;
-        }
-    }
-
-    // Keep writers' seal-check behavior consistent across calls.
-    time_chunk_writer->set_enable_page_seal_if_full(strict_page_size);
-    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-        if (!IS_NULL(value_chunk_writers[c])) {
-            value_chunk_writers[c]->set_enable_page_seal_if_full(
-                strict_page_size);
-        }
-    }
-
-    if (strict_page_size) {
-        // Strict mode: keep the original row-based insertion to ensure aligned
-        // pages seal together when either side becomes full.
-        for (uint32_t row = 0; row < total_rows; row++) {
-            int32_t time_pages_before = time_chunk_writer->num_of_pages();
-            std::vector<int32_t> value_pages_before(value_chunk_writers.size(),
-                                                    0);
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (!IS_NULL(value_chunk_writer)) {
-                    value_pages_before[c] = value_chunk_writer->num_of_pages();
-                }
-            }
-
-            if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) {
-                return ret;
-            }
-            ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (IS_NULL(value_chunk_writer)) {
-                    continue;
-                }
-                if (RET_FAIL(value_write_column(value_chunk_writer, tablet, c,
-                                                row, row + 1))) {
-                    return ret;
-                }
-            }
-            if (RET_FAIL(maybe_seal_aligned_pages_together(
-                    time_chunk_writer, value_chunk_writers, time_pages_before,
-                    value_pages_before))) {
-                return ret;
-            }
+    ASSERT(data_types.size() == tablet.get_column_count());
+    for (uint32_t c = 0; c < data_types.size(); c++) {
+        if (data_types[c] == common::NULL_TYPE) {
+            continue;
         }
-        if (total_rows > 0) {
-            device_schema->last_time_ = std::max(
-                device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+        if (data_types[c] != tablet.schema_vec_->at(c).data_type_) {
+            return E_TYPE_NOT_MATCH;
         }
-        return ret;
     }
-
-    // Non-strict mode: switch to column-based insertion.
-    if (!has_varlen_column) {
-        // Optimization: when there is no string/blob/text column, we only need
-        // to split by point-number so that each split will trigger a page
-        // seal (and avoid the per-row page-size check).
-        const uint32_t points_per_page =
-            common::g_config_value_.page_writer_max_point_num_;
-
-        // Disable auto page sealing. We will seal pages at split boundaries.
-        time_chunk_writer->set_enable_page_seal_if_full(false);
-        for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-            if (!IS_NULL(value_chunk_writers[c])) {
-                value_chunk_writers[c]->set_enable_page_seal_if_full(false);
-            }
-        }
-
-        // Determine how many points we need to fill the current unsealed time
-        // page (it may already contain data from previous tablets).
-        uint32_t time_cur_points = time_chunk_writer->get_point_numer();
-        if (time_cur_points >= points_per_page &&
-            time_chunk_writer->has_current_page_data()) {
-            // Close the already-full page together with all aligned value
-            // pages.
-            if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                return ret;
-            }
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (!IS_NULL(value_chunk_writer) &&
-                    value_chunk_writer->has_current_page_data()) {
-                    if (RET_FAIL(value_chunk_writer->seal_current_page())) {
-                        return ret;
-                    }
-                }
-            }
-            time_cur_points = 0;
-        }
-        const uint32_t first_seg_len =
-            (time_cur_points > 0 && time_cur_points < points_per_page)
-                ? (points_per_page - time_cur_points)
-                : points_per_page;
-
-        // 1) Write time in segments and seal all full segments (except the
-        // last remaining segment).
-        uint32_t seg_start = 0;
-        uint32_t seg_len = first_seg_len;
-        while (seg_start < total_rows) {
-            const uint32_t seg_end = std::min(seg_start + seg_len, total_rows);
-            if (RET_FAIL(time_write_column(time_chunk_writer, tablet, seg_start,
-                                           seg_end))) {
-                return ret;
-            }
-            seg_start = seg_end;
-            if (seg_start < total_rows) {
-                if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                    return ret;
-                }
-            }
-            seg_len = points_per_page;
-        }
-
-        // 2) Write each value column in the same segments.
-        ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-        for (uint32_t col = 0; col < value_chunk_writers.size(); col++) {
-            ValueChunkWriter* value_chunk_writer = value_chunk_writers[col];
-            if (IS_NULL(value_chunk_writer)) {
-                continue;
-            }
-
-            seg_start = 0;
-            seg_len = first_seg_len;
-            while (seg_start < total_rows) {
-                const uint32_t seg_end =
-                    std::min(seg_start + seg_len, total_rows);
-                if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                                seg_start, seg_end))) {
-                    return ret;
-                }
-                seg_start = seg_end;
-                if (seg_start < total_rows) {
-                    if (value_chunk_writer->has_current_page_data() &&
-                        RET_FAIL(value_chunk_writer->seal_current_page())) {
-                        return ret;
-                    }
-                }
-                seg_len = points_per_page;
-            }
-        }
-        if (total_rows > 0) {
-            device_schema->last_time_ = std::max(
-                device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    // Snapshot page counters before the batch so we can detect any column
+    // that crossed a page boundary mid-tablet and seal the rest in lockstep.
+    int32_t time_pages_before = time_chunk_writer->num_of_pages();
+    std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
+    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+        if (!IS_NULL(value_chunk_writer)) {
+            value_pages_before[c] = value_chunk_writer->num_of_pages();
         }
-        return ret;
     }
-
-    // General non-strict (may have varlen STRING/TEXT/BLOB columns):
-    // time auto-seals to provide aligned page boundaries; value writers
-    // skip auto page sealing and are sealed manually at time boundaries.
-    // Attention: since value-side auto-seal is disabled, if a varlen value
-    // page hits the memory threshold earlier, it may not seal immediately
-    // and instead will be sealed later at the recorded time-page boundaries
-    // (this may sacrifice the strict page size limit for performance).
-    time_chunk_writer->set_enable_page_seal_if_full(true);
+    // Suppress memory-driven page sealing on every column for the duration of
+    // the batch. The count-driven seals inside write_batch still fire at the
+    // same `page_writer_max_point_num_` boundary on every writer (time +
+    // values), which keeps aligned page boundaries in lock-step. Re-enable
+    // both before returning so subsequent record-by-record writes restore the
+    // normal memory-pressure behavior, and let the final
+    // maybe_seal_aligned_pages_together pick up any count-driven divergence
+    // (e.g. when a sealed value column ended a page that the time column did
+    // not).
+    time_chunk_writer->set_enable_page_seal_if_full(false);
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-        if (!IS_NULL(value_chunk_writers[c])) {
-            value_chunk_writers[c]->set_enable_page_seal_if_full(false);
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+        if (!IS_NULL(value_chunk_writer)) {
+            value_chunk_writer->set_enable_page_seal_if_full(false);
         }
     }
-
-    std::vector<uint32_t> time_page_row_ends;
-    const uint32_t page_max_points = std::max<uint32_t>(
-        1, common::g_config_value_.page_writer_max_point_num_);
-    time_page_row_ends.reserve(total_rows / page_max_points + 1);
-
-    // Write time and record where a time page is sealed.
-    for (uint32_t row = 0; row < total_rows; row++) {
-        const int32_t pages_before = time_chunk_writer->num_of_pages();
-        if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) {
-            return ret;
-        }
-        const int32_t pages_after = time_chunk_writer->num_of_pages();
-        if (pages_after > pages_before) {
-            const uint32_t boundary_end = row + 1;
-            if (time_page_row_ends.empty() ||
-                time_page_row_ends.back() != boundary_end) {
-                time_page_row_ends.push_back(boundary_end);
+    auto restore_seal = [&]() {
+        time_chunk_writer->set_enable_page_seal_if_full(true);
+        for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
+            if (!IS_NULL(value_chunk_writers[k])) {
+                value_chunk_writers[k]->set_enable_page_seal_if_full(true);
             }
         }
+    };
+    // Any failure (out-of-order timestamps, OOM, etc.) must abort before we
+    // write a single value column — otherwise the time chunk would record
+    // fewer rows than each value chunk and the chunk-group would deserialize
+    // as misaligned data.
+    if (RET_FAIL(time_write_column_batch(time_chunk_writer, tablet, 0,
+                                         total_rows))) {
+        restore_seal();
+        return ret;
     }
-
-    // Write values column-by-column and seal at recorded boundaries.
     ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-    for (uint32_t col = 0; col < value_chunk_writers.size(); col++) {
-        ValueChunkWriter* value_chunk_writer = value_chunk_writers[col];
+    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
         if (IS_NULL(value_chunk_writer)) {
             continue;
         }
-        uint32_t seg_start = 0;
-        for (uint32_t boundary_end : time_page_row_ends) {
-            if (boundary_end <= seg_start) {
-                continue;
-            }
-            if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                            seg_start, boundary_end))) {
-                return ret;
-            }
-            if (value_chunk_writer->has_current_page_data() &&
-                RET_FAIL(value_chunk_writer->seal_current_page())) {
-                return ret;
-            }
-            seg_start = boundary_end;
-        }
-        if (seg_start < total_rows) {
-            if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                            seg_start, total_rows))) {
-                return ret;
-            }
+        if (RET_FAIL(value_write_column_batch(value_chunk_writer, tablet, c, 0,
+                                              total_rows))) {
+            restore_seal();
+            // Time chunk has the full row count but at least one value
+            // column stopped early.  Mark the writer unrecoverable so no
+            // later flush/close seals the divergent state.
+            unrecoverable_ = true;
+            return ret;
         }
     }
-    if (total_rows > 0) {
-        device_schema->last_time_ = std::max(
-            device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    restore_seal();
+    if (RET_FAIL(maybe_seal_aligned_pages_together(
+            time_chunk_writer, value_chunk_writers, time_pages_before,
+            value_pages_before))) {
+        unrecoverable_ = true;
+        return ret;
+    }
+    if (enforce_recovered_last_time_order_ && total_rows > 0 &&
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_,
+                         tablet.timestamps_[total_rows - 1]);
+        }
     }
     return ret;
 }
 
 int TsFileWriter::write_tablet(const Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id =
         std::make_shared<StringArrayDeviceID>(tablet.insert_target_name_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
+    // Use the actual filled row count — max_row_num_ is the buffer capacity
+    // and would let uninitialized timestamps/values past the live range leak
+    // into the chunk.
     const uint32_t total_rows = tablet.get_cur_row_size();
     if (enforce_recovered_last_time_order_ && total_rows > 0 &&
-        tablet.timestamps_[0] <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            tablet.timestamps_[0] <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ChunkWriter*> chunk_writers;
     SimpleVector<common::TSDataType> data_types;
@@ -1155,22 +1075,44 @@ int TsFileWriter::write_tablet(const Tablet& tablet) {
                                  data_types))) {
         return ret;
     }
+    ASSERT(data_types.size() == tablet.get_column_count());
+    for (uint32_t c = 0; c < data_types.size(); c++) {
+        if (data_types[c] == common::NULL_TYPE) {
+            continue;
+        }
+        if (data_types[c] != tablet.schema_vec_->at(c).data_type_) {
+            return E_TYPE_NOT_MATCH;
+        }
+    }
     ASSERT(chunk_writers.size() == tablet.get_column_count());
+    uint32_t columns_written = 0;
     for (uint32_t c = 0; c < chunk_writers.size(); c++) {
         ChunkWriter* chunk_writer = chunk_writers[c];
         if (IS_NULL(chunk_writer)) {
             continue;
         }
-        if (RET_FAIL(write_column(chunk_writer, tablet, c))) {
+        if (RET_FAIL(
+                write_column_batch(chunk_writer, tablet, c, 0, total_rows))) {
+            // Earlier columns already advanced their chunk writers; this
+            // column failed mid-write, so per-column row counts diverge.
+            // Mark unrecoverable so flush/close refuse to seal the
+            // misaligned tree chunk group.
+            if (columns_written > 0) unrecoverable_ = true;
             return ret;
         }
+        columns_written++;
     }
 
-    if (total_rows > 0) {
-        device_schema->last_time_ = std::max(
-            device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    if (enforce_recovered_last_time_order_ && total_rows > 0 &&
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_,
+                         tablet.timestamps_[total_rows - 1]);
+        }
     }
-    record_count_since_last_flush_ += tablet.max_row_num_;
+    record_count_since_last_flush_ += total_rows;
     ret = check_memory_size_and_may_flush_chunks();
     return ret;
 }
@@ -1201,6 +1143,7 @@ int TsFileWriter::write_tree(const TsRecord& record) {
 }
 
 int TsFileWriter::write_table(Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     if (io_writer_->get_schema()->table_schema_map_.find(
             tablet.insert_target_name_) ==
@@ -1213,175 +1156,332 @@ int TsFileWriter::write_table(Tablet& tablet) {
     }
 
     auto device_id_end_index_pairs = split_tablet_by_device(tablet);
-    int start_idx = 0;
-    for (auto& device_id_end_index_pair : device_id_end_index_pairs) {
-        auto device_id = device_id_end_index_pair.first;
-        int end_idx = device_id_end_index_pair.second;
-        if (end_idx == 0) continue;
-
-        SimpleVector<ValueChunkWriter*> value_chunk_writers;
-        TimeChunkWriter* time_chunk_writer = nullptr;
-        if (RET_FAIL(do_check_schema_table(device_id, tablet, time_chunk_writer,
-                                           value_chunk_writers))) {
-            return ret;
-        }
-        auto schema_it = schemas_.find(device_id);
-        MeasurementSchemaGroup* device_schema =
-            (schema_it == schemas_.end()) ? nullptr : schema_it->second;
 
-        std::vector<uint32_t> field_columns;
-        field_columns.reserve(tablet.get_column_count());
-        for (uint32_t col = 0; col < tablet.get_column_count(); ++col) {
-            if (tablet.column_categories_[col] ==
-                common::ColumnCategory::FIELD) {
-                field_columns.push_back(col);
-            }
-        }
-        ASSERT(field_columns.size() == value_chunk_writers.size());
-
-        // Precompute page boundaries from point counts — no serial write
-        // needed.  The first segment may be shorter if the time page already
-        // holds data from a previous write_table call.
-        const uint32_t page_max_points = std::max<uint32_t>(
-            1, common::g_config_value_.page_writer_max_point_num_);
-        const uint32_t si = static_cast<uint32_t>(start_idx);
-        const uint32_t ei = static_cast<uint32_t>(end_idx);
-        if (enforce_recovered_last_time_order_ && device_schema != nullptr &&
-            si < ei && tablet.timestamps_[si] <= device_schema->last_time_) {
-            return E_OUT_OF_ORDER;
-        }
+    if (table_aligned_) {
+        struct ValueTask {
+            ValueChunkWriter* vcw;
+            uint32_t col_idx;
+        };
+        struct SegmentRange {
+            uint32_t si;
+            uint32_t ei;
+        };
+        struct DeviceWriteCtx {
+            TimeChunkWriter* tcw;
+            std::vector<ValueTask> value_tasks;
+            std::vector<SegmentRange> segments;
+            uint32_t initial_page_points;
+        };
 
-        // If the current unsealed page is already at or past capacity (from
-        // a previous write_table call), seal it before starting new segments.
-        uint32_t time_cur_points = time_chunk_writer->get_point_numer();
-        if (time_cur_points >= page_max_points) {
-            if (time_chunk_writer->has_current_page_data()) {
-                if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                    return ret;
+        const uint32_t page_max_points =
+            std::max<uint32_t>(1, g_config_value_.page_writer_max_point_num_);
+
+        std::vector<DeviceWriteCtx> device_ctxs;
+        std::map<std::shared_ptr<IDeviceID>, size_t, IDeviceIDComparator>
+            device_ctx_index;
+        int start_idx = 0;
+        for (auto& pair : device_id_end_index_pairs) {
+            auto device_id = pair.first;
+            int end_idx = pair.second;
+            if (end_idx == 0) continue;
+
+            const uint32_t si = static_cast<uint32_t>(start_idx);
+            const uint32_t ei = static_cast<uint32_t>(end_idx);
+            // Recovery: refuse any segment whose first timestamp would land
+            // at or before a flushed chunk's end_time for this device. This
+            // mirrors the per-record / per-tablet check on the tree path.
+            if (enforce_recovered_last_time_order_ && tablet.timestamps_ &&
+                ei > si) {
+                auto schema_it = schemas_.find(device_id);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr &&
+                    tablet.timestamps_[si] <= schema_it->second->last_time_) {
+                    return E_OUT_OF_ORDER;
                 }
             }
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                if (!IS_NULL(value_chunk_writers[k]) &&
-                    value_chunk_writers[k]->has_current_page_data()) {
-                    if (RET_FAIL(value_chunk_writers[k]->seal_current_page())) {
-                        return ret;
+            auto idx_it = device_ctx_index.find(device_id);
+            if (idx_it == device_ctx_index.end()) {
+                SimpleVector<ValueChunkWriter*> value_chunk_writers;
+                TimeChunkWriter* time_chunk_writer = nullptr;
+                if (RET_FAIL(do_check_schema_table(device_id, tablet,
+                                                   time_chunk_writer,
+                                                   value_chunk_writers))) {
+                    return ret;
+                }
+
+                uint32_t time_cur_points = time_chunk_writer->get_point_numer();
+                if (time_cur_points >= page_max_points) {
+                    // Seal the time page first, then every value page in
+                    // lockstep.  Any failure leaves columns at different
+                    // page boundaries and the chunk group can no longer be
+                    // sealed coherently — mark the writer unrecoverable.
+                    if (time_chunk_writer->has_current_page_data()) {
+                        if (RET_FAIL(time_chunk_writer->seal_current_page())) {
+                            unrecoverable_ = true;
+                            return ret;
+                        }
+                    }
+                    for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
+                        if (!IS_NULL(value_chunk_writers[k]) &&
+                            value_chunk_writers[k]->has_current_page_data()) {
+                            if (RET_FAIL(value_chunk_writers[k]
+                                             ->seal_current_page())) {
+                                unrecoverable_ = true;
+                                return ret;
+                            }
+                        }
                     }
+                    time_cur_points = 0;
                 }
-            }
-            time_cur_points = 0;
-        }
-        const uint32_t first_seg_cap =
-            (time_cur_points > 0 && time_cur_points < page_max_points)
-                ? (page_max_points - time_cur_points)
-                : page_max_points;
 
-        std::vector<uint32_t> page_boundaries;  // row indices where a page
-                                                // should seal
-        {
-            uint32_t pos = si;
-            uint32_t seg_cap = first_seg_cap;
-            while (pos < ei) {
-                uint32_t seg_end = std::min(pos + seg_cap, ei);
-                if (seg_end < ei) {
-                    page_boundaries.push_back(seg_end);
+                DeviceWriteCtx ctx;
+                ctx.tcw = time_chunk_writer;
+                ctx.initial_page_points = time_cur_points;
+                uint32_t field_col_count = 0;
+                for (uint32_t i = 0; i < tablet.get_column_count(); ++i) {
+                    if (tablet.column_categories_[i] ==
+                        common::ColumnCategory::FIELD) {
+                        ValueChunkWriter* vcw =
+                            value_chunk_writers[field_col_count];
+                        if (!IS_NULL(vcw)) {
+                            ctx.value_tasks.push_back({vcw, i});
+                        }
+                        field_col_count++;
+                    }
                 }
-                pos = seg_end;
-                seg_cap = page_max_points;
+                device_ctxs.push_back(std::move(ctx));
+                idx_it = device_ctx_index
+                             .insert(std::make_pair(device_id,
+                                                    device_ctxs.size() - 1))
+                             .first;
             }
+
+            device_ctxs[idx_it->second].segments.push_back({si, ei});
+            start_idx = end_idx;
         }
 
-        // We control page sealing explicitly at precomputed boundaries, so
-        // auto-seal must be disabled during segmented writes — otherwise a
-        // segment of exactly page_max_points would trigger auto-seal AND
-        // our explicit seal, double-sealing (sealing an empty page → crash).
-        // Note: with auto-seal off, the memory-based threshold
-        // (page_writer_max_memory_bytes_) is not enforced within a segment.
-        // For varlen columns (STRING/TEXT/BLOB), individual pages may exceed
-        // the memory limit.  Each segment is still bounded by
-        // page_max_points rows, keeping pages within a reasonable size.
-        auto write_time_in_segments = [this, &tablet, &page_boundaries, si,
-                                       ei](TimeChunkWriter* tcw) -> int {
+        auto write_time_segments =
+            [this, &tablet, page_max_points](
+                TimeChunkWriter* tcw, const std::vector<SegmentRange>& segments,
+                uint32_t initial_page_points) -> int {
             int r = E_OK;
             tcw->set_enable_page_seal_if_full(false);
-            uint32_t seg_start = si;
-            for (uint32_t boundary : page_boundaries) {
-                if ((r = time_write_column(tcw, tablet, seg_start, boundary)) !=
-                    E_OK)
-                    return r;
-                if ((r = tcw->seal_current_page()) != E_OK) return r;
-                seg_start = boundary;
-            }
-            if (seg_start < ei) {
-                r = time_write_column(tcw, tablet, seg_start, ei);
+            uint32_t page_remaining =
+                (initial_page_points > 0 &&
+                 initial_page_points < page_max_points)
+                    ? (page_max_points - initial_page_points)
+                    : page_max_points;
+            for (const auto& segment : segments) {
+                uint32_t seg_pos = segment.si;
+                while (seg_pos < segment.ei) {
+                    uint32_t batch =
+                        std::min(page_remaining, segment.ei - seg_pos);
+                    if ((r = time_write_column_batch(
+                             tcw, tablet, seg_pos, seg_pos + batch)) != E_OK) {
+                        tcw->set_enable_page_seal_if_full(true);
+                        return r;
+                    }
+                    seg_pos += batch;
+                    page_remaining -= batch;
+                    if (page_remaining == 0) {
+                        if ((r = tcw->seal_current_page()) != E_OK) {
+                            tcw->set_enable_page_seal_if_full(true);
+                            return r;
+                        }
+                        page_remaining = page_max_points;
+                    }
+                }
             }
             tcw->set_enable_page_seal_if_full(true);
             return r;
         };
 
-        auto write_value_in_segments = [this, &tablet, &page_boundaries, si,
-                                        ei](ValueChunkWriter* vcw,
-                                            uint32_t col_idx) -> int {
+        auto write_value_segments =
+            [this, &tablet, page_max_points](
+                ValueChunkWriter* vcw, uint32_t col_idx,
+                const std::vector<SegmentRange>& segments,
+                uint32_t initial_page_points) -> int {
             int r = E_OK;
             vcw->set_enable_page_seal_if_full(false);
-            uint32_t seg_start = si;
-            for (uint32_t boundary : page_boundaries) {
-                if ((r = value_write_column(vcw, tablet, col_idx, seg_start,
-                                            boundary)) != E_OK)
-                    return r;
-                if (vcw->has_current_page_data() &&
-                    (r = vcw->seal_current_page()) != E_OK)
-                    return r;
-                seg_start = boundary;
-            }
-            if (seg_start < ei) {
-                r = value_write_column(vcw, tablet, col_idx, seg_start, ei);
+            uint32_t page_remaining =
+                (initial_page_points > 0 &&
+                 initial_page_points < page_max_points)
+                    ? (page_max_points - initial_page_points)
+                    : page_max_points;
+            for (const auto& segment : segments) {
+                uint32_t seg_pos = segment.si;
+                while (seg_pos < segment.ei) {
+                    uint32_t batch =
+                        std::min(page_remaining, segment.ei - seg_pos);
+                    if ((r = value_write_column_batch(
+                             vcw, tablet, col_idx, seg_pos, seg_pos + batch)) !=
+                        E_OK) {
+                        vcw->set_enable_page_seal_if_full(true);
+                        return r;
+                    }
+                    seg_pos += batch;
+                    page_remaining -= batch;
+                    if (page_remaining == 0) {
+                        if (vcw->has_current_page_data() &&
+                            (r = vcw->seal_current_page()) != E_OK) {
+                            vcw->set_enable_page_seal_if_full(true);
+                            return r;
+                        }
+                        page_remaining = page_max_points;
+                    }
+                }
             }
             vcw->set_enable_page_seal_if_full(true);
             return r;
         };
 
-        // All columns (time + values) write the same row segments and seal
-        // at the same boundaries — fully parallel.
 #ifdef ENABLE_THREADS
-        if (g_config_value_.parallel_write_enabled_) {
+        if (g_config_value_.parallel_write_enabled_ &&
+            common::g_thread_pool_ != nullptr) {
             std::vector<std::future<int>> futures;
-            futures.push_back(g_write_thread_pool_->submit(
-                [&write_time_in_segments, time_chunk_writer]() {
-                    return write_time_in_segments(time_chunk_writer);
-                }));
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                ValueChunkWriter* vcw = value_chunk_writers[k];
-                if (IS_NULL(vcw)) continue;
-                uint32_t col_idx = field_columns[k];
-                futures.push_back(g_write_thread_pool_->submit(
-                    [&write_value_in_segments, vcw, col_idx]() {
-                        return write_value_in_segments(vcw, col_idx);
+            for (auto& ctx : device_ctxs) {
+                futures.push_back(common::g_thread_pool_->submit(
+                    [&write_time_segments, &ctx]() {
+                        return write_time_segments(ctx.tcw, ctx.segments,
+                                                   ctx.initial_page_points);
                     }));
+                for (auto& vt : ctx.value_tasks) {
+                    futures.push_back(common::g_thread_pool_->submit(
+                        [&write_value_segments, &vt, &ctx]() {
+                            return write_value_segments(
+                                vt.vcw, vt.col_idx, ctx.segments,
+                                ctx.initial_page_points);
+                        }));
+                }
             }
             for (auto& f : futures) {
                 int r = f.get();
                 if (r != E_OK && ret == E_OK) ret = r;
             }
-            if (ret != E_OK) return ret;
+            if (ret != E_OK) {
+                // One task aborted mid-batch while others may have written
+                // all of their rows; the per-column row counts no longer
+                // line up.  Mark the writer unrecoverable so flush/close
+                // can't seal a corrupt aligned chunk group.
+                unrecoverable_ = true;
+                return ret;
+            }
         } else
 #endif
         {
-            if (RET_FAIL(write_time_in_segments(time_chunk_writer))) {
+            for (auto& ctx : device_ctxs) {
+                if (RET_FAIL(write_time_segments(ctx.tcw, ctx.segments,
+                                                 ctx.initial_page_points))) {
+                    // Time wrote partial rows before failing; value columns
+                    // still hold the prior count.  Same column-alignment
+                    // hazard as the parallel path.
+                    unrecoverable_ = true;
+                    return ret;
+                }
+                for (auto& vt : ctx.value_tasks) {
+                    if (RET_FAIL(write_value_segments(
+                            vt.vcw, vt.col_idx, ctx.segments,
+                            ctx.initial_page_points))) {
+                        unrecoverable_ = true;
+                        return ret;
+                    }
+                }
+            }
+        }
+    } else {
+        int start_idx = 0;
+        for (auto& device_id_end_index_pair : device_id_end_index_pairs) {
+            auto device_id = device_id_end_index_pair.first;
+            int end_idx = device_id_end_index_pair.second;
+            if (end_idx == 0) continue;
+
+            const uint32_t si = static_cast<uint32_t>(start_idx);
+            if (enforce_recovered_last_time_order_ && tablet.timestamps_ &&
+                end_idx > start_idx) {
+                auto schema_it = schemas_.find(device_id);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr &&
+                    tablet.timestamps_[si] <= schema_it->second->last_time_) {
+                    return E_OUT_OF_ORDER;
+                }
+            }
+            MeasurementNamesFromTablet mnames_getter(tablet);
+            SimpleVector<ChunkWriter*> chunk_writers;
+            SimpleVector<common::TSDataType> data_types;
+            if (RET_FAIL(do_check_schema(device_id, mnames_getter,
+                                         chunk_writers, data_types))) {
                 return ret;
             }
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                ValueChunkWriter* vcw = value_chunk_writers[k];
-                if (IS_NULL(vcw)) continue;
-                if (RET_FAIL(write_value_in_segments(vcw, field_columns[k]))) {
+            ASSERT(chunk_writers.size() == tablet.get_column_count());
+
+#ifdef ENABLE_THREADS
+            if (chunk_writers.size() >= 2 &&
+                g_config_value_.parallel_write_enabled_ &&
+                common::g_thread_pool_ != nullptr) {
+                const uint32_t si = start_idx;
+                const uint32_t ei = device_id_end_index_pair.second;
+                std::vector<std::future<int>> futures;
+                for (uint32_t c = 0; c < chunk_writers.size(); c++) {
+                    ChunkWriter* cw = chunk_writers[c];
+                    if (IS_NULL(cw)) continue;
+                    futures.push_back(common::g_thread_pool_->submit(
+                        [this, cw, &tablet, c, si, ei]() {
+                            return write_column_batch(cw, tablet, c, si, ei);
+                        }));
+                }
+                for (auto& f : futures) {
+                    int r = f.get();
+                    if (r != E_OK && ret == E_OK) ret = r;
+                }
+                if (ret != E_OK) {
+                    // One column aborted partway while sibling columns
+                    // may have written all of their rows.  The per-column
+                    // chunk writers now disagree on row count, so subsequent
+                    // flush/close would seal a corrupt non-aligned chunk
+                    // group.  Same hazard as the aligned parallel path —
+                    // mark the writer unrecoverable so future ops refuse.
+                    unrecoverable_ = true;
                     return ret;
                 }
+            } else
+#endif
+            {
+                for (uint32_t c = 0; c < chunk_writers.size(); c++) {
+                    ChunkWriter* chunk_writer = chunk_writers[c];
+                    if (IS_NULL(chunk_writer)) continue;
+                    if (RET_FAIL(write_column_batch(
+                            chunk_writer, tablet, c, start_idx,
+                            device_id_end_index_pair.second))) {
+                        // Sequential path: earlier columns already wrote
+                        // their batch, this column failed → divergent row
+                        // counts.  Same unrecoverable contract.
+                        if (c > 0) unrecoverable_ = true;
+                        return ret;
+                    }
+                }
             }
+            start_idx = device_id_end_index_pair.second;
         }
-        if (device_schema != nullptr && si < ei) {
-            device_schema->last_time_ =
-                std::max(device_schema->last_time_, tablet.timestamps_[ei - 1]);
+    }
+    // After all device segments wrote successfully, advance recovery's
+    // per-device last_time_ floor to the highest timestamp this tablet
+    // contributed for each device.
+    if (enforce_recovered_last_time_order_ && tablet.timestamps_) {
+        int update_start = 0;
+        for (auto& pair : device_id_end_index_pairs) {
+            int end_idx = pair.second;
+            if (end_idx == 0) continue;
+            if (end_idx > update_start) {
+                auto schema_it = schemas_.find(pair.first);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr) {
+                    schema_it->second->last_time_ =
+                        std::max(schema_it->second->last_time_,
+                                 tablet.timestamps_[end_idx - 1]);
+                }
+            }
+            update_start = end_idx;
         }
-        start_idx = end_idx;
     }
     record_count_since_last_flush_ += tablet.cur_row_size_;
     // Reset string column buffers so the tablet can be reused for the next
@@ -1395,14 +1495,13 @@ std::vector<std::pair<std::shared_ptr<IDeviceID>, int>>
 TsFileWriter::split_tablet_by_device(const Tablet& tablet) {
     std::vector<std::pair<std::shared_ptr<IDeviceID>, int>> result;
 
-    if (tablet.id_column_indexes_.empty()) {
+    if (tablet.id_column_indexes_.empty() || tablet.single_device_) {
+        // No tag columns or caller guarantees single device — skip boundary
+        // detection entirely.
         auto sentinel = std::make_shared<StringArrayDeviceID>("last_device_id");
         result.emplace_back(std::move(sentinel), 0);
-        std::vector<std::string*> id_array;
-        id_array.push_back(new std::string(tablet.insert_target_name_));
-        auto res = std::make_shared<StringArrayDeviceID>(id_array);
-        delete id_array[0];
-        result.emplace_back(std::move(res), tablet.get_cur_row_size());
+        std::shared_ptr<IDeviceID> dev_id(tablet.get_device_id(0));
+        result.emplace_back(std::move(dev_id), tablet.get_cur_row_size());
         return result;
     }
 
@@ -1428,41 +1527,49 @@ TsFileWriter::split_tablet_by_device(const Tablet& tablet) {
 int TsFileWriter::write_column(ChunkWriter* chunk_writer, const Tablet& tablet,
                                int col_idx, uint32_t start_idx,
                                uint32_t end_idx) {
-    int ret = E_OK;
-
     common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
     int64_t* timestamps = tablet.timestamps_;
     Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
     BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
     end_idx = std::min(end_idx, tablet.max_row_num_);
 
-    if (data_type == common::BOOLEAN) {
-        ret = write_typed_column(chunk_writer, timestamps, col_values.bool_data,
-                                 col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::INT32) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.int32_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::INT64) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.int64_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::FLOAT) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.float_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::DOUBLE) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.double_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::STRING) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.string_col,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else {
-        ASSERT(false);
+    // Cover every storage type (DATE->int32, TIMESTAMP->int64, TEXT/BLOB->
+    // string).  This is the null fallback for the non-aligned batch path, so a
+    // column of any type that contains a null lands here; the old if/else only
+    // handled 6 types and ASSERT(false)'d (silently no-op in NDEBUG) on
+    // DATE/TIMESTAMP/TEXT/BLOB, dropping those rows.
+    switch (data_type) {
+        case common::BOOLEAN:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.bool_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::INT32:
+        case common::DATE:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.int32_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::INT64:
+        case common::TIMESTAMP:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.int64_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::FLOAT:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.float_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::DOUBLE:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.double_data,
+                                      col_notnull_bitmap, start_idx, end_idx);
+        case common::STRING:
+        case common::TEXT:
+        case common::BLOB:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.string_col, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        default:
+            return E_NOT_SUPPORT;
     }
-    return ret;
 }
 
 int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer,
@@ -1481,124 +1588,25 @@ int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer,
     return ret;
 }
 
-int TsFileWriter::value_write_column(ValueChunkWriter* value_chunk_writer,
-                                     const Tablet& tablet, int col_idx,
+// Non-aligned numeric column: a null row contributes no point, so null rows
+// are skipped.  Covers bool/int32/int64/float/double; instantiated only from
+// write_column in this translation unit.
+template <typename T>
+int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
+                                     int64_t* timestamps, T* col_values,
+                                     BitMap& col_notnull_bitmap,
                                      uint32_t start_idx, uint32_t end_idx) {
     int ret = E_OK;
-
-    TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
-    int64_t* timestamps = tablet.timestamps_;
-    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
-    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
-    switch (data_type) {
-        case common::BOOLEAN:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (bool*)col_values.bool_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::DATE:
-        case common::INT32:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (int32_t*)col_values.int32_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::TIMESTAMP:
-        case common::INT64:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (int64_t*)col_values.int64_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::FLOAT:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (float*)col_values.float_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::DOUBLE:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (double*)col_values.double_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::STRING:
-        case common::TEXT:
-        case common::BLOB:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     col_values.string_col, col_notnull_bitmap,
-                                     start_idx, end_idx);
-            break;
-        default:
-            ret = E_NOT_SUPPORT;
+    for (uint32_t r = start_idx; r < end_idx; r++) {
+        if (LIKELY(!col_notnull_bitmap.test(r))) {
+            if (RET_FAIL(chunk_writer->write(timestamps[r], col_values[r]))) {
+                return ret;
+            }
+        }
     }
     return ret;
 }
 
-#define DO_WRITE_TYPED_COLUMN()                                               \
-    do {                                                                      \
-        int ret = E_OK;                                                       \
-        for (uint32_t r = start_idx; r < end_idx; r++) {                      \
-            if (LIKELY(!col_notnull_bitmap.test(r))) {                        \
-                if (RET_FAIL(                                                 \
-                        chunk_writer->write(timestamps[r], col_values[r]))) { \
-                    return ret;                                               \
-                }                                                             \
-            }                                                                 \
-        }                                                                     \
-        return ret;                                                           \
-    } while (false)
-
-#define DO_VALUE_WRITE_TYPED_COLUMN()                            \
-    do {                                                         \
-        int ret = E_OK;                                          \
-        for (uint32_t r = start_idx; r < end_idx; r++) {         \
-            if (LIKELY(col_notnull_bitmap.test(r))) {            \
-                if (RET_FAIL(value_chunk_writer->write(          \
-                        timestamps[r], col_values[r], true))) {  \
-                    return ret;                                  \
-                }                                                \
-            } else {                                             \
-                if (RET_FAIL(value_chunk_writer->write(          \
-                        timestamps[r], col_values[r], false))) { \
-                    return ret;                                  \
-                }                                                \
-            }                                                    \
-        }                                                        \
-        return ret;                                              \
-    } while (false)
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, bool* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, int32_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, int64_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, float* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, double* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
 int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
                                      int64_t* timestamps,
                                      Tablet::StringColumn* string_col,
@@ -1609,8 +1617,7 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
         if (LIKELY(!col_notnull_bitmap.test(r))) {
             common::String val(
                 string_col->buffer + string_col->offsets[r],
-                static_cast<uint32_t>(string_col->offsets[r + 1] -
-                                      string_col->offsets[r]));
+                string_col->offsets[r + 1] - string_col->offsets[r]);
             if (RET_FAIL(chunk_writer->write(timestamps[r], val))) {
                 return ret;
             }
@@ -1619,67 +1626,161 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
     return ret;
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, bool* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, int32_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, int64_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
+int TsFileWriter::time_write_column_batch(TimeChunkWriter* time_chunk_writer,
+                                          const Tablet& tablet,
+                                          uint32_t start_idx,
+                                          uint32_t end_idx) {
+    int64_t* timestamps = tablet.timestamps_;
+    int ret = E_OK;
+    if (IS_NULL(time_chunk_writer) || IS_NULL(timestamps)) {
+        return E_INVALID_ARG;
+    }
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+    return time_chunk_writer->write_batch(timestamps + start_idx, count);
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, float* col_values,
-                                     BitMap& col_notnull_bitmap,
+int TsFileWriter::write_column_batch(ChunkWriter* chunk_writer,
+                                     const Tablet& tablet, int col_idx,
                                      uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
+    int ret = E_OK;
+    common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
+    int64_t* timestamps = tablet.timestamps_;
+    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
+    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+
+    bool has_null = false;
+    if (col_notnull_bitmap.may_have_set_bits()) {
+        for (uint32_t r = start_idx; r < end_idx; r++) {
+            if (col_notnull_bitmap.test(r)) {
+                has_null = true;
+                break;
+            }
+        }
+    }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, double* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
+    if (!has_null) {
+        switch (data_type) {
+            case common::BOOLEAN:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.bool_data + start_idx,
+                    count);
+                break;
+            case common::INT32:
+            case common::DATE:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.int32_data + start_idx,
+                    count);
+                break;
+            case common::INT64:
+            case common::TIMESTAMP:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.int64_data + start_idx,
+                    count);
+                break;
+            case common::FLOAT:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.float_data + start_idx,
+                    count);
+                break;
+            case common::DOUBLE:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.double_data + start_idx,
+                    count);
+                break;
+            case common::STRING:
+            case common::TEXT:
+            case common::BLOB: {
+                auto* sc = col_values.string_col;
+                // sc->offsets is int32_t* (Arrow Utf8/Binary spec);
+                // write_string_batch still takes const uint32_t* through the
+                // page/encoder stack.  Offsets are non-negative by
+                // construction so the bit pattern is identical — cast at the
+                // boundary until the downstream chain is converted in a
+                // follow-up.
+                ret = chunk_writer->write_string_batch(
+                    timestamps + start_idx, sc->buffer,
+                    reinterpret_cast<const uint32_t*>(sc->offsets), start_idx,
+                    count);
+                break;
+            }
+            default:
+                ret = write_column(chunk_writer, tablet, col_idx, start_idx,
+                                   end_idx);
+                break;
+        }
+    } else {
+        ret = write_column(chunk_writer, tablet, col_idx, start_idx, end_idx);
+    }
+    return ret;
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps,
-                                     Tablet::StringColumn* string_col,
-                                     common::BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
+int TsFileWriter::value_write_column_batch(ValueChunkWriter* value_chunk_writer,
+                                           const Tablet& tablet, int col_idx,
+                                           uint32_t start_idx,
+                                           uint32_t end_idx) {
     int ret = E_OK;
-    for (uint32_t r = start_idx; r < end_idx; r++) {
-        common::String val(string_col->buffer + string_col->offsets[r],
-                           static_cast<uint32_t>(string_col->offsets[r + 1] -
-                                                 string_col->offsets[r]));
-        if (LIKELY(col_notnull_bitmap.test(r))) {
-            if (RET_FAIL(value_chunk_writer->write(timestamps[r], val, true))) {
-                return ret;
-            }
-        } else {
-            if (RET_FAIL(
-                    value_chunk_writer->write(timestamps[r], val, false))) {
-                return ret;
-            }
+    common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
+    int64_t* timestamps = tablet.timestamps_;
+    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
+    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+
+    switch (data_type) {
+        case common::BOOLEAN:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.bool_data, col_notnull_bitmap, start_idx,
+                count);
+            break;
+        case common::DATE:
+        case common::INT32:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.int32_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::TIMESTAMP:
+        case common::INT64:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.int64_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::FLOAT:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.float_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::DOUBLE:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.double_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::STRING:
+        case common::TEXT:
+        case common::BLOB: {
+            auto* sc = col_values.string_col;
+            // See above: sc->offsets is int32_t*, downstream still uint32_t*.
+            ret = value_chunk_writer->write_string_batch(
+                timestamps, sc->buffer,
+                reinterpret_cast<const uint32_t*>(sc->offsets),
+                col_notnull_bitmap, start_idx, count);
+            break;
         }
+        default:
+            ret = E_NOT_SUPPORT;
+            break;
     }
     return ret;
 }
 
 // TODO make sure ret is meaningful to SDK user
 int TsFileWriter::flush() {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     if (!start_file_done_) {
         if (RET_FAIL(io_writer_->start_file())) {
@@ -1690,9 +1791,10 @@ int TsFileWriter::flush() {
 
     /* since @schemas_ used std::map which is rbtree underlying,
              so map itself is ordered by device name. */
+
     DeviceSchemasMapIter device_iter;
     for (device_iter = schemas_.begin(); device_iter != schemas_.end();
-         device_iter++) {  // cppcheck-suppress postfixOperator
+         device_iter++) {
         if (check_chunk_group_empty(device_iter->second,
                                     device_iter->second->is_aligned_)) {
             continue;
@@ -1706,6 +1808,7 @@ int TsFileWriter::flush() {
         } else if (RET_FAIL(io_writer_->end_flush_chunk_group(is_aligned))) {
         }
     }
+
     record_count_since_last_flush_ = 0;
     return ret;
 }
@@ -1751,6 +1854,56 @@ bool TsFileWriter::check_chunk_group_empty(MeasurementSchemaGroup* chunk_group,
         writer->reset();                                                       \
     }
 
+// Write already-encoded chunk data to stream (no compression — done earlier).
+#define FLUSH_CHUNK_ENCODED(writer, io_writer, name, data_type, encoding,     \
+                            compression, num_pages)                           \
+    if (RET_FAIL(io_writer->start_flush_chunk(writer->get_chunk_data(), name, \
+                                              data_type, encoding,            \
+                                              compression, num_pages))) {     \
+    } else if (RET_FAIL(io_writer->flush_chunk(writer->get_chunk_data()))) {  \
+    } else if (RET_FAIL(io_writer->end_flush_chunk(                           \
+                   writer->get_chunk_statistic()))) {                         \
+    } else {                                                                  \
+        writer->reset();                                                      \
+    }
+
+int TsFileWriter::flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group,
+                                            bool is_aligned) {
+    int ret = E_OK;
+    MeasurementSchemaMap& map = chunk_group->measurement_schema_map_;
+
+    if (chunk_group->is_aligned_) {
+        TimeChunkWriter*& time_chunk_writer = chunk_group->time_chunk_writer_;
+        ChunkHeader chunk_header = time_chunk_writer->get_chunk_header();
+        FLUSH_CHUNK_ENCODED(
+            time_chunk_writer, io_writer_, chunk_header.measurement_name_,
+            chunk_header.data_type_, chunk_header.encoding_type_,
+            chunk_header.compression_type_, time_chunk_writer->num_of_pages())
+    }
+
+    for (MeasurementSchemaMapIter ms_iter = map.begin(); ms_iter != map.end();
+         ms_iter++) {
+        MeasurementSchema* m_schema = ms_iter->second;
+        if (!chunk_group->is_aligned_ && m_schema->chunk_writer_ != nullptr) {
+            ChunkWriter*& chunk_writer = m_schema->chunk_writer_;
+            FLUSH_CHUNK_ENCODED(
+                chunk_writer, io_writer_, m_schema->measurement_name_,
+                m_schema->data_type_, m_schema->encoding_,
+                m_schema->compression_type_, chunk_writer->num_of_pages())
+        } else if (m_schema->value_chunk_writer_ != nullptr &&
+                   m_schema->value_chunk_writer_->hasData()) {
+            ValueChunkWriter*& value_chunk_writer =
+                m_schema->value_chunk_writer_;
+            FLUSH_CHUNK_ENCODED(
+                value_chunk_writer, io_writer_, m_schema->measurement_name_,
+                m_schema->data_type_, m_schema->encoding_,
+                m_schema->compression_type_, value_chunk_writer->num_of_pages())
+        }
+    }
+
+    return ret;
+}
+
 int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
                                     bool is_aligned) {
     int ret = E_OK;
@@ -1774,7 +1927,8 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
                         m_schema->data_type_, m_schema->encoding_,
                         m_schema->compression_type_,
                         chunk_writer->num_of_pages())
-        } else if (m_schema->value_chunk_writer_ != nullptr) {
+        } else if (m_schema->value_chunk_writer_ != nullptr &&
+                   m_schema->value_chunk_writer_->hasData()) {
             ValueChunkWriter*& value_chunk_writer =
                 m_schema->value_chunk_writer_;
             FLUSH_CHUNK(value_chunk_writer, io_writer_,
@@ -1787,6 +1941,9 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
     return ret;
 }
 
-int TsFileWriter::close() { return io_writer_->end_file(); }
+int TsFileWriter::close() {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
+    return io_writer_->end_file();
+}
 
 }  // end namespace storage
diff --git a/cpp/src/writer/tsfile_writer.h b/cpp/src/writer/tsfile_writer.h
index a2c8f2842..e0b102c97 100644
--- a/cpp/src/writer/tsfile_writer.h
+++ b/cpp/src/writer/tsfile_writer.h
@@ -33,7 +33,6 @@
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
-#include "utils/util_define.h"  // mode_t and other platform-compat shims
 
 namespace storage {
 class WriteFile;
@@ -46,9 +45,12 @@ namespace storage {
 
 extern int libtsfile_init();
 extern void libtsfile_destroy();
-extern void set_page_max_point_count(uint32_t page_max_ponint_count);
-extern void set_max_degree_of_index_node(uint32_t max_degree_of_index_node);
-extern void set_strict_page_size(bool strict_page_size);
+// Returns common::E_INVALID_ARG when count would freeze the chunk writers
+// (i.e. less than 1); leaves the field untouched on rejection.
+extern int set_page_max_point_count(uint32_t page_max_ponint_count);
+// Returns common::E_INVALID_ARG when degree < 2 (which collapses the index
+// tree); leaves the field untouched on rejection.
+extern int set_max_degree_of_index_node(uint32_t max_degree_of_index_node);
 
 class TsFileWriter {
    public:
@@ -98,6 +100,7 @@ class TsFileWriter {
     std::shared_ptr<TableSchema> get_table_schema(
         const std::string& table_name) const;
     int64_t calculate_mem_size_for_all_group();
+    int64_t calculate_meta_mem_size() const;
     int check_memory_size_and_may_flush_chunks();
     /*
      * Flush buffer to disk file, but do not writer file index part.
@@ -125,25 +128,15 @@ class TsFileWriter {
         int32_t time_pages_before,
         const std::vector<int32_t>& value_pages_before);
     int flush_chunk_group(MeasurementSchemaGroup* chunk_group, bool is_aligned);
+    int flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group,
+                                  bool is_aligned);
 
+    // Numeric columns (bool/int32/int64/float/double) share one body:
+    // non-aligned ChunkWriter skips null rows entirely.  Defined in the .cc;
+    // every instantiation lives in that translation unit.
+    template <typename T>
     int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, bool* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, int32_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, int64_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, float* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, double* col_values,
+                           int64_t* timestamps, T* col_values,
                            common::BitMap& col_notnull_bitmap,
                            uint32_t start_idx, uint32_t end_idx);
     int write_typed_column(ChunkWriter* chunk_writer, int64_t* timestamps,
@@ -196,41 +189,33 @@ class TsFileWriter {
     int64_t record_count_for_next_mem_check_;
     bool write_file_created_;
     bool io_writer_owned_;  // false when init(RestorableTsFileIOWriter*)
-    bool enforce_recovered_last_time_order_;
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, bool* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, double* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps,
-                           Tablet::StringColumn* string_col,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, float* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, int32_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, int64_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int value_write_column(ValueChunkWriter* value_chunk_writer,
+    // Only the recovery init path sets this true: subsequent writes must
+    // refuse timestamps <= the recovered per-device last_time_ so the chunk
+    // ordering invariants preserved by RestorableTsFileIOWriter are not
+    // broken by appending older data.
+    bool enforce_recovered_last_time_order_ = false;
+    bool table_aligned_ = true;
+    // Set once a partial-write failure leaves the per-column chunk writers
+    // out of sync (e.g. parallel aligned tablet write where one task fails
+    // mid-way while others succeed).  Subsequent write/flush/close calls
+    // refuse to operate so that the on-disk file isn't sealed with row
+    // counts that disagree between time and value columns.
+    bool unrecoverable_ = false;
+    // Test-only accessor for the unrecoverable contract: real triggers
+    // (parallel task failure, out-of-order timestamps across multiple chunk
+    // writers) are hard to drive deterministically, but the contract —
+    // flush/close refuse — can be unit-tested directly.
+    friend class TsFileWriterUnrecoverableTest;
+
+    int write_column_batch(storage::ChunkWriter* chunk_writer,
                            const Tablet& tablet, int col_idx,
                            uint32_t start_idx, uint32_t end_idx);
+    int time_write_column_batch(TimeChunkWriter* time_chunk_writer,
+                                const Tablet& tablet, uint32_t start_idx,
+                                uint32_t end_idx);
+    int value_write_column_batch(ValueChunkWriter* value_chunk_writer,
+                                 const Tablet& tablet, int col_idx,
+                                 uint32_t start_idx, uint32_t end_idx);
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/value_chunk_writer.cc b/cpp/src/writer/value_chunk_writer.cc
index a59cf8d3f..182b0762b 100644
--- a/cpp/src/writer/value_chunk_writer.cc
+++ b/cpp/src/writer/value_chunk_writer.cc
@@ -110,7 +110,7 @@ int ValueChunkWriter::seal_cur_page(bool end_chunk) {
                 /*stat*/ false, /*data*/ false);
             if (IS_SUCC(ret)) {
                 save_first_page_data(value_page_writer_);
-                value_page_writer_.clear_page_data();
+                // value_page_writer_.destroy_page_data();
                 value_page_writer_.reset();
             }
         }
@@ -145,6 +145,11 @@ void ValueChunkWriter::save_first_page_data(
     ValuePageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // Take ownership of the heap buffers: get_cur_page_data() returned a
+    // shallow copy, so without this we'd alias compressed_buf_ /
+    // uncompressed_buf_ between cur_page_data_ and first_page_data_ and
+    // double-free at destroy() time.
+    first_page_writer.release_cur_page_data();
 }
 
 int ValueChunkWriter::write_first_page_data(ByteStream& pages_data,
@@ -161,8 +166,7 @@ int ValueChunkWriter::write_first_page_data(ByteStream& pages_data,
 
 int ValueChunkWriter::end_encode_chunk() {
     int ret = E_OK;
-    if (value_page_writer_.get_point_numer() > 0 ||
-        (has_current_page_data() && num_of_pages_ == 0)) {
+    if (has_current_page_data()) {
         ret = seal_cur_page(/*end_chunk*/ true);
         if (E_OK == ret) {
             chunk_header_.data_size_ = chunk_data_.total_size();
@@ -175,9 +179,6 @@ int ValueChunkWriter::end_encode_chunk() {
             chunk_header_.data_size_ = chunk_data_.total_size();
             chunk_header_.num_of_pages_ = num_of_pages_;
         }
-    } else if (num_of_pages_ > 0) {
-        chunk_header_.data_size_ = chunk_data_.total_size();
-        chunk_header_.num_of_pages_ = num_of_pages_;
     }
 #if DEBUG_SE
     std::cout << "end_encode_chunk: num_of_pages_=" << num_of_pages_
diff --git a/cpp/src/writer/value_chunk_writer.h b/cpp/src/writer/value_chunk_writer.h
index 64eb4cc50..cd7c75a54 100644
--- a/cpp/src/writer/value_chunk_writer.h
+++ b/cpp/src/writer/value_chunk_writer.h
@@ -53,8 +53,7 @@ class ValueChunkWriter {
           first_page_data_(),
           first_page_statistic_(nullptr),
           chunk_header_(),
-          num_of_pages_(0),
-          enable_page_seal_if_full_(true) {}
+          num_of_pages_(0) {}
     ~ValueChunkWriter() { destroy(); }
     int init(const common::ColumnSchema& col_schema);
     int init(const std::string& measurement_name, common::TSDataType data_type,
@@ -110,6 +109,71 @@ class ValueChunkWriter {
         VCW_DO_WRITE_FOR_TYPE(isnull);
     }
 
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    const common::BitMap& col_notnull_bitmap,
+                    uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = value_page_writer_.get_point_numer();
+            // get_point_numer() now returns size_ (rows including nulls and
+            // the just-written batch), so it can momentarily exceed page_cap;
+            // seal whenever we are at or past the cap to avoid uint32 wrap.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(value_page_writer_.write_batch(
+                    timestamps, values, col_notnull_bitmap, start_idx + offset,
+                    batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets,
+                           const common::BitMap& col_notnull_bitmap,
+                           uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = value_page_writer_.get_point_numer();
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(value_page_writer_.write_string_batch(
+                    timestamps, buffer, offsets, col_notnull_bitmap,
+                    start_idx + offset, batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
     int end_encode_chunk();
     common::ByteStream& get_chunk_data() { return chunk_data_; }
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
@@ -119,25 +183,21 @@ class ValueChunkWriter {
 
     bool hasData();
 
-    /** True if the current (unsealed) page has at least one write (including
-     * nulls). */
+    /** True if the current (unsealed) page has at least one write
+     *  (including NULLs). */
     bool has_current_page_data() const {
-        return value_page_writer_.get_total_write_count() > 0;
+        return value_page_writer_.get_point_numer() > 0;
     }
 
     FORCE_INLINE uint32_t get_point_numer() const {
         return value_page_writer_.get_point_numer();
     }
 
-    /**
-     * Force seal the current page (for aligned table model: when time page
-     * seals due to memory/point threshold, all value pages must seal together).
-     * @return E_OK on success.
-     */
+    /** Force seal the current page. */
     int seal_current_page() { return seal_cur_page(false); }
 
-    // For aligned writer: allow disabling the automatic page-size/point-number
-    // check so the caller can seal pages at chosen boundaries.
+    // Allow disabling the automatic page-size/point-number check so the
+    // caller can seal pages at chosen boundaries.
     FORCE_INLINE void set_enable_page_seal_if_full(bool enable) {
         enable_page_seal_if_full_ = enable;
     }
@@ -183,8 +243,7 @@ class ValueChunkWriter {
 
     ChunkHeader chunk_header_;
     int32_t num_of_pages_;
-    // If false, write() won't auto-seal when the current page becomes full.
-    bool enable_page_seal_if_full_;
+    bool enable_page_seal_if_full_ = true;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/value_page_writer.cc b/cpp/src/writer/value_page_writer.cc
index a7bcd89c4..c538ea2fa 100644
--- a/cpp/src/writer/value_page_writer.cc
+++ b/cpp/src/writer/value_page_writer.cc
@@ -59,6 +59,10 @@ int ValuePageData::init(ByteStream& col_notnull_bitmap_bs, ByteStream& value_bs,
                                           uncompressed_buf_ + sizeof(size) +
                                               col_notnull_bitmap_buf_size_,
                                           value_buf_size_))) {
+        // value_buf_size_ == 0 is a fully-null value page: only the bitmap is
+        // written, value_out_stream_ is empty. Skip the copy — feeding an
+        // empty stream to copy_bs_to_buf trips ASSERT(b.len_ > 0) in the
+        // buffer iterator. (Restores the #734 aligned-page-seal fix.)
     } else {
         // TODO
         // NOTE: different compressor may have different compress API
@@ -119,6 +123,8 @@ void ValuePageWriter::reset() {
     }
     col_notnull_bitmap_out_stream_.reset();
     value_out_stream_.reset();
+    col_notnull_bitmap_.clear();
+    size_ = 0;
 }
 
 void ValuePageWriter::destroy() {
diff --git a/cpp/src/writer/value_page_writer.h b/cpp/src/writer/value_page_writer.h
index 97f8a5f0d..92c39b9b2 100644
--- a/cpp/src/writer/value_page_writer.h
+++ b/cpp/src/writer/value_page_writer.h
@@ -59,19 +59,6 @@ struct ValuePageData {
             compressor_->after_compress(compressed_buf_);
             compressed_buf_ = nullptr;
         }
-        compressor_ = nullptr;
-    }
-
-    /** Clear pointers without freeing (transfer ownership to another holder).
-     */
-    void clear() {
-        col_notnull_bitmap_buf_size_ = 0;
-        value_buf_size_ = 0;
-        uncompressed_size_ = 0;
-        compressed_size_ = 0;
-        uncompressed_buf_ = nullptr;
-        compressed_buf_ = nullptr;
-        compressor_ = nullptr;
     }
 };
 
@@ -163,11 +150,170 @@ class ValuePageWriter {
         VPW_DO_WRITE_FOR_TYPE(isnull);
     }
 
-    FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
-    FORCE_INLINE uint32_t get_total_write_count() const { return size_; }
+    // Batch write for aligned/table model.
+    // In the tablet bitmap: bit=1 means null, bit=0 means not null.
+    // In VPW_DO_WRITE_FOR_TYPE: ISNULL=true skips encoding.
+    // So: tablet bitmap.test(r)=true -> isnull=true (null value)
+    //     tablet bitmap.test(r)=false -> isnull=false (valid value)
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    const common::BitMap& col_notnull_bitmap,
+                    uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+
+        // Count the not-null rows but defer mutating size_ /
+        // col_notnull_bitmap_ until the value encode finishes successfully.
+        // Previously the bitmap and size_ were bumped first, so a half-failed
+        // encode_batch left the page claiming `count` rows had been written
+        // when only a prefix made it into value_out_stream_ — a subsequent
+        // re-encode would interleave with the stale stream and produce a
+        // misaligned page on disk.
+        uint32_t valid_count = 0;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            // bit=1 in tablet bitmap means null; bit=0 means not null
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                valid_count++;
+            }
+        }
+
+        if (valid_count == 0) {
+            // Still need to advance size_ so trailing null rows are tracked.
+            for (uint32_t i = 0; i < count; i++) {
+                if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                    col_notnull_bitmap_.push_back(0);
+                }
+                size_++;
+            }
+            return ret;
+        }
+
+        // If all values are valid, we can encode the batch directly
+        if (valid_count == count) {
+            if (RET_FAIL(value_encoder_->encode_batch(values + start_idx, count,
+                                                      value_out_stream_))) {
+                // Don't bump size_/bitmap on encode failure.
+                return ret;
+            }
+            statistic_->update_batch(timestamps + start_idx, values + start_idx,
+                                     count);
+        } else {
+            // Encode only non-null values one by one
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t row = start_idx + i;
+                if (!const_cast<common::BitMap&>(col_notnull_bitmap)
+                         .test(row)) {
+                    if (RET_FAIL(value_encoder_->encode(values[row],
+                                                        value_out_stream_))) {
+                        return ret;
+                    }
+                    statistic_->update(timestamps[row], values[row]);
+                }
+            }
+        }
+
+        // Commit size_ + page bitmap now that all encoding succeeded.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                col_notnull_bitmap_.push_back(0);
+            }
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8));
+            }
+            size_++;
+        }
+        return ret;
+    }
+
+    // Batch write strings from Arrow-style offset+buffer layout with null
+    // bitmap.  See write_batch above for the encode-before-commit rationale.
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets,
+                           const common::BitMap& col_notnull_bitmap,
+                           uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+
+        // Count valid rows up-front without mutating size_ / page bitmap.
+        uint32_t valid_count = 0;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                valid_count++;
+            }
+        }
+
+        if (valid_count == 0) {
+            // Advance size_ so the trailing null rows still count.
+            for (uint32_t i = 0; i < count; i++) {
+                if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                    col_notnull_bitmap_.push_back(0);
+                }
+                size_++;
+            }
+            return ret;
+        }
+
+        // Phase 2: encode non-null strings (no page-state mutation yet).
+        if (valid_count == count) {
+            // All valid — batch encode directly
+            if (RET_FAIL(value_encoder_->encode_string_batch(
+                    buffer, offsets, start_idx, count, value_out_stream_))) {
+                return ret;
+            }
+        } else {
+            // Mixed — encode only non-null strings one by one
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t row = start_idx + i;
+                if (!const_cast<common::BitMap&>(col_notnull_bitmap)
+                         .test(row)) {
+                    uint32_t len = offsets[row + 1] - offsets[row];
+                    common::String val(buffer + offsets[row], len);
+                    if (RET_FAIL(
+                            value_encoder_->encode(val, value_out_stream_))) {
+                        return ret;
+                    }
+                }
+            }
+        }
+
+        // Phase 3: update statistics for non-null rows.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                uint32_t len = offsets[row + 1] - offsets[row];
+                common::String val(buffer + offsets[row], len);
+                statistic_->update(timestamps[row], val);
+            }
+        }
+
+        // Phase 4: commit page-level state (bitmap + size_) only after the
+        // encoder calls all succeeded.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                col_notnull_bitmap_.push_back(0);
+            }
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8));
+            }
+            size_++;
+        }
+        return ret;
+    }
+
+    // Rows in the current page including NULLs (NULLs advance size_ but not
+    // statistic_->count_).  This is the count the page-seal logic uses so
+    // value-column page boundaries stay aligned with the time column.
+    FORCE_INLINE uint32_t get_point_numer() const { return size_; }
     FORCE_INLINE uint32_t get_col_notnull_bitmap_out_stream_size() const {
         return col_notnull_bitmap_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
+    // Memory-pressure accounting uses estimate_max_mem_size() below, which
+    // counts the real 64 KiB-page footprint.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return col_notnull_bitmap_out_stream_.total_size() +
                value_out_stream_.total_size();
@@ -177,12 +323,16 @@ class ValuePageWriter {
      * outputStream and value outputStream, because size outputStream is never
      * used until flushing.
      *
+     * Reports the *allocated* stream footprint — see PageWriter::
+     * estimate_max_mem_size for rationale.
+     *
      * @return allocated size in time, value and outputStream
      */
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
         return sizeof(int32_t) + 1 +
-               col_notnull_bitmap_out_stream_.total_size() +
-               value_out_stream_.total_size() +
+               static_cast<uint32_t>(
+                   col_notnull_bitmap_out_stream_.allocated_bytes() +
+                   value_out_stream_.allocated_bytes()) +
                value_encoder_->get_max_byte_size();
     }
     int write_to_chunk(common::ByteStream& pages_data, bool write_header,
@@ -195,9 +345,16 @@ class ValuePageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     ValuePageData get_cur_page_data() { return cur_page_data_; }
+    // Transfer ownership of cur_page_data_'s heap buffers (uncompressed_buf_
+    // and compressed_buf_) out of this writer. Callers use this together with
+    // get_cur_page_data() to keep a long-lived copy of the data (e.g. as the
+    // first-page snapshot) without leaving an alias here that would cause a
+    // double free on destroy.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
-    /** Clear cur_page_data_ without freeing (after ownership transferred). */
-    void clear_page_data() { cur_page_data_.clear(); }
 
    private:
     FORCE_INLINE int prepare_end_page() {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 513cbd5ca..066e5accb 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -169,7 +169,7 @@ if (ENABLE_LZOKAY)
 endif()
 
 if (ENABLE_ZLIB)
-    include_directories(${CMAKE_SOURCE_DIR}/third_party/zlib-1.2.13)
+    include_directories(${THIRD_PARTY_INCLUDE}/zlib-1.3.1)
 endif()
 
 if (ENABLE_ANTLR4)
@@ -186,6 +186,7 @@ file(GLOB_RECURSE TEST_SRCS
         "reader/*_test.cc"
         "writer/*_test.cc"
         "cwrapper/*_test.cc"
+        "compress/*_test.cc"
 )
 
 # Parser tests depend on the ANTLR4 runtime; only build them when it is enabled.
diff --git a/cpp/test/common/allocator/byte_stream_test.cc b/cpp/test/common/allocator/byte_stream_test.cc
index b211803c3..3f57cbf84 100644
--- a/cpp/test/common/allocator/byte_stream_test.cc
+++ b/cpp/test/common/allocator/byte_stream_test.cc
@@ -87,7 +87,6 @@ TEST_F(ByteStreamTest, WriteReadLargeQuantities) {
         write_to_stream(&data, 1);
     }
 
-    // 1 MiB buffer: keep it off the stack (MSVC's default stack is only 1 MiB).
     static uint8_t read_buffer[1024 * 1024];
     for (int i = 0; i < 1024 * 1024; i++) {
         uint32_t read_len = 0;
@@ -186,6 +185,42 @@ TEST_F(ByteStreamTest, ReadMoreThanAvailableTest) {
     ASSERT_EQ(read_len, data_size);
 }
 
+// Regression: the ctor used to take page_size verbatim, but hot read/write
+// paths use `& (page_size-1)` as a bitmask.  A non-power-of-2 page_size
+// would cause page-crossing logic to misfire, corrupting written data.
+// Constructing with 1000 should still round-trip cleanly across many pages.
+// Regression: round_up_pow2 used `while (ps < n) ps <<= 1`, which overflows
+// to 0 once ps passes 2^31 and never matches, looping forever.  Verify the
+// clamped helper returns the largest representable power of two instead.
+TEST(ByteStreamCtorTest, RoundUpPow2ClampsHugeInput) {
+    EXPECT_EQ(round_up_pow2(0u), 1u);
+    EXPECT_EQ(round_up_pow2(1u), 1u);
+    EXPECT_EQ(round_up_pow2(1000u), 1024u);
+    EXPECT_EQ(round_up_pow2(1024u), 1024u);
+    EXPECT_EQ(round_up_pow2(0x80000000u), 0x80000000u);
+    EXPECT_EQ(round_up_pow2(0x80000001u), 0x80000000u);
+    EXPECT_EQ(round_up_pow2(0xFFFFFFFFu), 0x80000000u);
+}
+
+TEST(ByteStreamCtorTest, NonPowerOfTwoPageSizeRoundTrip) {
+    ByteStream bs(1000, MOD_DEFAULT, false);
+    // Span ~5 pages: 1024 * 5 = 5120 bytes.
+    const uint32_t N = 5120;
+    std::vector<uint8_t> data(N);
+    for (uint32_t i = 0; i < N; i++) {
+        data[i] = static_cast<uint8_t>((i * 31 + 7) & 0xff);
+    }
+    ASSERT_EQ(bs.write_buf(data.data(), N), common::E_OK);
+
+    std::vector<uint8_t> out(N, 0);
+    uint32_t read_len = 0;
+    ASSERT_EQ(bs.read_buf(out.data(), N, read_len), common::E_OK);
+    ASSERT_EQ(read_len, N);
+    for (uint32_t i = 0; i < N; i++) {
+        ASSERT_EQ(out[i], data[i]) << "mismatch at idx " << i;
+    }
+}
+
 TEST_F(ByteStreamTest, WrapAndClearTest) {
     const char externalBuffer[] = "Hello, World!";
     const int32_t bufferSize = sizeof(externalBuffer);
@@ -316,4 +351,70 @@ TEST_F(SerializationUtilTest, WriteReadIntLEPaddedBitWidthBoundaryValue) {
     }
 }
 
-}  // namespace common
\ No newline at end of file
+// Regression: total_size_ was widened to uint64_t but the read-cursor APIs
+// stayed uint32_t.  A stream that legitimately reaches >4 GiB would have
+// remaining_size() / read_pos() / set_read_pos() truncating to the low 32
+// bits and silently mis-positioning later reads.  Lock the widened type at
+// compile time so a partial revert can't reintroduce truncation, and
+// round-trip a moderate value via the API to catch arithmetic mistakes.
+TEST(ByteStreamWidthTest, ReadCursorApisAre64Bit) {
+    ByteStream s(64, common::MOD_DEFAULT);
+    static_assert(sizeof(decltype(s.read_pos())) >= sizeof(uint64_t),
+                  "ByteStream::read_pos() must return a 64-bit type");
+    static_assert(sizeof(decltype(s.remaining_size())) >= sizeof(uint64_t),
+                  "ByteStream::remaining_size() must return a 64-bit type");
+    static_assert(sizeof(decltype(s.get_mark_len())) >= sizeof(uint64_t),
+                  "ByteStream::get_mark_len() must return a 64-bit type");
+
+    // Round-trip a position via set_read_pos / read_pos on a small wrapped
+    // buffer.  Combined with the static_asserts above this guards the path
+    // arithmetic: a partial revert that kept the signature 64-bit but
+    // truncated read_pos_ to uint32_t internally would fail set_read_pos →
+    // read_pos on values near a 32-bit boundary.
+    constexpr int32_t kLen = 256;
+    std::vector<char> backing(kLen, 0);
+    ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from(backing.data(), kLen);
+    wrapped.set_read_pos(static_cast<uint64_t>(kLen - 7));
+    EXPECT_EQ(wrapped.read_pos(), static_cast<uint64_t>(kLen - 7));
+    EXPECT_EQ(wrapped.remaining_size(), 7u);
+}
+
+// Regression for the 64 KiB page memory-pressure account: ByteStream pages
+// are allocated up to OUT_STREAM_PAGE_SIZE bytes even when only a handful of
+// bytes have been written, so a chunk-group with many sparse measurements
+// can pin tens of megabytes that total_size() can't see.  allocated_bytes()
+// must reflect the real allocated footprint.
+TEST(ByteStreamAllocatedBytesTest, ReportsPageAllocationsNotLogicalSize) {
+    constexpr uint32_t kPageSize = 4096;
+    ByteStream s(kPageSize, common::MOD_DEFAULT);
+    EXPECT_EQ(s.allocated_bytes(), 0u);
+
+    // First write triggers one page allocation; logical size is 4 bytes but
+    // the real footprint should be the rounded page size.
+    uint8_t payload[4] = {1, 2, 3, 4};
+    ASSERT_EQ(s.write_buf(payload, 4), common::E_OK);
+    EXPECT_EQ(s.total_size(), 4u);
+    EXPECT_GE(s.allocated_bytes(), kPageSize);
+    EXPECT_EQ(s.allocated_bytes() % kPageSize, 0u);
+}
+
+// Regression for finding 21 (MSVC reinterpret_cast<atomic<T>*> UB): the
+// OptionalAtomic storage is now a real std::atomic<T>, so atomic ops never
+// observe a non-atomic backing object.  Lock the storage type at compile
+// time so a future refactor can't reintroduce the bare T fallback.
+TEST(OptionalAtomicStorageTest, BackingStorageIsRealAtomic) {
+    OptionalAtomic<uint64_t> oa(0, /*enable_atomic=*/true);
+    static_assert(!std::is_copy_constructible<OptionalAtomic<uint64_t>>::value,
+                  "OptionalAtomic must not be copyable — the std::atomic<T> "
+                  "storage forces explicit load/store");
+    EXPECT_EQ(oa.load(), 0u);
+    oa.store(42);
+    EXPECT_EQ(oa.load(), 42u);
+    EXPECT_EQ(oa.atomic_aaf(8), 50u);
+    EXPECT_EQ(oa.load(), 50u);
+    EXPECT_EQ(oa.atomic_faa(1), 50u);
+    EXPECT_EQ(oa.load(), 51u);
+}
+
+}  // namespace common
diff --git a/cpp/test/common/tablet_test.cc b/cpp/test/common/tablet_test.cc
index 71863f0c7..11dfa485f 100644
--- a/cpp/test/common/tablet_test.cc
+++ b/cpp/test/common/tablet_test.cc
@@ -46,6 +46,144 @@ TEST(TabletTest, BasicFunctionality) {
     EXPECT_EQ(tablet.add_value(1, 1, true), common::E_OK);
 }
 
+// Regression: reset() must restore each column's bitmap to all-null. If the
+// previous batch left some cells with non-null bits cleared and the next batch
+// does not re-fill those cells, get_value() must report them as null so the
+// writer does not emit stale leftover values.
+TEST(TabletTest, ResetClearsBitmap) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    schema_vec.push_back(MeasurementSchema(
+        "m_double", common::TSDataType::DOUBLE, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    // First batch fills row 5 in both columns.
+    ASSERT_EQ(tablet.add_value(5u, 0u, static_cast<int32_t>(42)), common::E_OK);
+    ASSERT_EQ(tablet.add_value(5u, 1u, 3.14), common::E_OK);
+
+    common::TSDataType ty;
+    EXPECT_NE(tablet.get_value(5, 0u, ty), nullptr);
+    EXPECT_NE(tablet.get_value(5, 1u, ty), nullptr);
+
+    // Reuse the tablet: reset and write a fresh, smaller batch that does not
+    // touch row 5 at all. Row 5 must come back as null, not as the stale 42.
+    tablet.reset();
+    ASSERT_EQ(tablet.add_value(0u, 0u, static_cast<int32_t>(7)), common::E_OK);
+    EXPECT_NE(tablet.get_value(0, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(5, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(5, 1u, ty), nullptr);
+}
+
+// Regression: set_column_values() with a non-null bitmap must update
+// has_set_bits_, otherwise downstream may_have_set_bits() shortcuts treat the
+// column as having no nulls and the writer emits stale/garbage values for the
+// rows the bitmap was meant to mark null.
+TEST(TabletTest, SetColumnValuesBitmapPreservesNullFlag) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    int32_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+
+    // Step 1: write all 8 rows with no nulls -> clear_all() inside the tablet
+    // sets has_set_bits_=false, matching the state a real workload leaves
+    // behind for a fully-populated column.
+    ASSERT_EQ(tablet.set_column_values(0u, buf, /*bitmap=*/nullptr, 8u),
+              common::E_OK);
+
+    // Step 2: rewrite with a bitmap that marks rows 0 and 7 as NULL.  Tablet's
+    // BitMap layout is LSB-first within each byte (row i -> bit 1<<(i%8)).
+    uint8_t external_bitmap[] = {0x81};  // bit 0 (row 0) + bit 7 (row 7) set
+    ASSERT_EQ(tablet.set_column_values(0u, buf, external_bitmap, 8u),
+              common::E_OK);
+
+    common::TSDataType ty;
+    EXPECT_EQ(tablet.get_value(0, 0u, ty), nullptr);
+    EXPECT_NE(tablet.get_value(1, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(7, 0u, ty), nullptr);
+}
+
+// Regression: set_column_string_values / set_column_string_repeated used to
+// reinterpret value_matrix_[c].string_col without checking the schema type.
+// Calling them on a numeric column would corrupt that column's numeric
+// buffer.  Verify both reject non-string columns with E_TYPE_NOT_MATCH.
+TEST(TabletTest, StringApisRejectNonStringColumn) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    const char data[] = "hello";
+    int32_t offsets[2] = {0, 5};
+    EXPECT_EQ(tablet.set_column_string_values(0u, offsets, data, nullptr, 1u),
+              common::E_TYPE_NOT_MATCH);
+    EXPECT_EQ(tablet.set_column_string_repeated(0u, "x", 1u, 4u),
+              common::E_TYPE_NOT_MATCH);
+}
+
+// Regression: str_len * count used to be computed in uint32_t and would wrap
+// silently, leaving the loop to write past the truncated allocation.
+// 65536 * 65537 = 4295032832 → wraps to 65536 in uint32_t.
+TEST(TabletTest, StringRepeatedTotalBytesOverflowRejected) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  100000u);
+    std::string big_str(65536, 'a');
+    EXPECT_EQ(tablet.set_column_string_repeated(0u, big_str.c_str(),
+                                                /*str_len=*/65536u,
+                                                /*count=*/65537u),
+              common::E_OVERFLOW);
+}
+
+// Regression: set_column_string_values only checked offsets[count] before;
+// non-monotonic / negative / non-zero-start offsets would underflow the
+// downstream `offsets[i+1] - offsets[i]` length calc and trigger wild
+// memcpy.  Verify each malformed input is rejected with E_INVALID_ARG.
+TEST(TabletTest, StringValuesRejectsMalformedOffsets) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+    const char data[] = "abcdefghij";
+
+    // Non-zero start offset.
+    int32_t off_bad_start[3] = {1, 5, 10};
+    EXPECT_EQ(
+        tablet.set_column_string_values(0u, off_bad_start, data, nullptr, 2u),
+        common::E_INVALID_ARG);
+
+    // Non-monotonic: {0, 10, 5}.
+    int32_t off_non_mono[3] = {0, 10, 5};
+    EXPECT_EQ(
+        tablet.set_column_string_values(0u, off_non_mono, data, nullptr, 2u),
+        common::E_INVALID_ARG);
+
+    // Negative offset somewhere in the middle.
+    int32_t off_neg[3] = {0, -1, 5};
+    EXPECT_EQ(tablet.set_column_string_values(0u, off_neg, data, nullptr, 2u),
+              common::E_INVALID_ARG);
+
+    // Sanity: well-formed offsets succeed.
+    int32_t off_ok[3] = {0, 3, 7};
+    EXPECT_EQ(tablet.set_column_string_values(0u, off_ok, data, nullptr, 2u),
+              common::E_OK);
+}
+
 TEST(TabletTest, LargeQuantities) {
     std::string device_name = "test_device";
     std::vector<MeasurementSchema> schema_vec;
diff --git a/cpp/test/common/thread_pool_test.cc b/cpp/test/common/thread_pool_test.cc
new file mode 100644
index 000000000..1fe7465cf
--- /dev/null
+++ b/cpp/test/common/thread_pool_test.cc
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifdef ENABLE_THREADS
+
+#include "common/thread_pool.h"
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <chrono>
+#include <future>
+#include <thread>
+
+// Regression: a zero-sized ThreadPool used to silently accept submit() but
+// block wait_all() forever (no worker thread, so active_ never reaches 0).
+// init_common() clamps thread_count_ to >= 1 before building the global pool,
+// but the ctor normalizes zero to a single worker as a defensive backstop so
+// any direct ThreadPool(0) still makes progress instead of hanging.
+TEST(ThreadPoolTest, ZeroThreadPoolStillExecutesAndDrains) {
+    common::ThreadPool pool(0);
+    EXPECT_GE(pool.num_threads(), static_cast<size_t>(1));
+
+    std::atomic<int> ran{0};
+    pool.submit([&ran]() { ran.fetch_add(1); });
+    auto fut = pool.submit([]() { return 42; });
+
+    auto wait_with_timeout = [&pool]() {
+        // wait_all has no timeout; run it in a helper thread we can join().
+        std::promise<void> done;
+        auto fut = done.get_future();
+        std::thread t([&pool, &done]() {
+            pool.wait_all();
+            done.set_value();
+        });
+        auto status = fut.wait_for(std::chrono::seconds(2));
+        if (status != std::future_status::ready) {
+            // Detach so a hung pool doesn't terminate the test process.
+            t.detach();
+            return false;
+        }
+        t.join();
+        return true;
+    };
+
+    ASSERT_TRUE(wait_with_timeout()) << "wait_all hung — zero-thread pool";
+    EXPECT_EQ(ran.load(), 1);
+    EXPECT_EQ(fut.get(), 42);
+}
+
+#endif  // ENABLE_THREADS
diff --git a/cpp/test/common/tsfile_common_test.cc b/cpp/test/common/tsfile_common_test.cc
index 01e193f79..c451a8136 100644
--- a/cpp/test/common/tsfile_common_test.cc
+++ b/cpp/test/common/tsfile_common_test.cc
@@ -21,6 +21,9 @@
 #include <common/schema.h>
 #include <gtest/gtest.h>
 
+#include "common/global.h"
+#include "compress/compressor_factory.h"
+
 namespace storage {
 TEST(PageHeaderTest, DefaultConstructor) {
     PageHeader header;
@@ -471,4 +474,26 @@ TEST_F(TsFileMetaTest, SerializeDeserialize) {
     ASSERT_EQ(*new_meta.tsfile_properties_["key"], std::string("value"));
     ASSERT_EQ(new_meta.tsfile_properties_["null_key"], nullptr);
 }
+
+// Regression: the default-compression configuration must name a compressor
+// that the build actually provides; otherwise CompressorFactory returns
+// nullptr at write time. init_config_value() previously gated SNAPPY on
+// ENABLE_LZ4, which broke --disable-snappy --enable-lz4 builds.
+TEST(DefaultCompressorTest, DefaultIsAllocatable) {
+    common::init_config_value();
+    Compressor* c = CompressorFactory::alloc_compressor(
+        common::g_config_value_.default_compression_type_);
+    ASSERT_NE(c, nullptr);
+#ifdef ENABLE_SNAPPY
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::SNAPPY);
+#elif defined(ENABLE_LZ4)
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::LZ4);
+#else
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::UNCOMPRESSED);
+#endif
+    CompressorFactory::free(c);
+}
 }  // namespace storage
diff --git a/cpp/test/compress/lz4_compressor_test.cc b/cpp/test/compress/lz4_compressor_test.cc
index c57ec0caf..0b2249f8d 100644
--- a/cpp/test/compress/lz4_compressor_test.cc
+++ b/cpp/test/compress/lz4_compressor_test.cc
@@ -126,4 +126,40 @@ TEST_F(LZ4Test, TestBytes2) {
     compressor.after_compress(compressed_buf);
     compressor.after_uncompress(decompressed_buf);
 }
+
+TEST_F(LZ4Test, AfterUncompressFreesParamNotMember) {
+    storage::LZ4Compressor compressor;
+    std::string input_a(1024, 'A');
+    std::string input_b(2048, 'B');
+    char* compressed_a = nullptr;
+    char* compressed_b = nullptr;
+    uint32_t compressed_a_len = 0;
+    uint32_t compressed_b_len = 0;
+
+    ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a,
+                                  compressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b,
+                                  compressed_b_len),
+              common::E_OK);
+
+    char* uncompressed_a = nullptr;
+    char* uncompressed_b = nullptr;
+    uint32_t uncompressed_a_len = 0;
+    uint32_t uncompressed_b_len = 0;
+    ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len,
+                                    uncompressed_a, uncompressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len,
+                                    uncompressed_b, uncompressed_b_len),
+              common::E_OK);
+
+    compressor.after_uncompress(uncompressed_a);
+    EXPECT_EQ(uncompressed_b_len, input_b.size());
+    EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0);
+
+    compressor.after_uncompress(uncompressed_b);
+    compressor.after_compress(compressed_a);
+    compressor.after_compress(compressed_b);
+}
 }  // namespace
diff --git a/cpp/test/compress/snappy_compressor_test.cc b/cpp/test/compress/snappy_compressor_test.cc
index d24915d70..249200cce 100644
--- a/cpp/test/compress/snappy_compressor_test.cc
+++ b/cpp/test/compress/snappy_compressor_test.cc
@@ -126,4 +126,40 @@ TEST_F(SnappyTest, TestBytes2) {
     compressor.after_compress(compressed_buf);
     compressor.after_uncompress(decompressed_buf);
 }
+
+TEST_F(SnappyTest, AfterUncompressFreesParamNotMember) {
+    storage::SnappyCompressor compressor;
+    std::string input_a(1024, 'A');
+    std::string input_b(2048, 'B');
+    char* compressed_a = nullptr;
+    char* compressed_b = nullptr;
+    uint32_t compressed_a_len = 0;
+    uint32_t compressed_b_len = 0;
+
+    ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a,
+                                  compressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b,
+                                  compressed_b_len),
+              common::E_OK);
+
+    char* uncompressed_a = nullptr;
+    char* uncompressed_b = nullptr;
+    uint32_t uncompressed_a_len = 0;
+    uint32_t uncompressed_b_len = 0;
+    ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len,
+                                    uncompressed_a, uncompressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len,
+                                    uncompressed_b, uncompressed_b_len),
+              common::E_OK);
+
+    compressor.after_uncompress(uncompressed_a);
+    EXPECT_EQ(uncompressed_b_len, input_b.size());
+    EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0);
+
+    compressor.after_uncompress(uncompressed_b);
+    compressor.after_compress(compressed_a);
+    compressor.after_compress(compressed_b);
+}
 }  // namespace
diff --git a/cpp/test/compress/uncompressed_compressor_test.cc b/cpp/test/compress/uncompressed_compressor_test.cc
new file mode 100644
index 000000000..c4f1e8ced
--- /dev/null
+++ b/cpp/test/compress/uncompressed_compressor_test.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "compress/uncompressed_compressor.h"
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+namespace storage {
+
+// Regression: after_uncompress() used to free the cached uncompressed_buf_
+// member regardless of which buffer the caller actually passed in.  Two
+// successive uncompress() calls would cache only the second buffer; calling
+// after_uncompress(first) then freed the still-live second buffer (UAF) and
+// leaked the first.  The fix frees the parameter and only clears the
+// member when it matches.  We can't directly observe UAF in a unit test,
+// but we can verify the contract: a buffer the caller is releasing is no
+// longer used after the call, and the second buffer's contents stay
+// readable until its own after_uncompress() runs.
+TEST(UncompressedCompressorTest, AfterUncompressFreesParamNotMember) {
+    UncompressedCompressor c;
+
+    const char src_a[] = "AAAA-payload-A";
+    const char src_b[] = "BBBB-payload-B-longer";
+
+    char* uA = nullptr;
+    uint32_t lenA = 0;
+    ASSERT_EQ(
+        c.uncompress(const_cast<char*>(src_a), sizeof(src_a) - 1, uA, lenA),
+        common::E_OK);
+    ASSERT_NE(uA, nullptr);
+    ASSERT_EQ(lenA, sizeof(src_a) - 1);
+    EXPECT_EQ(memcmp(uA, src_a, lenA), 0);
+
+    char* uB = nullptr;
+    uint32_t lenB = 0;
+    ASSERT_EQ(
+        c.uncompress(const_cast<char*>(src_b), sizeof(src_b) - 1, uB, lenB),
+        common::E_OK);
+    ASSERT_NE(uB, nullptr);
+    EXPECT_NE(uA, uB);
+    EXPECT_EQ(memcmp(uB, src_b, lenB), 0);
+
+    // Release the FIRST buffer.  Under the old bug this would free uB
+    // (the member-cached pointer) and leak uA.  Under the fix it frees uA
+    // and leaves uB intact for the next read.
+    c.after_uncompress(uA);
+    // uB must still be readable — if we had freed it above, the cached
+    // member pointer would now point into freed memory and most
+    // allocators would either return the byte back to the free list or
+    // poison it.  Validate via the original content.
+    EXPECT_EQ(memcmp(uB, src_b, lenB), 0);
+
+    // Releasing uB should be a clean no-op-after on the member.
+    c.after_uncompress(uB);
+}
+
+}  // namespace storage
diff --git a/cpp/test/cwrapper/c_release_test.cc b/cpp/test/cwrapper/c_release_test.cc
index 375c7e115..bb21483f7 100644
--- a/cpp/test/cwrapper/c_release_test.cc
+++ b/cpp/test/cwrapper/c_release_test.cc
@@ -40,6 +40,7 @@ class CReleaseTest : public testing::Test {};
 
 TEST_F(CReleaseTest, TestCreateFile) {
     ERRNO error_no = RET_OK;
+    remove("create_file1.tsfile");
     // Create File and Get RET_OK
     WriteFile file = write_file_new("create_file1.tsfile", &error_no);
     ASSERT_EQ(RET_OK, error_no);
@@ -50,7 +51,8 @@ TEST_F(CReleaseTest, TestCreateFile) {
     ASSERT_EQ(RET_ALREADY_EXIST, error_no);
     ASSERT_EQ(nullptr, file);
 
-    // Folder
+    // Folder: rejected either as an open error (POSIX) or as already-existing
+    // (Windows / filesystems where the directory already exists).
     file = write_file_new("test/", &error_no);
     ASSERT_TRUE(error_no == RET_FILRET_OPEN_ERR ||
                 error_no == RET_ALREADY_EXIST);
@@ -112,6 +114,17 @@ TEST_F(CReleaseTest, TsFileWriterNew) {
     free_write_file(&file);
     remove("test_empty_writer.tsfile");
 
+    // Normal schema with memory threshold
+    file = write_file_new("test_memory_threshold_writer.tsfile", &error_code);
+    ASSERT_EQ(RET_OK, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &table_schema, 100,
+                                                     &error_code);
+    ASSERT_NE(nullptr, writer);
+    ASSERT_EQ(RET_OK, error_code);
+    ASSERT_EQ(RET_OK, tsfile_writer_close(writer));
+    free_write_file(&file);
+    remove("test_memory_threshold_writer.tsfile");
+
     free_table_schema(table_schema);
     free_table_schema(test_schema);
 }
@@ -142,6 +155,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) {
     TsFileWriter writer =
         tsfile_writer_new(file, &abnormal_schema, &error_code);
     ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema,
+                                                     100, &error_code);
+    ASSERT_EQ(nullptr, writer);
+    ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
     free(abnormal_schema.column_schemas[2].column_name);
 
     abnormal_schema.column_schemas[2] =
@@ -150,6 +167,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) {
     // datatype conflict
     writer = tsfile_writer_new(file, &abnormal_schema, &error_code);
     ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema,
+                                                     100, &error_code);
+    ASSERT_EQ(nullptr, writer);
+    ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
 
     free(abnormal_schema.column_schemas[1].column_name);
     abnormal_schema.column_schemas[1] =
@@ -388,4 +409,4 @@ TEST_F(CReleaseTest, TsFileWriterConfTest) {
     remove("plain_file.tsfile");
 }
 
-}  // namespace CReleaseTest
\ No newline at end of file
+}  // namespace CReleaseTest
diff --git a/cpp/test/cwrapper/cwrapper_test.cc b/cpp/test/cwrapper/cwrapper_test.cc
index 9cf06d2f8..2ac6cad21 100644
--- a/cpp/test/cwrapper/cwrapper_test.cc
+++ b/cpp/test/cwrapper/cwrapper_test.cc
@@ -314,4 +314,155 @@ TEST_F(CWrapperTest, WriterFlushTabletAndReadData) {
     free(data_types);
     free_write_file(&file);
 }
-}  // namespace cwrapper
\ No newline at end of file
+
+// Regression: tsfile_writer_new_with_memory_threshold() had its duplicate-
+// column check inverted (`==` instead of `!=`), so the very first column
+// always looked like a duplicate and the constructor returned
+// E_INVALID_SCHEMA before any legitimate schema could be used.  Compare to
+// tsfile_writer_new() in the same file which had the correct check.
+TEST(TsFileWriterCApiTest, NewWithMemoryThresholdAcceptsValidSchema) {
+    const char* path = "cwrapper_writer_with_threshold_smoke.tsfile";
+    remove(path);
+    ERRNO code = 0;
+    WriteFile file = write_file_new(path, &code);
+    ASSERT_EQ(code, RET_OK);
+
+    const int column_num = 3;
+    TableSchema schema;
+    schema.table_name = strdup("t");
+    schema.column_num = column_num;
+    schema.column_schemas =
+        static_cast<ColumnSchema*>(malloc(sizeof(ColumnSchema) * column_num));
+    schema.column_schemas[0] =
+        ColumnSchema{strdup("id1"), TS_DATATYPE_STRING, TAG};
+    schema.column_schemas[1] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    schema.column_schemas[2] =
+        ColumnSchema{strdup("s2"), TS_DATATYPE_DOUBLE, FIELD};
+
+    TsFileWriter writer = tsfile_writer_new_with_memory_threshold(
+        file, &schema, 1024 * 1024, &code);
+    EXPECT_NE(writer, nullptr) << "constructor refused a valid 3-column schema";
+    EXPECT_EQ(code, RET_OK);
+
+    // Duplicate column triggers the now-correct path.
+    TableSchema dup;
+    dup.table_name = strdup("t");
+    dup.column_num = 2;
+    dup.column_schemas =
+        static_cast<ColumnSchema*>(malloc(sizeof(ColumnSchema) * 2));
+    dup.column_schemas[0] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    dup.column_schemas[1] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    ERRNO dup_code = 0;
+    TsFileWriter dup_writer = tsfile_writer_new_with_memory_threshold(
+        file, &dup, 1024 * 1024, &dup_code);
+    EXPECT_EQ(dup_writer, nullptr);
+    EXPECT_EQ(dup_code, common::E_INVALID_SCHEMA);
+
+    if (writer != nullptr) {
+        tsfile_writer_close(writer);
+    }
+    free_table_schema(schema);
+    free_table_schema(dup);
+    free_write_file(&file);
+    remove(path);
+}
+
+// Regression: tsfile_writer_new / tsfile_writer_new_with_memory_threshold /
+// _tsfile_writer_register_table used to dereference null inputs directly,
+// crashing the host process.  Each now reports E_INVALID_ARG (or returns
+// nullptr when err_code itself is null) instead of segfaulting.
+TEST(TsFileWriterCApiTest, RejectsNullInputs) {
+    ERRNO err = 0;
+
+    // tsfile_writer_new: null file
+    EXPECT_EQ(
+        tsfile_writer_new(nullptr, reinterpret_cast<TableSchema*>(1), &err),
+        nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // tsfile_writer_new: null schema
+    err = 0;
+    EXPECT_EQ(tsfile_writer_new(reinterpret_cast<WriteFile>(1), nullptr, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // tsfile_writer_new: null err_code
+    EXPECT_EQ(tsfile_writer_new(nullptr, nullptr, nullptr), nullptr);
+
+    // tsfile_writer_new_with_memory_threshold: same checks
+    err = 0;
+    EXPECT_EQ(tsfile_writer_new_with_memory_threshold(
+                  nullptr, reinterpret_cast<TableSchema*>(1), 1024, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // _tsfile_writer_register_table: nulls
+    EXPECT_EQ(_tsfile_writer_register_table(nullptr,
+                                            reinterpret_cast<TableSchema*>(1)),
+              common::E_INVALID_ARG);
+    EXPECT_EQ(_tsfile_writer_register_table(reinterpret_cast<TsFileWriter>(1),
+                                            nullptr),
+              common::E_INVALID_ARG);
+}
+
+// Regression: the tag-filter C API used to dereference a null reader and
+// pass null char pointers straight to std::string(), crashing the host
+// process.  Each entry point must now return nullptr / E_INVALID_ARG on
+// missing inputs instead of segfaulting.  This test only checks the guards
+// are in place — it deliberately never touches a real reader.
+TEST(TagFilterCApiTest, RejectsNullInputs) {
+    const char* table = "t";
+    const char* col = "c";
+    const char* val = "v";
+
+    EXPECT_EQ(tsfile_tag_filter_eq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), nullptr,
+                                   col, val),
+              nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), table,
+                                   nullptr, val),
+              nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), table,
+                                   col, nullptr),
+              nullptr);
+
+    EXPECT_EQ(tsfile_tag_filter_neq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_lt(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_lteq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_gt(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_gteq(nullptr, table, col, val), nullptr);
+
+    ERRNO err = common::E_OK;
+    EXPECT_EQ(
+        tsfile_tag_filter_create(nullptr, table, col, val, TAG_FILTER_EQ, &err),
+        nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1),
+                                       nullptr, col, val, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       nullptr, val, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       col, nullptr, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // err_code itself is null — must not crash, must return null.
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       col, val, TAG_FILTER_EQ, nullptr),
+              nullptr);
+}
+
+}  // namespace cwrapper
diff --git a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
index 3de447ffd..4983c57ea 100644
--- a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
+++ b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
@@ -217,7 +217,7 @@ TEST_F(CWrapperQueryByRowTest, TableByRowOffsetLimit) {
     const int limit = 5;
     ResultSet rs = tsfile_reader_query_table_by_row(reader, table_name.c_str(),
                                                     column_names_c, 2, offset,
-                                                    limit, NULL, 0, &code);
+                                                    limit, nullptr, 0, &code);
     ASSERT_EQ(code, RET_OK);
     ASSERT_NE(rs, nullptr);
 
diff --git a/cpp/test/encoding/encoding_coverage_test.cc b/cpp/test/encoding/encoding_coverage_test.cc
new file mode 100644
index 000000000..6970b9387
--- /dev/null
+++ b/cpp/test/encoding/encoding_coverage_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// Targeted coverage tests that exercise paths missed by the per-codec
+// roundtrip tests: type-mismatch error returns, has_remaining variants,
+// SIMD/scalar batch branches, floating-point special values, dictionary
+// decoder/encoder, and reset cycles.
+
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "common/allocator/byte_stream.h"
+#include "encoding/dictionary_decoder.h"
+#include "encoding/dictionary_encoder.h"
+#include "encoding/gorilla_decoder.h"
+#include "encoding/gorilla_encoder.h"
+#include "encoding/int32_rle_decoder.h"
+#include "encoding/int32_rle_encoder.h"
+#include "encoding/int64_rle_decoder.h"
+#include "encoding/int64_rle_encoder.h"
+#include "encoding/plain_decoder.h"
+#include "encoding/plain_encoder.h"
+#include "encoding/ts2diff_decoder.h"
+#include "encoding/ts2diff_encoder.h"
+#include "encoding/zigzag_decoder.h"
+#include "encoding/zigzag_encoder.h"
+#include "gtest/gtest.h"
+
+namespace storage {
+
+// ── Type-mismatch returns ────────────────────────────────────────────────
+//
+// Every codec exposes read_boolean / read_int32 / read_int64 / read_float /
+// read_double / read_String. Most of them only implement one or two and
+// return E_TYPE_NOT_MATCH for the rest, but those return paths were never
+// hit by the existing per-codec tests (which only call the one supported
+// method per codec).
+TEST(EncodingCoverage, TypeMismatchReturnsAreReachable) {
+    common::ByteStream s(64, common::MOD_DEFAULT);
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    bool b;
+    float f;
+    double d;
+    int64_t i64;
+    common::String str;
+
+    // Each decoder returns an error sentinel (E_TYPE_NOT_MATCH or
+    // E_NOT_SUPPORT depending on codec) for the read_* variants it
+    // doesn't implement.  We only care that the unsupported path returns
+    // an error rather than a corrupted value.  Note that GorillaDecoder
+    // implements its unsupported paths with `ASSERT(false)`; calling
+    // those in Debug builds aborts, so we exercise only the codecs that
+    // return cleanly (Zigzag, RLE).
+    auto NE_OK = [](int r) { EXPECT_NE(r, common::E_OK); };
+    IntZigzagDecoder zz;
+    NE_OK(zz.read_boolean(b, s));
+    NE_OK(zz.read_float(f, s));
+    NE_OK(zz.read_double(d, s));
+    NE_OK(zz.read_String(str, pa, s));
+
+    Int32RleDecoder rle32;
+    NE_OK(rle32.read_int64(i64, s));
+    NE_OK(rle32.read_float(f, s));
+    NE_OK(rle32.read_double(d, s));
+    NE_OK(rle32.read_String(str, pa, s));
+
+    Int64RleDecoder rle64;
+    int32_t i32;
+    NE_OK(rle64.read_boolean(b, s));
+    NE_OK(rle64.read_int32(i32, s));
+    NE_OK(rle64.read_float(f, s));
+    NE_OK(rle64.read_double(d, s));
+    NE_OK(rle64.read_String(str, pa, s));
+    (void)i32;
+    (void)i64;
+}
+
+// ── Reset cycles ────────────────────────────────────────────────────────
+//
+// Each codec defines a reset() that resets internal state; nothing in the
+// roundtrip tests calls it.  Encode → reset → re-encode should still
+// produce a stream that decodes to the second batch's values.
+TEST(EncodingCoverage, ResetClearsState) {
+    {
+        IntZigzagEncoder enc;
+        IntZigzagDecoder dec;
+        common::ByteStream s(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(123, s), common::E_OK);
+        enc.flush(s);
+        EXPECT_EQ(dec.decode(s), 123);
+        dec.reset();
+        common::ByteStream s2(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(-456, s2), common::E_OK);
+        enc.flush(s2);
+        EXPECT_EQ(dec.decode(s2), -456);
+    }
+    {
+        IntGorillaEncoder enc;
+        IntGorillaDecoder dec;
+        common::ByteStream s(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(7, s), common::E_OK);
+        EXPECT_EQ(enc.encode(7, s), common::E_OK);
+        enc.flush(s);
+        int32_t v;
+        EXPECT_EQ(dec.read_int32(v, s), common::E_OK);
+        EXPECT_EQ(v, 7);
+        dec.reset();
+        enc.reset();
+        common::ByteStream s2(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(42, s2), common::E_OK);
+        EXPECT_EQ(enc.encode(42, s2), common::E_OK);
+        enc.flush(s2);
+        EXPECT_EQ(dec.read_int32(v, s2), common::E_OK);
+        EXPECT_EQ(v, 42);
+    }
+}
+
+// ── has_remaining variants ──────────────────────────────────────────────
+TEST(EncodingCoverage, HasRemainingOnEmptyAndAfterDrain) {
+    common::ByteStream empty(64, common::MOD_DEFAULT);
+    {
+        IntZigzagDecoder zz;
+        EXPECT_FALSE(zz.has_remaining(empty));
+    }
+    {
+        IntGorillaDecoder g;
+        EXPECT_FALSE(g.has_remaining(empty));
+    }
+    {
+        Int32RleDecoder rle;
+        EXPECT_FALSE(rle.has_remaining(empty));
+    }
+    {
+        TS2DIFFDecoder<int32_t> t;
+        EXPECT_FALSE(t.has_remaining(empty));
+    }
+    {
+        PlainDecoder p;
+        EXPECT_FALSE(p.has_remaining(empty));
+    }
+}
+
+// ── Gorilla floating-point special values ──────────────────────────────
+//
+// FloatGorillaDecoder / DoubleGorillaDecoder run different VALUE_BITS and
+// ending-sentinel paths.  Verify they round-trip NaN, infinity, -0.0 and
+// denormals — none of which the existing happy-path roundtrip exercises.
+TEST(EncodingCoverage, GorillaFloatSpecialValues) {
+    FloatGorillaEncoder enc;
+    common::ByteStream s(256, common::MOD_DEFAULT);
+    std::vector<float> values = {
+        0.0f,
+        -0.0f,
+        std::numeric_limits<float>::infinity(),
+        -std::numeric_limits<float>::infinity(),
+        std::numeric_limits<float>::min(),
+        std::numeric_limits<float>::denorm_min(),
+        std::numeric_limits<float>::epsilon(),
+        1.0f,
+        -1.0f,
+        std::numeric_limits<float>::max(),
+        std::numeric_limits<float>::lowest(),
+    };
+    for (float v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK);
+    enc.flush(s);
+
+    FloatGorillaDecoder dec;
+    float out;
+    for (size_t i = 0; i < values.size(); i++) {
+        ASSERT_EQ(dec.read_float(out, s), common::E_OK) << "i=" << i;
+        if (std::isnan(values[i])) {
+            EXPECT_TRUE(std::isnan(out));
+        } else {
+            // Bitwise compare to catch -0.0 vs 0.0 etc.
+            uint32_t a, b;
+            memcpy(&a, &values[i], sizeof(float));
+            memcpy(&b, &out, sizeof(float));
+            EXPECT_EQ(a, b) << "i=" << i;
+        }
+    }
+}
+
+TEST(EncodingCoverage, GorillaDoubleSpecialValues) {
+    DoubleGorillaEncoder enc;
+    common::ByteStream s(256, common::MOD_DEFAULT);
+    std::vector<double> values = {
+        0.0,
+        -0.0,
+        std::numeric_limits<double>::infinity(),
+        -std::numeric_limits<double>::infinity(),
+        std::numeric_limits<double>::min(),
+        std::numeric_limits<double>::denorm_min(),
+        std::numeric_limits<double>::epsilon(),
+        1.0,
+        -1.0,
+        std::numeric_limits<double>::max(),
+        std::numeric_limits<double>::lowest(),
+    };
+    for (double v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK);
+    enc.flush(s);
+
+    DoubleGorillaDecoder dec;
+    double out;
+    for (size_t i = 0; i < values.size(); i++) {
+        ASSERT_EQ(dec.read_double(out, s), common::E_OK) << "i=" << i;
+        uint64_t a, b;
+        memcpy(&a, &values[i], sizeof(double));
+        memcpy(&b, &out, sizeof(double));
+        EXPECT_EQ(a, b) << "i=" << i;
+    }
+}
+
+// ── Gorilla skip path ───────────────────────────────────────────────────
+TEST(EncodingCoverage, GorillaSkipInt32Roundtrip) {
+    IntGorillaEncoder enc;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 200;
+    std::vector<int32_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = i * 11 - 5;
+        ASSERT_EQ(enc.encode(values[i], stream), common::E_OK);
+    }
+    enc.flush(stream);
+
+    // Wrap into contiguous buffer for batch_skip_raw.
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    IntGorillaDecoder dec;
+    int skipped = 0;
+    ASSERT_EQ(dec.skip_int32(50, skipped, wrapped), common::E_OK);
+    EXPECT_EQ(skipped, 50);
+    int32_t out[N];
+    int actual = 0;
+    ASSERT_EQ(dec.read_batch_int32(out, N - 50, actual, wrapped), common::E_OK);
+    EXPECT_EQ(actual, N - 50);
+    for (int i = 0; i < N - 50; i++) {
+        EXPECT_EQ(out[i], values[50 + i]) << "i=" << i;
+    }
+}
+
+// ── TS2DIFF batch decode hits SIMD block + scalar tail ─────────────────
+TEST(EncodingCoverage, TS2DIFFBatchInt32MultipleBlocks) {
+    TS2DIFFEncoder<int32_t> enc;
+    common::ByteStream s(8192, common::MOD_DEFAULT);
+    // Encode 500 values to span ~4 blocks (default block size 128).
+    const int N = 500;
+    std::vector<int32_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = i * 7 + 3;
+        ASSERT_EQ(enc.encode(values[i], s), common::E_OK);
+    }
+    ASSERT_EQ(enc.flush(s), common::E_OK);
+
+    // Wrap-from for the SIMD/scalar block fast path.
+    uint32_t total = s.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    s.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    TS2DIFFDecoder<int32_t> dec;
+    std::vector<int32_t> out(N);
+    int total_decoded = 0;
+    while (dec.has_remaining(wrapped) && total_decoded < N) {
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int32(out.data() + total_decoded,
+                                       N - total_decoded, actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    EXPECT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i;
+}
+
+TEST(EncodingCoverage, TS2DIFFBatchInt64MultipleBlocks) {
+    TS2DIFFEncoder<int64_t> enc;
+    common::ByteStream s(8192, common::MOD_DEFAULT);
+    const int N = 500;
+    std::vector<int64_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<int64_t>(i) * 17 + 41;
+        ASSERT_EQ(enc.encode(values[i], s), common::E_OK);
+    }
+    ASSERT_EQ(enc.flush(s), common::E_OK);
+
+    uint32_t total = s.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    s.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    TS2DIFFDecoder<int64_t> dec;
+    std::vector<int64_t> out(N);
+    int total_decoded = 0;
+    while (dec.has_remaining(wrapped) && total_decoded < N) {
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int64(out.data() + total_decoded,
+                                       N - total_decoded, actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    EXPECT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i;
+}
+
+// ── Plain encoder: encode_batch fast paths for each type ───────────────
+TEST(EncodingCoverage, PlainEncoderBatchAllTypes) {
+    PlainEncoder enc;
+    PlainDecoder dec;
+
+    // Float batch.
+    {
+        common::ByteStream s(1024, common::MOD_DEFAULT);
+        const uint32_t N = 100;
+        float v[N];
+        for (uint32_t i = 0; i < N; i++) v[i] = i * 0.5f - 1.0f;
+        ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK);
+        float out[N];
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_float(out, N, actual, s), common::E_OK);
+        EXPECT_EQ(actual, static_cast<int>(N));
+        for (uint32_t i = 0; i < N; i++) EXPECT_FLOAT_EQ(out[i], v[i]);
+    }
+    // Int64 batch.
+    {
+        common::ByteStream s(1024, common::MOD_DEFAULT);
+        const uint32_t N = 100;
+        int64_t v[N];
+        for (uint32_t i = 0; i < N; i++) v[i] = i * 1000 - 50;
+        ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK);
+        int64_t out[N];
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int64(out, N, actual, s), common::E_OK);
+        EXPECT_EQ(actual, static_cast<int>(N));
+        for (uint32_t i = 0; i < N; i++) EXPECT_EQ(out[i], v[i]);
+    }
+}
+
+// ── PlainDecoder skip paths (wrapped + paged) ──────────────────────────
+TEST(EncodingCoverage, PlainSkipPagedStream) {
+    PlainEncoder enc;
+    PlainDecoder dec;
+    // Paged ByteStream (tiny page) forces the fallback path.
+    common::ByteStream s(16, common::MOD_DEFAULT);
+    for (int i = 0; i < 32; i++)
+        ASSERT_EQ(enc.encode((int64_t)i, s), common::E_OK);
+    int skipped = 0;
+    ASSERT_EQ(dec.skip_int64(10, skipped, s), common::E_OK);
+    EXPECT_EQ(skipped, 10);
+    int64_t out;
+    ASSERT_EQ(dec.read_int64(out, s), common::E_OK);
+    EXPECT_EQ(out, 10);
+}
+
+// ── Dictionary codec roundtrip ─────────────────────────────────────────
+TEST(EncodingCoverage, DictionaryStringRoundTrip) {
+    DictionaryEncoder enc;
+    common::ByteStream s(1024, common::MOD_DEFAULT);
+
+    std::vector<std::string> raw = {"apple",  "banana", "apple",
+                                    "cherry", "banana", "apple"};
+    for (const auto& r : raw) {
+        common::String str(const_cast<char*>(r.c_str()), r.size());
+        ASSERT_EQ(enc.encode(str, s), common::E_OK);
+    }
+    enc.flush(s);
+
+    DictionaryDecoder dec;
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    for (const auto& r : raw) {
+        common::String out;
+        ASSERT_EQ(dec.read_String(out, pa, s), common::E_OK);
+        ASSERT_EQ(out.len_, r.size());
+        EXPECT_EQ(std::string(out.buf_, out.len_), r);
+    }
+}
+
+}  // namespace storage
diff --git a/cpp/test/encoding/gorilla_codec_test.cc b/cpp/test/encoding/gorilla_codec_test.cc
index 47056a6db..945451088 100644
--- a/cpp/test/encoding/gorilla_codec_test.cc
+++ b/cpp/test/encoding/gorilla_codec_test.cc
@@ -207,4 +207,319 @@ TEST_F(GorillaCodecTest, DoubleEncodingDecodingBoundaryValues) {
     }
 }
 
+// ── Batch decode tests (exercises the raw-pointer GorillaBitReader path) ──
+
+TEST_F(GorillaCodecTest, Int32BatchDecode) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 500;
+    int32_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = i * 7 - 100;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    // Copy to a contiguous buffer and wrap (simulates production path)
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t out[N];
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_int32(out + total_decoded, batch, actual,
+                                           wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecode) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 500;
+    int64_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = (int64_t)i * 13 - 200;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t out[N];
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_int64(out + total_decoded, batch, actual,
+                                           wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, FloatBatchDecode) {
+    storage::FloatGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 300;
+    std::vector<float> expected(N);
+    for (int i = 0; i < N; i++) {
+        expected[i] = (float)i * 1.5f - 50.0f;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::FloatGorillaDecoder decoder;
+    std::vector<float> out(N);
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_float(out.data() + total_decoded, batch,
+                                           actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_FLOAT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, DoubleBatchDecode) {
+    storage::DoubleGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 300;
+    std::vector<double> expected(N);
+    for (int i = 0; i < N; i++) {
+        expected[i] = (double)i * 2.7 - 100.0;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::DoubleGorillaDecoder decoder;
+    std::vector<double> out(N);
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_double(out.data() + total_decoded, batch,
+                                            actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, Int32BatchSkip) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 200;
+    int32_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = i * 3;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    // Skip first 50 values
+    int skipped = 0;
+    EXPECT_EQ(decoder.skip_int32(50, skipped, wrapped), common::E_OK);
+    EXPECT_EQ(skipped, 50);
+    // Read next 50 values
+    int32_t out[50];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_int32(out, 50, actual, wrapped), common::E_OK);
+    EXPECT_EQ(actual, 50);
+    for (int i = 0; i < 50; i++) {
+        EXPECT_EQ(out[i], expected[50 + i]) << "mismatch at index " << i;
+    }
+}
+
+// Regression: batch_decode_raw used to write out[0] unconditionally in the
+// bootstrap branch, even when capacity was 0. Verify the entry path early
+// returns and leaves the stream + state untouched.
+TEST_F(GorillaCodecTest, Int32BatchDecodeZeroCapacity) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 8;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(i, stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t sentinel[1] = {0x7fffffff};
+    int actual = 42;
+    EXPECT_EQ(decoder.read_batch_int32(sentinel, 0, actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(actual, 0);
+    EXPECT_EQ(sentinel[0], 0x7fffffff);  // not written
+
+    // Followup decode should still read the first value 0.
+    int32_t out[N];
+    int got_actual = 0;
+    EXPECT_EQ(decoder.read_batch_int32(out, N, got_actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(got_actual, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], i);
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecodeZeroCapacity) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    for (int i = 0; i < 8; i++) {
+        ASSERT_EQ(encoder.encode(static_cast<int64_t>(i), stream),
+                  common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t sentinel[1] = {0x7fffffffffffffffLL};
+    int actual = 42;
+    EXPECT_EQ(decoder.read_batch_int64(sentinel, 0, actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(actual, 0);
+    EXPECT_EQ(sentinel[0], 0x7fffffffffffffffLL);  // not written
+}
+
+// Regression: a truncated Gorilla page used to spin GorillaBitReader::read_long
+// forever (bits stays 0, n -= 0 never decreases) and GorillaBitReader::read_bit
+// would compute (cur_byte >> -1).  batch_decode_raw must now surface
+// E_BUF_NOT_ENOUGH instead of looping.
+TEST_F(GorillaCodecTest, Int32BatchDecodeTruncatedInputReturnsError) {
+    // Encode enough values to fill several bits, then chop the buffer down to
+    // a small prefix so the decoder runs out of bits mid-value.
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 32;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(i * 11 + 3, stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    ASSERT_GT(total, 4u);
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    // 3 bytes is large enough to bootstrap the first value (depending on
+    // VALUE_BITS_LENGTH_32BIT) but typically too short for the full batch.
+    common::ByteStream truncated(common::MOD_DEFAULT);
+    truncated.wrap_from((const char*)buf.data(), 3);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t out[N];
+    int actual = -1;
+    int ret = decoder.read_batch_int32(out, N, actual, truncated);
+    // Either the decoder reports the truncation, or it stops early without
+    // looping forever; both are acceptable.  What MUST NOT happen is a hang
+    // or a full-batch return — the test will time out on a hang via the
+    // GoogleTest harness.
+    EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH)
+        << "unexpected ret=" << ret;
+    EXPECT_LT(actual, N);
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecodeTruncatedInputReturnsError) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 32;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(static_cast<int64_t>(i) * 17 + 5, stream),
+                  common::E_OK);
+    }
+    encoder.flush(stream);
+    uint32_t total = stream.total_size();
+    ASSERT_GT(total, 4u);
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    common::ByteStream truncated(common::MOD_DEFAULT);
+    truncated.wrap_from((const char*)buf.data(), 3);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t out[N];
+    int actual = -1;
+    int ret = decoder.read_batch_int64(out, N, actual, truncated);
+    EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH)
+        << "unexpected ret=" << ret;
+    EXPECT_LT(actual, N);
+}
+
 }  // namespace storage
diff --git a/cpp/test/encoding/plain_codec_test.cc b/cpp/test/encoding/plain_codec_test.cc
index a51fa9261..6372469e6 100644
--- a/cpp/test/encoding/plain_codec_test.cc
+++ b/cpp/test/encoding/plain_codec_test.cc
@@ -110,4 +110,90 @@ TEST(PlainEncoderDecoderTest, EncodeDecodeDouble) {
     EXPECT_DOUBLE_EQ(original, decoded);
 }
 
+// Regression: read_batch_int64/float/double used to dereference
+// in.get_wrapped_buf() unconditionally, which is null for a normal paged
+// ByteStream. Verify the fallback path produces correct results.
+TEST(PlainEncoderDecoderTest, ReadBatchInt64PagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    // Tiny page size forces multi-page write so the stream is paged, not
+    // wrapped.
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    int64_t values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<int64_t>(i) * 7 - 3;
+        encoder.encode(values[i], stream);
+    }
+    int64_t out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_int64(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], values[i]) << "mismatch at " << i;
+    }
+}
+
+TEST(PlainEncoderDecoderTest, ReadBatchFloatPagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    float values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<float>(i) * 0.5f - 1.25f;
+        encoder.encode(values[i], stream);
+    }
+    float out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_float(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_FLOAT_EQ(out[i], values[i]);
+    }
+}
+
+// Regression: encode_batch(const double*) used to reinterpret_cast to
+// int64_t* and dispatch into the int64 path, which read the doubles through
+// an int64_t pointer — a strict-aliasing violation under -O.  The dedicated
+// double path now memcpys per element; verify a full round-trip through it.
+TEST(PlainEncoderDecoderTest, EncodeBatchDoubleRoundTrip) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const uint32_t N = 64;
+    double values[N];
+    for (uint32_t i = 0; i < N; i++) {
+        values[i] = static_cast<double>(i) * 0.125 - 3.14;
+    }
+    ASSERT_EQ(encoder.encode_batch(values, N, stream), common::E_OK);
+
+    double out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, static_cast<int>(N));
+    for (uint32_t i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], values[i]) << "mismatch at " << i;
+    }
+}
+
+TEST(PlainEncoderDecoderTest, ReadBatchDoublePagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    double values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<double>(i) * 1.25 + 3.14;
+        encoder.encode(values[i], stream);
+    }
+    double out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], values[i]);
+    }
+}
+
 }  // end namespace storage
\ No newline at end of file
diff --git a/cpp/test/encoding/ts2diff_codec_test.cc b/cpp/test/encoding/ts2diff_codec_test.cc
index 3164edafb..fb997103c 100644
--- a/cpp/test/encoding/ts2diff_codec_test.cc
+++ b/cpp/test/encoding/ts2diff_codec_test.cc
@@ -364,4 +364,120 @@ TEST_F(TS2DIFFCodecTest, TestEncodingLast) {
     EXPECT_FALSE(decoder_int_->has_remaining(out_stream_int32));
 }
 
+// Regression: skip_int32/skip_int64 used to advance the stream by the full
+// block size even when the requested skip count fell short of the block,
+// which silently dropped values from the next read in aligned nullable
+// columns.  Verify that skipping a count smaller than the first block leaves
+// the remainder of that block intact and decodable.
+TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt32PreservesRemainder) {
+    common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false);
+    const int row_num = 1024;
+    std::vector<int32_t> data(row_num);
+    for (int i = 0; i < row_num; i++) {
+        data[i] = i * 3 + 7;
+    }
+    for (int i = 0; i < row_num; i++) {
+        ASSERT_EQ(encoder_int_->encode(data[i], out_stream), common::E_OK);
+    }
+    ASSERT_EQ(encoder_int_->flush(out_stream), common::E_OK);
+
+    const int skip_count = 5;
+    int skipped = 0;
+    ASSERT_EQ(decoder_int_->skip_int32(skip_count, skipped, out_stream),
+              common::E_OK);
+    EXPECT_EQ(skipped, skip_count);
+
+    int32_t v;
+    for (int i = skip_count; i < row_num; i++) {
+        ASSERT_EQ(decoder_int_->read_int32(v, out_stream), common::E_OK);
+        EXPECT_EQ(v, data[i]) << "mismatch at idx " << i;
+    }
+}
+
+TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt64PreservesRemainder) {
+    common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false);
+    const int row_num = 1024;
+    std::vector<int64_t> data(row_num);
+    for (int i = 0; i < row_num; i++) {
+        data[i] = static_cast<int64_t>(i) * 13 + 11;
+    }
+    for (int i = 0; i < row_num; i++) {
+        ASSERT_EQ(encoder_long_->encode(data[i], out_stream), common::E_OK);
+    }
+    ASSERT_EQ(encoder_long_->flush(out_stream), common::E_OK);
+
+    const int skip_count = 7;
+    int skipped = 0;
+    ASSERT_EQ(decoder_long_->skip_int64(skip_count, skipped, out_stream),
+              common::E_OK);
+    EXPECT_EQ(skipped, skip_count);
+
+    int64_t v;
+    for (int i = skip_count; i < row_num; i++) {
+        ASSERT_EQ(decoder_long_->read_int64(v, out_stream), common::E_OK);
+        EXPECT_EQ(v, data[i]) << "mismatch at idx " << i;
+    }
+}
+
+// Regression: pack_bits_msb used to drop ByteStream::write_buf's return value
+// on the floor and unconditionally return 0 (success).  flush() then reported
+// E_OK and reset() wiped encoder state even when the actual data never made
+// it onto the stream.  The fix surfaces the underlying error code via the
+// helper's return value.
+//
+// We can't easily inject a real write failure without a custom allocator
+// (ByteStream::write_buf only fails on OOM), so this test pins down the
+// contract on the visible boundary: a wide bit_width must return the
+// dedicated "fallback" sentinel (-1) so flush() knows to take the per-bit
+// path, and the helper's return type must be the error code from write_buf
+// otherwise.  Future refactors that swallow the write error would either
+// stop returning -1 for fallback (caught here) or break round-trip in the
+// happy-path test below.
+TEST_F(TS2DIFFCodecTest, PackBitsMsbFallbackSentinelStillReported) {
+    common::ByteStream out(1024, common::MOD_TS2DIFF_OBJ, false);
+    int64_t values[4] = {1, 2, 3, 4};
+    EXPECT_EQ(TS2DIFFEncoder<int64_t>::pack_bits_msb(values, 4, 57, out), -1);
+    // Healthy small bit_width writes succeed.
+    int32_t small_values[4] = {1, 2, 3, 4};
+    EXPECT_EQ(TS2DIFFEncoder<int32_t>::pack_bits_msb(small_values, 4, 3, out),
+              common::E_OK);
+}
+
+// Regression: FloatTS2DIFFEncoder / DoubleTS2DIFFEncoder kept the previous
+// page's overflow markers in underflow_flags_ when reset() was called
+// directly (PageWriter drops a partial page that way).  The next page would
+// then read the stale flags and emit a wrong overflow bitmap.  reset() now
+// clears underflow_flags_; verify a reset between pages doesn't leak the
+// first page's overflow state into the second.
+TEST(FloatTS2DIFFEncoderResetTest, ResetClearsUnderflowFlags) {
+    storage::FloatTS2DIFFEncoder enc;
+    common::ByteStream out1(1024, common::MOD_TS2DIFF_OBJ, false);
+    // Encode a value that overflows the scale factor so the encoder records
+    // an underflow flag.
+    const float overflow_value = 1e30f;  // scaled > INT32_MAX
+    ASSERT_EQ(enc.encode(0.0f, out1), common::E_OK);
+    ASSERT_EQ(enc.encode(overflow_value, out1), common::E_OK);
+
+    // Drop the page without flushing.  PageWriter does exactly this when
+    // discarding a half-built page.
+    enc.reset();
+
+    // Encode a clean page that should not have any overflow markers.
+    common::ByteStream out2(1024, common::MOD_TS2DIFF_OBJ, false);
+    ASSERT_EQ(enc.encode(0.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.encode(1.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.encode(2.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.flush(out2), common::E_OK);
+
+    // Round-trip the clean page; if reset() leaked the stale overflow flags
+    // the decoder would misinterpret the leading bytes as an overflow
+    // bitmap header and fail to recover the original values.
+    storage::FloatTS2DIFFDecoder dec;
+    float v = 0.0f;
+    for (int i = 0; i < 3; i++) {
+        ASSERT_EQ(dec.read_float(v, out2), common::E_OK);
+        EXPECT_NEAR(v, static_cast<float>(i), 1e-5f);
+    }
+}
+
 }  // namespace storage
diff --git a/cpp/test/file/restorable_tsfile_io_writer_test.cc b/cpp/test/file/restorable_tsfile_io_writer_test.cc
index 8f723e056..c60a855c5 100644
--- a/cpp/test/file/restorable_tsfile_io_writer_test.cc
+++ b/cpp/test/file/restorable_tsfile_io_writer_test.cc
@@ -994,4 +994,70 @@ TEST_F(RestorableTsFileIOWriterTest,
         }
         ASSERT_EQ(table_writer2.close(), E_OK);
     }
-}
\ No newline at end of file
+}
+
+// Regression: recovery of an aligned single-page value chunk must consult the
+// page's not-null bitmap to bind each decoded value to its real timestamp.
+// The bug paired non-null values densely with times[0..N-1], so a column whose
+// only non-null entry sat at the tail surfaced start_time/end_time equal to
+// the head of the time chunk, which then leaked through chunk-level time
+// filters.
+TEST_F(RestorableTsFileIOWriterTest, RecoveryAlignedSparseStatRespectsBitmap) {
+    const int64_t kBase = 100;
+    const int kRowCount = 10;
+    const int kNonNullRow = 7;
+    const std::string table_name = "sparse_aligned_t";
+    std::vector<MeasurementSchema*> ms_vec;
+    ms_vec.push_back(new MeasurementSchema("device", STRING));
+    ms_vec.push_back(new MeasurementSchema("s1", INT64));
+    std::vector<ColumnCategory> cats = {ColumnCategory::TAG,
+                                        ColumnCategory::FIELD};
+    TableSchema table_schema(table_name, ms_vec, cats);
+    {
+        WriteFile wf;
+        ASSERT_EQ(wf.create(file_name_, GetWriteCreateFlags(), 0666), E_OK);
+        TsFileTableWriter tw(&wf, &table_schema);
+        Tablet tablet(table_schema.get_measurement_names(),
+                      table_schema.get_data_types(), kRowCount);
+        tablet.set_table_name(table_name);
+        for (int i = 0; i < kRowCount; i++) {
+            tablet.add_timestamp(i, kBase + i);
+            tablet.add_value(i, "device", "d0");
+            // Only row kNonNullRow gets a value; the rest stay null.
+            if (i == kNonNullRow) {
+                tablet.add_value(i, "s1", static_cast<int64_t>(999));
+            }
+        }
+        ASSERT_EQ(tw.write_table(tablet), E_OK);
+        ASSERT_EQ(tw.flush(), E_OK);
+        ASSERT_EQ(tw.close(), E_OK);
+        wf.close();
+    }
+
+    CorruptCurrentFileTail(3);
+
+    RestorableTsFileIOWriter rw;
+    ASSERT_EQ(rw.open(file_name_, true), E_OK);
+
+    const std::vector<ChunkGroupMeta*>& cgms =
+        rw.get_recovered_chunk_group_metas();
+    ASSERT_FALSE(cgms.empty());
+
+    bool found_value_chunk = false;
+    for (ChunkGroupMeta* cgm : cgms) {
+        if (cgm == nullptr) continue;
+        for (auto it = cgm->chunk_meta_list_.begin();
+             it != cgm->chunk_meta_list_.end(); it++) {
+            ChunkMeta* cm = it.get();
+            if (cm == nullptr) continue;
+            if (cm->measurement_name_.to_std_string() != "s1") continue;
+            ASSERT_NE(cm->statistic_, nullptr);
+            // Exactly one non-null row at timestamp kBase + kNonNullRow.
+            EXPECT_EQ(cm->statistic_->count_, 1);
+            EXPECT_EQ(cm->statistic_->start_time_, kBase + kNonNullRow);
+            EXPECT_EQ(cm->statistic_->end_time_, kBase + kNonNullRow);
+            found_value_chunk = true;
+        }
+    }
+    EXPECT_TRUE(found_value_chunk);
+}
diff --git a/cpp/test/file/write_file_test.cc b/cpp/test/file/write_file_test.cc
index 3cb9edd25..615f069e8 100644
--- a/cpp/test/file/write_file_test.cc
+++ b/cpp/test/file/write_file_test.cc
@@ -141,3 +141,47 @@ TEST_F(WriteFileTest, TruncateFile) {
     EXPECT_EQ(file_content, "Hello, ");
     remove(file_name.c_str());
 }
+
+#include "file/tsfile_io_writer.h"
+
+// Regression: TsFileIOWriter::init() used to leave destroyed_=true after a
+// previous destroy(), so the second destroy() (during ~TsFileIOWriter())
+// short-circuited and skipped meta_allocator_.destroy() /
+// write_stream_.destroy() / file_ cleanup, leaking everything from the
+// new lifecycle.  Verify init() rearms the lifecycle by checking destroy()
+// runs again cleanly.
+TEST(TsFileIOWriterLifecycle, DestroyInitDestroyIsClean) {
+    std::string fn = "tsfile_iowriter_lifecycle.dat";
+    remove(fn.c_str());
+
+    WriteFile wf1;
+    int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+    flags |= O_BINARY;
+#endif
+    ASSERT_EQ(wf1.create(fn, flags, 0666), E_OK);
+
+    TsFileIOWriter w;
+    ASSERT_EQ(w.init(&wf1), E_OK);
+    w.destroy();
+
+    // Re-init against a fresh WriteFile (same writer object).  Under the
+    // old bug, destroyed_ stays true here.
+    remove(fn.c_str());
+    WriteFile wf2;
+    ASSERT_EQ(wf2.create(fn, flags, 0666), E_OK);
+    ASSERT_EQ(w.init(&wf2), E_OK);
+
+    // get_meta_size() reads meta_allocator_.get_total_used_bytes(); on a
+    // fresh init() this should be 0 (the allocator was reinitialised).
+    // If destroyed_ had been left true the allocator pages from before
+    // would still be there.
+    EXPECT_EQ(w.get_meta_size(), 0);
+
+    // Trigger second destroy() — must not crash on the re-initialised
+    // resources.
+    w.destroy();
+
+    wf2.close();
+    remove(fn.c_str());
+}
diff --git a/cpp/test/reader/filter/time_in_filter_test.cc b/cpp/test/reader/filter/time_in_filter_test.cc
new file mode 100644
index 000000000..9eceaaaa5
--- /dev/null
+++ b/cpp/test/reader/filter/time_in_filter_test.cc
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <gtest/gtest.h>
+
+#include "reader/filter/time_operator.h"
+
+using namespace storage;
+
+// Regression: TimeIn::satisfy_start_end_time / contain_start_end_time used to
+// return true unconditionally.  In the aligned batch/multi paths the
+// contain_start_end_time=true branch flips block_all_pass on, the per-row
+// satisfy_batch_time check is skipped, and the reader emits every row in the
+// block — making `WHERE time IN (2, 8)` look identical to "no time filter"
+// whenever the block's time range overlapped the IN list at all.
+
+TEST(TimeInFilterTest, ContainStartEndTimeIsFalseForSparseRange) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    // Range [0,10] contains many times not in {2,8}; the block cannot
+    // unconditionally pass.
+    EXPECT_FALSE(in.contain_start_end_time(0, 10));
+    // Range that is a single matching point passes.
+    EXPECT_TRUE(in.contain_start_end_time(2, 2));
+    // Single non-matching point: doesn't pass.
+    EXPECT_FALSE(in.contain_start_end_time(5, 5));
+}
+
+TEST(TimeInFilterTest, SatisfyStartEndTimeTracksOverlap) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    // Some value in range → block may have matching rows.
+    EXPECT_TRUE(in.satisfy_start_end_time(0, 10));
+    EXPECT_TRUE(in.satisfy_start_end_time(2, 2));
+    EXPECT_TRUE(in.satisfy_start_end_time(8, 8));
+    // No value in range → block can be skipped.
+    EXPECT_FALSE(in.satisfy_start_end_time(3, 7));
+    EXPECT_FALSE(in.satisfy_start_end_time(9, 100));
+}
+
+TEST(TimeInFilterTest, NotInContainSemantics) {
+    TimeIn not_in({2, 8}, /*not_in=*/true);
+    // Range [3,7] has no excluded value → every row passes NOT IN.
+    EXPECT_TRUE(not_in.contain_start_end_time(3, 7));
+    // Range [0,10] includes 2 and 8 → cannot blanket-pass.
+    EXPECT_FALSE(not_in.contain_start_end_time(0, 10));
+}
+
+TEST(TimeInFilterTest, NotInSatisfyStartEndTimeSemantics) {
+    TimeIn not_in({2, 8}, /*not_in=*/true);
+    // Single excluded point: filter rejects it.
+    EXPECT_FALSE(not_in.satisfy_start_end_time(2, 2));
+    // Single non-excluded point: filter accepts it.
+    EXPECT_TRUE(not_in.satisfy_start_end_time(5, 5));
+    // A wider range always has at least one non-excluded time.
+    EXPECT_TRUE(not_in.satisfy_start_end_time(0, 10));
+}
+
+TEST(TimeInFilterTest, BatchTimeFallbackUsesScalarSemantics) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    int64_t times[] = {1, 2, 3, 7, 8, 9};
+    bool mask[6];
+    int pass = in.satisfy_batch_time(times, 6, mask);
+    EXPECT_EQ(pass, 2);
+    EXPECT_FALSE(mask[0]);
+    EXPECT_TRUE(mask[1]);
+    EXPECT_FALSE(mask[2]);
+    EXPECT_FALSE(mask[3]);
+    EXPECT_TRUE(mask[4]);
+    EXPECT_FALSE(mask[5]);
+}
diff --git a/cpp/test/reader/query_by_row_performance_test.cc b/cpp/test/reader/query_by_row_performance_test.cc
index 4caf26f71..051c15d87 100644
--- a/cpp/test/reader/query_by_row_performance_test.cc
+++ b/cpp/test/reader/query_by_row_performance_test.cc
@@ -60,6 +60,7 @@
 #include "file/write_file.h"
 #include "reader/tsfile_reader.h"
 #include "reader/tsfile_tree_reader.h"
+#include "utils/util_define.h"
 #include "writer/tsfile_table_writer.h"
 #include "writer/tsfile_tree_writer.h"
 
@@ -86,7 +87,8 @@ static int query_by_row_perf_iters() {
     return n;
 }
 
-static int compute_offset_with_env(int num_rows, int default_offset) {
+MAYBE_UNUSED static int compute_offset_with_env(int num_rows,
+                                                int default_offset) {
     int offset = default_offset;
     int abs = 0;
     if (get_env_int("QUERY_BY_ROW_PERF_OFFSET", abs)) {
diff --git a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
index e115552ec..6e2da1c40 100644
--- a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
+++ b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
@@ -133,6 +133,25 @@ class TsFileTableReaderBatchTest : public ::testing::Test {
                                column_categories);
     }
 
+    static TableSchema* gen_table_schema_with_string_field() {
+        std::vector<MeasurementSchema*> measurement_schemas;
+        std::vector<ColumnCategory> column_categories;
+        measurement_schemas.emplace_back(
+            new MeasurementSchema("id0", TSDataType::STRING, TSEncoding::PLAIN,
+                                  CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::TAG);
+        measurement_schemas.emplace_back(new MeasurementSchema(
+            "s_text", TSDataType::STRING, TSEncoding::PLAIN,
+            CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::FIELD);
+        measurement_schemas.emplace_back(
+            new MeasurementSchema("s_num", TSDataType::INT64, TSEncoding::PLAIN,
+                                  CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::FIELD);
+        return new TableSchema("testTableString", measurement_schemas,
+                               column_categories);
+    }
+
     static storage::Tablet gen_tablet(TableSchema* table_schema, int offset,
                                       int device_num,
                                       int num_timestamp_per_device = 10) {
@@ -171,6 +190,121 @@ class TsFileTableReaderBatchTest : public ::testing::Test {
         delete[] literal;
         return tablet;
     }
+
+    static storage::Tablet gen_tablet_with_string_field(
+        TableSchema* table_schema, int num_rows) {
+        storage::Tablet tablet(table_schema->get_table_name(),
+                               table_schema->get_measurement_names(),
+                               table_schema->get_data_types(),
+                               table_schema->get_column_categories(), num_rows);
+        for (int i = 0; i < num_rows; i++) {
+            tablet.add_timestamp(i, i);
+            tablet.add_value(i, "id0", "device_a");
+            tablet.add_value(i, "s_text", "value_" + std::to_string(i));
+            tablet.add_value(i, "s_num", static_cast<int64_t>(i * 10));
+        }
+        return tablet;
+    }
+
+    std::vector<int64_t> query_timestamps_in_batches(TableSchema* table_schema,
+                                                     int64_t start_time,
+                                                     int64_t end_time,
+                                                     int batch_size) {
+        storage::TsFileReader reader;
+        int ret = reader.open(file_name_);
+        EXPECT_EQ(ret, common::E_OK);
+
+        ResultSet* tmp_result_set = nullptr;
+        ret = reader.query(table_schema->get_table_name(),
+                           table_schema->get_measurement_names(), start_time,
+                           end_time, tmp_result_set, batch_size);
+        EXPECT_EQ(ret, common::E_OK);
+        EXPECT_NE(tmp_result_set, nullptr);
+
+        auto* table_result_set = dynamic_cast<TableResultSet*>(tmp_result_set);
+        EXPECT_NE(table_result_set, nullptr);
+
+        std::vector<int64_t> timestamps;
+        common::TsBlock* block = nullptr;
+        while ((ret = table_result_set->get_next_tsblock(block)) ==
+               common::E_OK) {
+            if (block == nullptr) {
+                ADD_FAILURE() << "Expected non-null TsBlock";
+                break;
+            }
+            common::RowIterator row_iterator(block);
+            while (row_iterator.has_next()) {
+                uint32_t len = 0;
+                bool null = false;
+                int64_t timestamp = *reinterpret_cast<const int64_t*>(
+                    row_iterator.read(0, &len, &null));
+                EXPECT_FALSE(null);
+                timestamps.push_back(timestamp);
+
+                for (uint32_t col_idx = 1;
+                     col_idx < row_iterator.get_column_count(); ++col_idx) {
+                    const char* value = row_iterator.read(col_idx, &len, &null);
+                    EXPECT_FALSE(null);
+                    if (row_iterator.get_data_type(col_idx) ==
+                        TSDataType::INT64) {
+                        int64_t int_val =
+                            *reinterpret_cast<const int64_t*>(value);
+                        EXPECT_EQ(int_val, 0);
+                    }
+                }
+                row_iterator.next();
+            }
+        }
+
+        reader.destroy_query_data_set(table_result_set);
+        EXPECT_EQ(reader.close(), common::E_OK);
+        return timestamps;
+    }
+
+    std::vector<std::pair<int64_t, std::string>> query_string_field_in_batches(
+        TableSchema* table_schema, int64_t start_time, int64_t end_time,
+        int batch_size) {
+        storage::TsFileReader reader;
+        int ret = reader.open(file_name_);
+        EXPECT_EQ(ret, common::E_OK);
+
+        ResultSet* tmp_result_set = nullptr;
+        ret = reader.query(table_schema->get_table_name(),
+                           table_schema->get_measurement_names(), start_time,
+                           end_time, tmp_result_set, batch_size);
+        EXPECT_EQ(ret, common::E_OK);
+        EXPECT_NE(tmp_result_set, nullptr);
+
+        auto* table_result_set = dynamic_cast<TableResultSet*>(tmp_result_set);
+        EXPECT_NE(table_result_set, nullptr);
+
+        std::vector<std::pair<int64_t, std::string>> result;
+        common::TsBlock* block = nullptr;
+        while ((ret = table_result_set->get_next_tsblock(block)) ==
+               common::E_OK) {
+            if (block == nullptr) {
+                ADD_FAILURE() << "Expected non-null TsBlock";
+                break;
+            }
+            common::RowIterator row_iterator(block);
+            while (row_iterator.has_next()) {
+                uint32_t len = 0;
+                bool null = false;
+                int64_t timestamp = *reinterpret_cast<const int64_t*>(
+                    row_iterator.read(0, &len, &null));
+                EXPECT_FALSE(null);
+
+                const char* value = row_iterator.read(2, &len, &null);
+                EXPECT_FALSE(null);
+                result.emplace_back(timestamp, std::string(value, len));
+                row_iterator.next();
+            }
+        }
+
+        reader.destroy_query_data_set(table_result_set);
+        EXPECT_EQ(reader.close(), common::E_OK);
+        return result;
+    }
 };
 
 TEST_F(TsFileTableReaderBatchTest, BatchQueryWithSmallBatchSize) {
@@ -361,6 +495,89 @@ TEST_F(TsFileTableReaderBatchTest, BatchQueryVerifyDataCorrectness) {
     delete table_schema;
 }
 
+TEST_F(TsFileTableReaderBatchTest,
+       BatchQueryKeepsStateAcrossTsBlocksWithinPage) {
+    auto table_schema = gen_table_schema();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 128;
+
+    const int device_num = 1;
+    const int points_per_device = 35;
+    auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    const int batch_size = 8;
+    std::vector<int64_t> timestamps = query_timestamps_in_batches(
+        table_schema, 0, 1000000000000LL, batch_size);
+
+    ASSERT_EQ(timestamps.size(), static_cast<size_t>(points_per_device));
+    for (int64_t i = 0; i < points_per_device; ++i) {
+        EXPECT_EQ(timestamps[i], i);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
+TEST_F(TsFileTableReaderBatchTest, BatchQueryTimeFilterAcrossBoundaryPages) {
+    auto table_schema = gen_table_schema();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    const int device_num = 1;
+    const int points_per_device = 25;
+    auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    const int batch_size = 4;
+    std::vector<int64_t> timestamps =
+        query_timestamps_in_batches(table_schema, 5, 18, batch_size);
+
+    ASSERT_EQ(timestamps.size(), static_cast<size_t>(14));
+    for (int64_t i = 0; i < 14; ++i) {
+        EXPECT_EQ(timestamps[i], i + 5);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
+TEST_F(TsFileTableReaderBatchTest,
+       BatchQueryVariableLengthFieldAcrossTsBlocks) {
+    auto table_schema = gen_table_schema_with_string_field();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    const int num_rows = 23;
+    auto tablet = gen_tablet_with_string_field(table_schema, num_rows);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    auto result = query_string_field_in_batches(table_schema, 0, INT64_MAX, 5);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, "value_" + std::to_string(i));
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
 TEST_F(TsFileTableReaderBatchTest, PerformanceComparisonSinglePointVsBatch) {
     // Create table schema without tags (only fields)
     auto table_schema = gen_table_schema_no_tag();
diff --git a/cpp/test/reader/table_view/tsfile_reader_table_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
index e55f34c2a..be0a6f64c 100644
--- a/cpp/test/reader/table_view/tsfile_reader_table_test.cc
+++ b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
@@ -209,6 +209,43 @@ class TsFileTableReaderTest : public ::testing::Test {
 
 TEST_F(TsFileTableReaderTest, TableModelQuery) { test_table_model_query(); }
 
+// Regression: single_device_tsblock_reader used to initialise all_outside
+// to true, then bail out when the per-device chunk-list loop didn't
+// execute (e.g. time-only query where time_series_indexs is empty).  The
+// result was an empty resultset whenever a time filter was present, even
+// though there might be rows that satisfy it.  Verify that querying only
+// the time column with a tight filter still returns the matching rows.
+TEST_F(TsFileTableReaderTest, TimeOnlyQueryWithTimeFilterStillReturnsRows) {
+    auto table_schema = gen_table_schema(0);
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+    auto tablet = gen_tablet(table_schema, /*start_ts=*/0, /*device_num=*/1,
+                             /*per_device=*/10);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), common::E_OK);
+    ResultSet* tmp = nullptr;
+    // Query with an empty measurement list and a time window covering all
+    // 10 timestamps.  Under the bug this returned 0 rows.
+    std::vector<std::string> empty_cols;
+    ASSERT_EQ(reader.query(table_schema->get_table_name(), empty_cols,
+                           /*start_time=*/0, /*end_time=*/9, tmp),
+              common::E_OK);
+    auto* rs = (TableResultSet*)tmp;
+    int rows = 0;
+    bool hn = false;
+    while (IS_SUCC(rs->next(hn)) && hn) {
+        rows++;
+    }
+    EXPECT_EQ(rows, 10);
+    reader.destroy_query_data_set(rs);
+    ASSERT_EQ(reader.close(), common::E_OK);
+    delete table_schema;
+}
+
 TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) {
     int prev_config = g_config_value_.page_writer_max_point_num_;
     g_config_value_.page_writer_max_point_num_ = 5;
@@ -216,11 +253,13 @@ TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) {
     g_config_value_.page_writer_max_point_num_ = prev_config;
 }
 
-// Triggers memory-based seal in aligned table: time page seals by size while
-// value pages may not; ensure value pages are sealed together with time (no
-// time-page-sealed / value-page-not-sealed inconsistency).
-// Use 512 bytes so time seals by size before point count; 128 was too small
-// and could produce misaligned time/value pages on some encodings.
+TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) {
+    int prev_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 10000;
+    test_table_model_query(g_config_value_.page_writer_max_point_num_);
+    g_config_value_.page_writer_max_point_num_ = prev_config;
+}
+
 TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) {
     uint32_t prev_point_num = g_config_value_.page_writer_max_point_num_;
     uint32_t prev_mem_bytes = g_config_value_.page_writer_max_memory_bytes_;
@@ -231,13 +270,6 @@ TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) {
     g_config_value_.page_writer_max_memory_bytes_ = prev_mem_bytes;
 }
 
-TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) {
-    int prev_config = g_config_value_.page_writer_max_point_num_;
-    g_config_value_.page_writer_max_point_num_ = 10000;
-    test_table_model_query(g_config_value_.page_writer_max_point_num_);
-    g_config_value_.page_writer_max_point_num_ = prev_config;
-}
-
 TEST_F(TsFileTableReaderTest, TableModelQueryMultiLargePage) {
     int prev_config = g_config_value_.page_writer_max_point_num_;
     g_config_value_.page_writer_max_point_num_ = 10000;
@@ -1221,4 +1253,4 @@ TEST_F(TsFileTableReaderTest, MultiTagColumnFilterOnSecondTag) {
     ASSERT_EQ(reader.close(), common::E_OK);
     delete table_schema;
     delete tag_filter;
-}
\ No newline at end of file
+}
diff --git a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
index 026f75b2d..9e3d9b562 100644
--- a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
+++ b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
@@ -27,7 +27,6 @@
 #include "common/schema.h"
 #include "common/tablet.h"
 #include "file/write_file.h"
-#include "reader/filter/tag_filter.h"
 #include "reader/table_result_set.h"
 #include "reader/tsfile_reader.h"
 #include "writer/tsfile_table_writer.h"
@@ -103,6 +102,41 @@ class TableQueryByRowTest : public ::testing::Test {
         delete schema;
     }
 
+    void write_single_device_file_with_string_field(int num_rows) {
+        std::vector<ColumnSchema> col_schemas = {
+            ColumnSchema("id1", TSDataType::STRING,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::TAG),
+            ColumnSchema("s_text", TSDataType::STRING,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::FIELD),
+            ColumnSchema("s_num", TSDataType::INT64,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::FIELD),
+        };
+        auto* schema = new TableSchema("t_string", col_schemas);
+        auto* writer = new TsFileTableWriter(&write_file_, schema);
+
+        Tablet tablet(
+            "t_string", {"id1", "s_text", "s_num"},
+            {TSDataType::STRING, TSDataType::STRING, TSDataType::INT64},
+            {ColumnCategory::TAG, ColumnCategory::FIELD, ColumnCategory::FIELD},
+            num_rows);
+
+        for (int i = 0; i < num_rows; i++) {
+            tablet.add_timestamp(i, static_cast<int64_t>(i));
+            tablet.add_value(i, "id1", "device_a");
+            tablet.add_value(i, "s_text", "value_" + std::to_string(i));
+            tablet.add_value(i, "s_num", static_cast<int64_t>(i * 10));
+        }
+
+        ASSERT_EQ(writer->write_table(tablet), E_OK);
+        ASSERT_EQ(writer->flush(), E_OK);
+        ASSERT_EQ(writer->close(), E_OK);
+        delete writer;
+        delete schema;
+    }
+
     void write_multi_device_file(int rows_per_device, int device_count) {
         std::vector<ColumnSchema> col_schemas = {
             ColumnSchema("id1", TSDataType::STRING,
@@ -341,6 +375,29 @@ class TableQueryByRowTest : public ::testing::Test {
         return manual;
     }
 
+    std::vector<std::pair<int64_t, std::string>> query_by_row_time_and_text(
+        const std::string& table_name, const std::vector<std::string>& cols,
+        int offset, int limit) {
+        TsFileReader reader;
+        EXPECT_EQ(reader.open(file_name_), E_OK);
+        ResultSet* rs = nullptr;
+        EXPECT_EQ(reader.queryByRow(table_name, cols, offset, limit, rs), E_OK);
+        EXPECT_NE(rs, nullptr);
+
+        std::vector<std::pair<int64_t, std::string>> result;
+        bool has_next = false;
+        while (IS_SUCC(rs->next(has_next)) && has_next) {
+            int64_t time = rs->get_value<int64_t>("time");
+            common::String* text_val = rs->get_value<common::String*>("s_text");
+            result.emplace_back(time,
+                                std::string(text_val->buf_, text_val->len_));
+        }
+
+        reader.destroy_query_data_set(rs);
+        reader.close();
+        return result;
+    }
+
     std::string file_name_;
     WriteFile write_file_;
 };
@@ -356,6 +413,23 @@ TEST_F(TableQueryByRowTest, NoOffsetNoLimit) {
     ASSERT_EQ(result, all);
 }
 
+TEST_F(TableQueryByRowTest, NoOffsetNoLimitWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 25;
+    write_single_device_file(num_rows);
+
+    auto result = query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, 0, -1);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, i * 10);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
 // Offset only: skip first N rows, return the rest; limit=-1 means no cap.
 TEST_F(TableQueryByRowTest, OffsetOnly) {
     int num_rows = 50;
@@ -399,6 +473,43 @@ TEST_F(TableQueryByRowTest, OffsetAndLimit) {
     }
 }
 
+TEST_F(TableQueryByRowTest, OffsetAndLimitWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 40;
+    write_single_device_file(num_rows);
+
+    int offset = 7;
+    int limit = 19;
+    auto by_row =
+        query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit);
+    auto manual =
+        query_manual_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit);
+
+    ASSERT_EQ(by_row, manual);
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
+TEST_F(TableQueryByRowTest, VariableLengthFieldWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 21;
+    write_single_device_file_with_string_field(num_rows);
+
+    auto result = query_by_row_time_and_text("t_string",
+                                             {"id1", "s_text", "s_num"}, 0, -1);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, "value_" + std::to_string(i));
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
 // Offset beyond total row count: returns empty result.
 TEST_F(TableQueryByRowTest, OffsetBeyondData) {
     int num_rows = 30;
@@ -652,15 +763,16 @@ TEST_F(TableQueryByRowTest, DenseSingleDeviceSsiLevelPushdown) {
 
 // Pushdown is faster than full query + manual next: queryByRow(offset, limit)
 // skips at device/SSI/Chunk level; old query then manual next decodes every
-// row. Timing tolerance 20% to allow measurement noise.
+// row. Timing tolerance 5% to allow measurement noise.
 TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
-    const int num_rows = 8000;
-    const int offset = 3000;
+    const int num_rows = 80000;
+    const int offset = 30000;
     const int limit = 1000;
     write_single_device_file(num_rows);
 
     const int num_iters = 5;
-    const double tolerance = 0.2;
+    const double tolerance =
+        0.5;  // 50% tolerance for cross-platform timing noise
 
     auto run_query_by_row = [this, offset, limit]() {
         TsFileReader reader;
@@ -725,47 +837,3 @@ TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
            "(min_by_row="
         << min_by_row << " ms, min_manual=" << min_manual << " ms)";
 }
-
-// queryByRow with tag filter: only rows matching the tag predicate are
-// returned.
-TEST_F(TableQueryByRowTest, TagFilterEq) {
-    int rows_per_device = 20;
-    int device_count = 3;
-    write_multi_device_file(rows_per_device, device_count);
-
-    // Reconstruct the same schema used by write_multi_device_file.
-    std::vector<ColumnSchema> col_schemas = {
-        ColumnSchema("id1", TSDataType::STRING, CompressionType::UNCOMPRESSED,
-                     TSEncoding::PLAIN, ColumnCategory::TAG),
-        ColumnSchema("s1", TSDataType::INT64, CompressionType::UNCOMPRESSED,
-                     TSEncoding::PLAIN, ColumnCategory::FIELD),
-    };
-    TableSchema schema("t1", col_schemas);
-
-    // Build tag filter: id1 == "dev1"
-    TagFilterBuilder builder(&schema);
-    Filter* tag_filter = builder.eq("id1", "dev1");
-
-    TsFileReader reader;
-    ASSERT_EQ(reader.open(file_name_), E_OK);
-
-    ResultSet* rs = nullptr;
-    ASSERT_EQ(reader.queryByRow("t1", {"id1", "s1"}, 0, -1, rs, tag_filter),
-              E_OK);
-    ASSERT_NE(rs, nullptr);
-
-    std::vector<int64_t> filtered_s1;
-    bool has_next = false;
-    while (IS_SUCC(rs->next(has_next)) && has_next) {
-        filtered_s1.push_back(rs->get_value<int64_t>("s1"));
-    }
-    reader.destroy_query_data_set(rs);
-    reader.close();
-    delete tag_filter;
-
-    // dev1 has rows_per_device rows with s1 = 1*1000+t for t in [0,20).
-    ASSERT_EQ(filtered_s1.size(), static_cast<size_t>(rows_per_device));
-    for (int t = 0; t < rows_per_device; t++) {
-        EXPECT_EQ(filtered_s1[t], static_cast<int64_t>(1 * 1000 + t));
-    }
-}
diff --git a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
index 8181b6130..e4daed748 100644
--- a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
+++ b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
@@ -509,3 +509,48 @@ TEST_F(TsFileTreeReaderTest, QueryTableOnTreeMissingMeasurement) {
     }
     reader.close();
 }
+
+// Regression: query_table_on_tree with an inverted time range (start > end) on
+// a non-aligned tree device must yield zero rows, not E_NOT_SUPPORT.  The chunk
+// time span straddles both bounds and single-chunk timeseries carry no
+// per-chunk statistic, so the device-level early-skip does NOT short-circuit;
+// the empty value-column result previously fell through to the time-only
+// fallback -> alloc_multi_ssi() (aligned-only) -> E_NOT_SUPPORT.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeInvertedTimeRange) {
+    std::string device_id = "root.Device1";
+    std::vector<std::string> measurement_ids = {"m1", "m2", "m3"};
+    {
+        TsFileTreeWriter writer(&write_file_);
+        for (auto const& m : measurement_ids) {
+            auto* schema = new storage::MeasurementSchema(m, TSDataType::INT32);
+            ASSERT_EQ(E_OK, writer.register_timeseries(device_id, schema));
+            delete schema;
+        }
+        for (int i = 0; i < 100; i++) {
+            TsRecord record(device_id, static_cast<int64_t>(i - 50));
+            for (auto const& m : measurement_ids) {
+                record.add_point(m, static_cast<int32_t>(i));
+            }
+            ASSERT_EQ(E_OK, writer.write(record));
+        }
+        writer.flush();
+        writer.close();
+    }
+
+    TsFileReader reader;
+    ASSERT_EQ(E_OK, reader.open(file_name_));
+    ResultSet* result = nullptr;
+    int ret = reader.query_table_on_tree(measurement_ids, 10, -10, result);
+    ASSERT_EQ(E_OK, ret);
+    auto* trs = (storage::TableResultSet*)result;
+    bool has_next = false;
+    int row_cnt = 0;
+    int next_ret = E_OK;
+    while (IS_SUCC(next_ret = trs->next(has_next)) && has_next) {
+        row_cnt++;
+    }
+    EXPECT_EQ(E_OK, next_ret);
+    EXPECT_EQ(0, row_cnt);
+    reader.destroy_query_data_set(result);
+    reader.close();
+}
diff --git a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
index a686b8998..9c47a9d4d 100644
--- a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
+++ b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include <fcntl.h>
 #include <gtest/gtest.h>
 
 #include <chrono>
@@ -25,12 +24,10 @@
 #include "common/global.h"
 #include "common/record.h"
 #include "common/schema.h"
-#include "common/tablet.h"
 #include "file/write_file.h"
 #include "reader/tsfile_reader.h"
 #include "reader/tsfile_tree_reader.h"
 #include "writer/tsfile_tree_writer.h"
-#include "writer/tsfile_writer.h"
 
 using namespace storage;
 using namespace common;
@@ -210,6 +207,90 @@ class TreeQueryByRowTest : public ::testing::Test {
     WriteFile write_file_;
 };
 
+// Regression: aligned value chunks store statistic_->count_ as the
+// non-null row count, not the total row count.  Whole-chunk offset skip
+// used to apply value_cm's count, so a sparse aligned chunk with 100 rows
+// and 10 non-nulls would jump over all 100 rows on offset=10 — leaving
+// the next chunks completely unread.  The fix only takes the whole-chunk
+// shortcut when time and value statistics agree on the row count, falling
+// through to per-row offset handling otherwise.
+TEST_F(TreeQueryByRowTest, SparseAlignedChunkOffsetCrossesChunks) {
+    using namespace storage;
+    libtsfile_destroy();
+    libtsfile_init();
+    remove(file_name_.c_str());
+
+    // Tighten per-chunk capacity so two write_tablet_aligned calls produce
+    // two distinct aligned chunks (rather than being merged into one).
+    uint32_t prev_chunk_thresh = g_config_value_.chunk_group_size_threshold_;
+    g_config_value_.chunk_group_size_threshold_ = 64;
+    int64_t prev_record_check =
+        g_config_value_.record_count_for_next_mem_check_;
+    g_config_value_.record_count_for_next_mem_check_ = 1;
+
+    {
+        TsFileWriter writer;
+        int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+        flags |= O_BINARY;
+#endif
+        ASSERT_EQ(writer.open(file_name_, flags, 0666), E_OK);
+        const std::string device = "sparse_dev";
+        std::vector<MeasurementSchema*> reg;
+        reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+        writer.register_aligned_timeseries(device, reg);
+
+        // First aligned chunk: 20 timestamps but only every 4th row has a
+        // non-null value column (5 non-nulls).  Flush.
+        for (int i = 0; i < 20; i++) {
+            TsRecord r(static_cast<int64_t>(i), device);
+            DataPoint p("v0");
+            if (i % 4 == 0) p.set_i64(static_cast<int64_t>(i));
+            r.points_.push_back(p);
+            ASSERT_EQ(writer.write_record_aligned(r), E_OK);
+        }
+        ASSERT_EQ(writer.flush(), E_OK);
+
+        // Second aligned chunk: 20 more timestamps, every value non-null
+        // (all 20 non-nulls).
+        for (int i = 20; i < 40; i++) {
+            TsRecord r(static_cast<int64_t>(i), device);
+            DataPoint p("v0");
+            p.set_i64(static_cast<int64_t>(i));
+            r.points_.push_back(p);
+            ASSERT_EQ(writer.write_record_aligned(r), E_OK);
+        }
+        ASSERT_EQ(writer.flush(), E_OK);
+        ASSERT_EQ(writer.close(), E_OK);
+    }
+    g_config_value_.chunk_group_size_threshold_ = prev_chunk_thresh;
+    g_config_value_.record_count_for_next_mem_check_ = prev_record_check;
+
+    // Query with offset=10 — enough to fully cover the first chunk's 5
+    // non-null statistic-reported rows, but NOT enough to cover the
+    // chunk's 20 actual rows.  Under the bug the entire first chunk was
+    // skipped, and offset_=10-5=5 would land 5 rows into the second
+    // chunk, returning rows 25..39 (15 rows).  With the fix the first
+    // chunk is decoded, 10 rows are eaten, leaving rows 10..39 (30 rows).
+    TsFileTreeReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<std::string> devices = {"sparse_dev"};
+    std::vector<std::string> measurements = {"v0"};
+    ResultSet* result = nullptr;
+    ASSERT_EQ(reader.queryByRow(devices, measurements, 10, -1, result), E_OK);
+    ASSERT_NE(result, nullptr);
+
+    auto timestamps = collect_timestamps(result);
+    EXPECT_EQ(timestamps.size(), static_cast<size_t>(30));
+    if (timestamps.size() == 30) {
+        for (size_t i = 0; i < timestamps.size(); i++) {
+            EXPECT_EQ(timestamps[i], static_cast<int64_t>(i + 10));
+        }
+    }
+    reader.destroy_query_data_set(result);
+    reader.close();
+}
+
 // Basic test: queryByRow returns correct total count with no offset/limit.
 TEST_F(TreeQueryByRowTest, NoOffsetNoLimit) {
     std::vector<std::string> devices = {"d1"};
@@ -1310,7 +1391,8 @@ TEST_F(TreeQueryByRowTest, MultiPath_TimeHint_SkipsStaleChunk_WithOffset) {
 
 // Pushdown is faster than full query + manual next: queryByRow(offset, limit)
 // skips at Chunk/Page level; old query then manual next decodes every row.
-// Timing tolerance 20% to allow measurement noise.
+// Use the same 50% tolerance as the table-view sibling test for cross-platform
+// timing noise; the test is DISABLED_ and intended for manual runs.
 TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
     std::vector<std::string> devices = {"d1"};
     std::vector<std::string> measurements = {"s1"};
@@ -1320,7 +1402,8 @@ TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
     write_test_file(devices, measurements, num_rows);
 
     const int num_iters = 5;
-    const double tolerance = 0.2;
+    const double tolerance =
+        0.5;  // 50% tolerance for cross-platform timing noise
 
     auto run_query_by_row = [this, &devices, &measurements, offset, limit]() {
         TsFileTreeReader reader;
diff --git a/cpp/test/reader/tsfile_reader_test.cc b/cpp/test/reader/tsfile_reader_test.cc
index 08cda6e31..5f50724c4 100644
--- a/cpp/test/reader/tsfile_reader_test.cc
+++ b/cpp/test/reader/tsfile_reader_test.cc
@@ -29,9 +29,14 @@
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
+#include "common/tsblock/tsblock.h"
+#include "file/tsfile_io_reader.h"
 #include "file/tsfile_io_writer.h"
 #include "file/write_file.h"
+#include "reader/block/single_device_tsblock_reader.h"
+#include "reader/filter/time_operator.h"
 #include "reader/qds_without_timegenerator.h"
+#include "reader/tsfile_series_scan_iterator.h"
 #include "writer/tsfile_writer.h"
 
 using namespace storage;
@@ -395,3 +400,596 @@ TEST_F(TsFileReaderTest, GetTimeseriesMetadataTableModelTypeAndDeviceFilter) {
 
     reader.close();
 }
+
+static const int64_t kLargeFileNumRecords = 300000000;
+static const int64_t kLargeFileFlushBatch = 100000;
+
+TEST_F(TsFileReaderTest,
+       DISABLED_LargeFileNoEncodingNoCompression_WriteAndRead) {
+    std::string device_path = "device1";
+    std::string measurement_name = "temperature";
+    common::TSDataType data_type = common::TSDataType::INT64;
+    common::TSEncoding encoding = common::TSEncoding::PLAIN;
+    common::CompressionType compression_type =
+        common::CompressionType::UNCOMPRESSED;
+
+    tsfile_writer_->register_timeseries(
+        device_path, storage::MeasurementSchema(measurement_name, data_type,
+                                                encoding, compression_type));
+
+    const int64_t start_time = 1622505600000LL;
+    for (int64_t i = 0; i < kLargeFileNumRecords; ++i) {
+        TsRecord record(start_time + i * 1000, device_path);
+        record.add_point(measurement_name, static_cast<int64_t>(i));
+        ASSERT_EQ(tsfile_writer_->write_record(record), E_OK);
+        if ((i + 1) % kLargeFileFlushBatch == 0) {
+            ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+        }
+    }
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    std::vector<std::string> select_list = {"device1.temperature"};
+    const int64_t end_time = start_time + (kLargeFileNumRecords - 1) * 1000 + 1;
+
+    storage::TsFileReader reader;
+    int ret = reader.open(file_name_);
+    ASSERT_EQ(ret, common::E_OK);
+
+    storage::ResultSet* tmp_qds = nullptr;
+    ret = reader.query(select_list, start_time, end_time, tmp_qds);
+    ASSERT_EQ(ret, common::E_OK);
+    ASSERT_NE(tmp_qds, nullptr);
+
+    auto* qds = static_cast<QDSWithoutTimeGenerator*>(tmp_qds);
+    std::shared_ptr<ResultSetMetadata> meta = qds->get_metadata();
+    ASSERT_NE(meta, nullptr);
+    ASSERT_EQ(meta->get_column_type(1), INT64);
+    ASSERT_EQ(meta->get_column_type(2), INT64);
+
+    int64_t row_count = 0;
+    bool has_next = false;
+
+    while (true) {
+        ret = qds->next(has_next);
+        ASSERT_EQ(ret, common::E_OK);
+        if (!has_next) break;
+        row_count++;
+    }
+
+    ASSERT_EQ(row_count, kLargeFileNumRecords);
+
+    reader.destroy_query_data_set(qds);
+    reader.close();
+}
+
+// Multi-value aligned chunk reader doesn't honour row_offset / row_limit /
+// min_time_hint pushdown — silently dropping those args would hand the caller
+// full-chunk data when it asked for a sub-range.  The guard at the top of
+// AlignedChunkReader::get_next_page must turn the unsupported combination
+// into an explicit E_NOT_SUPPORT.
+TEST_F(TsFileReaderTest, MultiValueAlignedRowOffsetReturnsNotSupport) {
+    const std::string device = "root.dev_multi_offset";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 32;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    // row_offset > 0 hits the multi-value guard at the top of
+    // AlignedChunkReader::get_next_page; the SSI propagates the error code.
+    ssi->set_row_range(/*offset=*/5, /*limit=*/-1);
+    common::TsBlock* block = nullptr;
+    EXPECT_EQ(ssi->get_next(block, /*alloc_tsblock=*/true),
+              common::E_NOT_SUPPORT);
+
+    if (block != nullptr) {
+        ssi->revert_tsblock();
+    }
+    io_reader.revert_ssi(ssi);
+    // RAII handles io_reader teardown — explicit reset() would destroy the
+    // tsfile_meta page arena while tsfile_meta_ still holds shared_ptrs into
+    // it, then ~TsFileMeta would call self_deleter on freed memory.
+}
+
+namespace storage {
+// Subclass that lets the test (a) inject an error from the next-tsblock load
+// and (b) wire a manually constructed TsBlock into the inherited iterator
+// fields, so we can exercise the end-of-block branch of skip_rows()
+// deterministically.  The base destructor calls revert_ssi(nullptr), which
+// short-circuits safely; we hand it a default-constructed (never-init'd)
+// TsFileIOReader purely to satisfy the constructor.
+class FaultySingleMeasurementColumnContext
+    : public SingleMeasurementColumnContext {
+   public:
+    using SingleMeasurementColumnContext::SingleMeasurementColumnContext;
+    int get_next_tsblock_ret_ = common::E_OK;
+    int get_next_tsblock_calls_ = 0;
+    int get_next_tsblock(bool /*alloc_mem*/) override {
+        ++get_next_tsblock_calls_;
+        return get_next_tsblock_ret_;
+    }
+    void prime_iters_for_block(common::TsBlock* tsb) {
+        tsblock_ = tsb;
+        time_iter_ = new common::ColIterator(0, tsb);
+        value_iter_ = new common::ColIterator(1, tsb);
+    }
+};
+}  // namespace storage
+
+// Regression: skip_rows() used to be a void method that called
+// get_next_tsblock(false) for its side effects when the current block ran
+// out.  An IO/decode error from that call was silently swallowed and the
+// outer reader treated the source as exhausted, returning fewer rows than
+// requested with no error indication.  skip_rows() now returns int and must
+// surface hard errors (E_NO_MORE_DATA is the legitimate EOF and stays
+// suppressed).
+TEST_F(TsFileReaderTest,
+       SingleMeasurementSkipRowsPropagatesGetNextTsBlockError) {
+    common::TupleDesc desc;
+    desc.push_back(common::ColumnSchema("time", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    desc.push_back(common::ColumnSchema("v0", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    common::TsBlock tsb(&desc, 4);
+    ASSERT_EQ(tsb.init(), common::E_OK);
+    common::RowAppender ra(&tsb);
+    for (int i = 0; i < 2; i++) {
+        ASSERT_TRUE(ra.add_row());
+        int64_t t = 1000 + i;
+        int64_t v = i;
+        ra.append(0, reinterpret_cast<const char*>(&t), sizeof(int64_t));
+        ra.append(1, reinterpret_cast<const char*>(&v), sizeof(int64_t));
+    }
+
+    storage::TsFileIOReader io_reader_stub;
+    storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub);
+    ctx.prime_iters_for_block(&tsb);
+
+    // Hard error: skip_rows must propagate.
+    ctx.get_next_tsblock_ret_ = common::E_INVALID_ARG;
+    EXPECT_EQ(ctx.skip_rows(2), common::E_INVALID_ARG);
+    EXPECT_EQ(ctx.get_next_tsblock_calls_, 1);
+}
+
+TEST_F(TsFileReaderTest, SingleMeasurementSkipRowsSwallowsEndOfStream) {
+    common::TupleDesc desc;
+    desc.push_back(common::ColumnSchema("time", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    desc.push_back(common::ColumnSchema("v0", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    common::TsBlock tsb(&desc, 4);
+    ASSERT_EQ(tsb.init(), common::E_OK);
+    common::RowAppender ra(&tsb);
+    for (int i = 0; i < 2; i++) {
+        ASSERT_TRUE(ra.add_row());
+        int64_t t = 1000 + i;
+        int64_t v = i;
+        ra.append(0, reinterpret_cast<const char*>(&t), sizeof(int64_t));
+        ra.append(1, reinterpret_cast<const char*>(&v), sizeof(int64_t));
+    }
+
+    storage::TsFileIOReader io_reader_stub;
+    storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub);
+    ctx.prime_iters_for_block(&tsb);
+
+    // EOF: skip_rows must squash to E_OK so the outer loop notices via
+    // available_rows() instead of bubbling the EOF up as a query failure.
+    ctx.get_next_tsblock_ret_ = common::E_NO_MORE_DATA;
+    EXPECT_EQ(ctx.skip_rows(2), common::E_OK);
+    EXPECT_EQ(ctx.get_next_tsblock_calls_, 1);
+}
+
+// Regression: the multi-value aligned batch loop required the destination
+// TsBlock to have >= BATCH (=129) rows of free capacity, otherwise it
+// returned E_OVERFLOW immediately and the SSI surfaced that error to the
+// caller.  When tsblock_max_memory_ is small enough to land max_row_count_
+// below 129 (e.g. very small per-block memory in low-RAM configs) no rows
+// could ever be decoded.  The fix caps the batch by remaining capacity,
+// matching ChunkReader's per-type batch loops.
+TEST_F(TsFileReaderTest, MultiValueAlignedProgressesWithSmallTsBlock) {
+    const std::string device = "root.dev_multi_small_block";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 200;  // > BATCH (129) so the batch loop iterates twice
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Force max_row_count_ below BATCH: ~2 KB / 24 B per row → ~85 rows.
+    // Also force the multi_DECODE_TV_BATCH path by disabling parallel reads:
+    // with a thread pool the chunk-level pre-decode shortcut would otherwise
+    // run for any multi-column query (no upper column-count cutoff anymore).
+    uint32_t prev_capacity = common::g_config_value_.tsblock_max_memory_;
+    bool prev_parallel = common::g_config_value_.parallel_read_enabled_;
+    struct Guard {
+        uint32_t cap;
+        bool par;
+        ~Guard() {
+            common::g_config_value_.tsblock_max_memory_ = cap;
+            common::g_config_value_.parallel_read_enabled_ = par;
+        }
+    } guard{prev_capacity, prev_parallel};
+    common::g_config_value_.tsblock_max_memory_ = 2048;
+    common::g_config_value_.parallel_read_enabled_ = false;
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    int collected = 0;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        ASSERT_GT(block->get_max_row_count(), 0u);
+        ASSERT_LT(block->get_max_row_count(), 129u);
+        collected += static_cast<int>(block->get_row_count());
+        ssi->revert_tsblock();
+    }
+    EXPECT_EQ(collected, N);
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Regression: when a whole batch is filtered out, multi_DECODE_TV_BATCH skips
+// the non-null value bytes for each column.  The old code ignored the skip
+// return code and the `skipped` count, so a short/truncated page could leave
+// the decoder mid-value; subsequent batches would then read garbage bytes as
+// values.  This test exercises an intact page: the filter rejects rows
+// 0..127 (one full batch worth), then the rows after must come back with
+// their *correct* values — proving the decoder advanced exactly nonnull_count
+// values, not some smaller number that would shift the value alignment.
+TEST_F(TsFileReaderTest, MultiValueAlignedSkipsBatchPreservesValueAlignment) {
+    const std::string device = "root.dev_multi_skip_align";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    // Two batches' worth of rows so the filter skips the first batch entirely
+    // and decodes the second.
+    const int N = 200;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        // Distinctive value pattern: i and 1000000 + i.  If skip
+        // mis-advances the decoder by even one value, the v0/v1 read after
+        // the skip will land on the wrong row's bytes.
+        ASSERT_EQ(tablet.add_timestamp(i, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(1000000 + i)),
+                  E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    bool prev_parallel = common::g_config_value_.parallel_read_enabled_;
+    struct Guard {
+        bool par;
+        ~Guard() { common::g_config_value_.parallel_read_enabled_ = par; }
+    } guard{prev_parallel};
+    // Force the multi_DECODE_TV_BATCH path (the chunk-level shortcut would
+    // bypass the skip branch we want to exercise).
+    common::g_config_value_.parallel_read_enabled_ = false;
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+
+    // TimeIn filter selecting only rows 130..139 — entirely past the first
+    // 129-row batch, so the first batch hits the pass_count==0 skip branch
+    // for both value columns.
+    std::vector<int64_t> want;
+    for (int i = 130; i < 140; ++i) want.push_back(i);
+    storage::TimeIn time_filter(want, /*not_in=*/false);
+
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        &time_filter),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    std::vector<std::pair<int64_t, int64_t>> got;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true, &time_filter);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        // Columns: time, v0, v1.
+        common::ColIterator t_iter(0, block);
+        common::ColIterator v0_iter(1, block);
+        common::ColIterator v1_iter(2, block);
+        const uint32_t rows = block->get_row_count();
+        for (uint32_t r = 0; r < rows; ++r) {
+            uint32_t len = 0;
+            int64_t t = *reinterpret_cast<int64_t*>(t_iter.read(&len));
+            int64_t v0 = *reinterpret_cast<int64_t*>(v0_iter.read(&len));
+            int64_t v1 = *reinterpret_cast<int64_t*>(v1_iter.read(&len));
+            got.push_back({t, v0});
+            // The decoder must have advanced exactly nonnull_count values
+            // when it skipped batch #1.  If it under-advanced (the latent
+            // bug), v1 would land on the wrong row's bytes here.
+            EXPECT_EQ(v1, 1000000 + t);
+            EXPECT_EQ(v0, t);
+            t_iter.next();
+            v0_iter.next();
+            v1_iter.next();
+        }
+        ssi->revert_tsblock();
+    }
+
+    ASSERT_EQ(got.size(), want.size());
+    for (size_t i = 0; i < got.size(); ++i) {
+        EXPECT_EQ(got[i].first, want[i]);
+        EXPECT_EQ(got[i].second, want[i]);
+    }
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Coverage: an aligned read with > 6 value columns now takes the chunk-level
+// parallel decode path (decode_all_planned_pages) exactly like the 2..6 column
+// case — the old "<= 6 columns" dispatch cutoff that sent wide chunks down the
+// per-page serial path is gone.  With libtsfile_init() having built the global
+// pool and parallel_read_enabled_ on by default, an 8-column query exercises
+// that path end-to-end; each column carries a disjoint value range so any
+// cross-column misalignment in the wide chunk-level decode would be caught.
+TEST_F(TsFileReaderTest, MultiValueAlignedWideChunkParallelDecode) {
+    const std::string device = "root.dev_multi_wide";
+    const uint32_t kCols = 8;  // > 6: previously bypassed the chunk-level path
+    std::vector<MeasurementSchema> schema_vec;
+    for (uint32_t c = 0; c < kCols; ++c) {
+        schema_vec.emplace_back("v" + std::to_string(c), INT64, PLAIN,
+                                UNCOMPRESSED);
+    }
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 200;  // > BATCH (129) so the decode loop iterates more once
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    // Row i, column c carries c * 1000000 + i so each column's values occupy a
+    // disjoint range; a wide-chunk decode that crossed column boundaries would
+    // surface as a value landing in the wrong column's range.
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        for (uint32_t c = 0; c < kCols; ++c) {
+            ASSERT_EQ(
+                tablet.add_value(i, c, static_cast<int64_t>(c * 1000000 + i)),
+                E_OK);
+        }
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // parallel_read_enabled_ defaults to true and SetUp() ran libtsfile_init(),
+    // so the SSI hands the AlignedChunkReader the global pool; with 8 value
+    // columns (> 1) the reader takes the chunk-level decode path.
+    ASSERT_TRUE(common::g_config_value_.parallel_read_enabled_);
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements;
+    for (uint32_t c = 0; c < kCols; ++c)
+        measurements.push_back("v" + std::to_string(c));
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    int collected = 0;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        const uint32_t rows = block->get_row_count();
+
+        common::ColIterator t_iter(0, block);
+        std::vector<int64_t> times;
+        times.reserve(rows);
+        for (uint32_t r = 0; r < rows; ++r) {
+            uint32_t len = 0;
+            times.push_back(*reinterpret_cast<int64_t*>(t_iter.read(&len)));
+            t_iter.next();
+        }
+        // One independent iterator per value column so we never rely on
+        // vector<ColIterator> being movable.
+        for (uint32_t c = 0; c < kCols; ++c) {
+            common::ColIterator it(c + 1, block);
+            for (uint32_t r = 0; r < rows; ++r) {
+                uint32_t len = 0;
+                int64_t v = *reinterpret_cast<int64_t*>(it.read(&len));
+                int64_t i = times[r] - 1000;  // timestamp == 1000 + i
+                EXPECT_EQ(v, static_cast<int64_t>(c) * 1000000 + i);
+                it.next();
+            }
+        }
+        collected += static_cast<int>(rows);
+        ssi->revert_tsblock();
+    }
+    EXPECT_EQ(collected, N);
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Regression: AlignedTimeseriesIndex::get_data_type() returns the time column
+// type (VECTOR), which the schema accessor used to surface verbatim — every
+// aligned column came back as VECTOR instead of its real INT32/FLOAT/etc.
+// type.  get_timeseries_schema() now unwraps AlignedTimeseriesIndex to read
+// value_ts_idx_->get_data_type() like the develop branch did.
+TEST_F(TsFileReaderTest, AlignedSchemaReportsValueDataType) {
+    const std::string device = "root.dev_aligned_schema";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v_i32", INT32, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v_dbl", DOUBLE, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 8;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int32_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<double>(i) * 0.5), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<MeasurementSchema> schemas;
+    ASSERT_EQ(reader.get_timeseries_schema(device_id, schemas), E_OK);
+    ASSERT_EQ(schemas.size(), 2u);
+
+    // Match by name — IO reader iteration order isn't part of the contract.
+    common::TSDataType i32_type = common::INVALID_DATATYPE;
+    common::TSDataType dbl_type = common::INVALID_DATATYPE;
+    for (const auto& s : schemas) {
+        if (s.measurement_name_ == "v_i32") i32_type = s.data_type_;
+        if (s.measurement_name_ == "v_dbl") dbl_type = s.data_type_;
+    }
+    EXPECT_EQ(i32_type, INT32);
+    EXPECT_EQ(dbl_type, DOUBLE);
+    reader.close();
+}
+
+namespace storage {
+class TsFileReaderMetaArenaTest {
+   public:
+    static int64_t arena_used(const storage::TsFileReader& r) {
+        return r.tsfile_reader_meta_pa_.get_total_used_bytes();
+    }
+};
+}  // namespace storage
+
+// Regression: tsfile_reader_meta_pa_ used to be re-initialised at the start
+// of each get_timeseries_metadata() call.  When that reset was removed,
+// every call accumulated another copy of the per-device meta into the same
+// arena, so a long-lived reader that polled metadata kept growing memory
+// without bound.  Re-init now happens at the top of both overloads; verify
+// arena usage stays flat across repeated calls instead of growing linearly.
+TEST_F(TsFileReaderTest, RepeatedGetTimeseriesMetadataDoesNotLeakArena) {
+    const std::string device = "root.dev_arena_growth";
+    {
+        std::vector<MeasurementSchema*> reg;
+        reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    TsRecord r(1000, device);
+    r.points_.emplace_back("v0", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<std::shared_ptr<IDeviceID>> ids = {
+        std::make_shared<StringArrayDeviceID>(device)};
+
+    // Prime the arena and capture the steady-state size.
+    (void)reader.get_timeseries_metadata(ids);
+    const int64_t after_one =
+        storage::TsFileReaderMetaArenaTest::arena_used(reader);
+    ASSERT_GT(after_one, 0);
+
+    for (int i = 0; i < 10; ++i) {
+        (void)reader.get_timeseries_metadata(ids);
+    }
+    const int64_t after_eleven =
+        storage::TsFileReaderMetaArenaTest::arena_used(reader);
+    // Without the fix, after_eleven ≈ 11 × after_one.  With the fix it
+    // should equal after_one (arena reset before each call).  Allow a small
+    // slack for arena page rounding, but reject anything close to 2× growth.
+    EXPECT_LT(after_eleven, after_one * 2)
+        << "arena grew from " << after_one << " to " << after_eleven
+        << " across 11 calls — reset on entry is missing";
+    reader.close();
+}
diff --git a/cpp/test/writer/table_view/tsfile_writer_table_test.cc b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
index d1f3b92e4..0dfaccc06 100644
--- a/cpp/test/writer/table_view/tsfile_writer_table_test.cc
+++ b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
@@ -20,7 +20,6 @@
 
 #include <random>
 
-#include "common/global.h"
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
@@ -32,11 +31,10 @@
 using namespace storage;
 using namespace common;
 
-class TsFileWriterTableTest : public ::testing::TestWithParam<bool> {
+class TsFileWriterTableTest : public ::testing::Test {
    protected:
     void SetUp() override {
         libtsfile_init();
-        set_parallel_write_enabled(GetParam());
         file_name_ = std::string("tsfile_writer_table_test_") +
                      generate_random_string(10) + std::string(".tsfile");
         remove(file_name_.c_str());
@@ -135,7 +133,7 @@ class TsFileWriterTableTest : public ::testing::TestWithParam<bool> {
     }
 };
 
-TEST_P(TsFileWriterTableTest, WriteTableTest) {
+TEST_F(TsFileWriterTableTest, WriteTableTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -146,7 +144,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) {
+TEST_F(TsFileWriterTableTest, WithoutTagAndMultiPage) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(1);
@@ -194,7 +192,7 @@ TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteDisorderTest) {
+TEST_F(TsFileWriterTableTest, WriteDisorderTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -239,12 +237,13 @@ TEST_P(TsFileWriterTableTest, WriteDisorderTest) {
 
     ASSERT_EQ(tsfile_table_writer_->write_table(tablet),
               common::E_OUT_OF_ORDER);
-    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
-    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_DATA_INCONSISTENCY);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY);
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) {
+TEST_F(TsFileWriterTableTest, WriteTableTestMultiFlush) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 2 * 1024);
@@ -257,7 +256,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) {
+TEST_F(TsFileWriterTableTest, WriteNonExistColumnTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -285,7 +284,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) {
+TEST_F(TsFileWriterTableTest, WriteNonExistTableTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -297,7 +296,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) {
+TEST_F(TsFileWriterTableTest, WriterWithMemoryThreshold) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 256 * 1024 * 1024);
@@ -307,7 +306,7 @@ TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, EmptyTagWrite) {
+TEST_F(TsFileWriterTableTest, EmptyTagWrite) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(3);
@@ -363,7 +362,7 @@ TEST_P(TsFileWriterTableTest, EmptyTagWrite) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) {
+TEST_F(TsFileWriterTableTest, WritehDataTypeMisMatch) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 256 * 1024 * 1024);
@@ -414,7 +413,7 @@ TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) {
     tsfile_table_writer_->close();
 }
 
-TEST_P(TsFileWriterTableTest, WriteAndReadSimple) {
+TEST_F(TsFileWriterTableTest, WriteAndReadSimple) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(2);
@@ -469,7 +468,7 @@ TEST_P(TsFileWriterTableTest, WriteAndReadSimple) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, DuplicateColumnName) {
+TEST_F(TsFileWriterTableTest, DuplicateColumnName) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(3);
@@ -507,7 +506,7 @@ TEST_P(TsFileWriterTableTest, DuplicateColumnName) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
+TEST_F(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     for (int i = 0; i < 3; i++) {
@@ -639,7 +638,7 @@ TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
     ASSERT_EQ(reader.close(), common::E_OK);
 }
 
-TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) {
+TEST_F(TsFileWriterTableTest, MultiDeviceMultiFields) {
     common::config_set_max_degree_of_index_node(5);
     auto table_schema = gen_table_schema(0, 1, 100);
     auto tsfile_table_writer_ =
@@ -698,7 +697,7 @@ TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) {
+TEST_F(TsFileWriterTableTest, WriteDataWithEmptyField) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     for (int i = 0; i < 3; i++) {
@@ -775,7 +774,7 @@ TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) {
     ASSERT_EQ(reader.close(), common::E_OK);
 }
 
-TEST_P(TsFileWriterTableTest, MultiDatatypes) {
+TEST_F(TsFileWriterTableTest, MultiDatatypes) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
 
@@ -879,7 +878,7 @@ TEST_P(TsFileWriterTableTest, MultiDatatypes) {
     delete[] literal;
 }
 
-TEST_P(TsFileWriterTableTest, DiffCodecTypes) {
+TEST_F(TsFileWriterTableTest, DiffCodecTypes) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
 
@@ -987,7 +986,7 @@ TEST_P(TsFileWriterTableTest, DiffCodecTypes) {
     delete[] literal;
 }
 
-TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) {
+TEST_F(TsFileWriterTableTest, EncodingConfigIntegration) {
     // 1. Test setting global compression type
     ASSERT_EQ(E_OK, set_global_compression(SNAPPY));
 
@@ -1100,7 +1099,7 @@ TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) {
 }
 
 #ifdef ENABLE_MEM_STAT
-TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
+TEST_F(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
     TableSchema* table_schema = gen_table_schema(0, 2, 3);
     auto tsfile_table_writer =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -1175,8 +1174,3 @@ TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
     delete table_schema;
 }
 #endif
-
-INSTANTIATE_TEST_SUITE_P(Serial, TsFileWriterTableTest,
-                         ::testing::Values(false));
-INSTANTIATE_TEST_SUITE_P(Parallel, TsFileWriterTableTest,
-                         ::testing::Values(true));
\ No newline at end of file
diff --git a/cpp/test/writer/tsfile_writer_test.cc b/cpp/test/writer/tsfile_writer_test.cc
index 139761380..62d5167f3 100644
--- a/cpp/test/writer/tsfile_writer_test.cc
+++ b/cpp/test/writer/tsfile_writer_test.cc
@@ -20,12 +20,15 @@
 
 #include <gtest/gtest.h>
 
+#include <cstring>
+#include <fstream>
 #include <random>
 
 #include "common/path.h"
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
+#include "common/tsfile_common.h"
 #include "file/tsfile_io_writer.h"
 #include "file/write_file.h"
 #include "reader/qds_without_timegenerator.h"
@@ -618,6 +621,74 @@ TEST_F(TsFileWriterTest, WriteMultipleTabletsDouble) {
     ASSERT_EQ(tsfile_writer_->close(), E_OK);
 }
 
+// Regression: write_column() is the null fallback of the non-aligned batch
+// path (write_column_batch -> has_null -> write_column).  It used to handle
+// only BOOLEAN/INT32/INT64/FLOAT/DOUBLE/STRING and ASSERT(false) otherwise;
+// in NDEBUG that assert is a no-op, so a non-aligned TEXT/BLOB/DATE/TIMESTAMP
+// column that contained a null silently dropped every row of that column.
+// This writes a TEXT column with a null in the middle and verifies the two
+// non-null rows survive the round trip.
+TEST_F(TsFileWriterTest, NonAlignedTextColumnWithNullIsNotDropped) {
+    // Non-const: storage::Path's ctor takes non-const std::string&.
+    std::string device = "root.dev_text_null";
+    std::string measure = "s_text";
+    tsfile_writer_->register_timeseries(
+        device, MeasurementSchema(measure, common::TSDataType::TEXT,
+                                  common::TSEncoding::PLAIN,
+                                  common::CompressionType::UNCOMPRESSED));
+
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back(measure, common::TSDataType::TEXT,
+                            common::TSEncoding::PLAIN,
+                            common::CompressionType::UNCOMPRESSED);
+    const int max_rows = 3;
+    storage::Tablet tablet(
+        device, std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+        max_rows);
+    for (int row = 0; row < max_rows; row++) {
+        ASSERT_EQ(tablet.add_timestamp(row, 1000 + row), E_OK);
+    }
+    // Rows 0 and 2 get values; row 1 is left untouched, so its not-null bit
+    // stays set (default) — that is the null that forces the write_column
+    // fallback.
+    char buf0[] = "v0";
+    char buf2[] = "v2";
+    String s0(buf0, 2), s2(buf2, 2);
+    ASSERT_EQ(tablet.add_value(0, 0u, s0), E_OK);
+    ASSERT_EQ(tablet.add_value(2, 0u, s2), E_OK);
+    ASSERT_EQ(tsfile_writer_->write_tablet(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<storage::Path> select_list{storage::Path(device, measure)};
+    storage::QueryExpression* query_expr =
+        storage::QueryExpression::create(select_list, nullptr);
+    storage::ResultSet* tmp_qds = nullptr;
+    ASSERT_EQ(reader.query(query_expr, tmp_qds), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp_qds;
+
+    // The regression signal is row survival: before the fix write_column hit
+    // ASSERT(false) on TEXT (a no-op in NDEBUG), so the column was dropped and
+    // this query returned 0 rows.  TEXT shares the identical (proven) string
+    // write path as STRING, so the two surviving rows at the right timestamps
+    // confirm the fix.  field(1) is the value column, but field(0) is non-null
+    // here too — the result row carries the timestamp as field(0).
+    std::vector<int64_t> times;
+    bool has_next = false;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        storage::RowRecord* rec = qds->get_row_record();
+        times.push_back(rec->get_timestamp());
+    }
+    reader.destroy_query_data_set(qds);
+    reader.close();
+
+    ASSERT_EQ(times.size(), 2u);
+    EXPECT_EQ(times[0], 1000);
+    EXPECT_EQ(times[1], 1002);
+}
+
 TEST_F(TsFileWriterTest, FlushMultipleDevice) {
     const int device_num = 50;
     const int measurement_num = 50;
@@ -699,6 +770,22 @@ TEST_F(TsFileWriterTest, FlushMultipleDevice) {
 }
 
 TEST_F(TsFileWriterTest, AnalyzeTsfileForload) {
+    // estimate_max_mem_size() now reflects the real 64 KiB-page footprint of
+    // each per-measurement output stream.  50 devices × 50 measurements ×
+    // 2 streams × 64 KiB = ~320 MiB, well past the 128 MiB default
+    // chunk_group_size_threshold_ — without raising the cap the auto-flush
+    // would fire mid-write and the post-write hasData() check below would
+    // observe a freshly drained chunk writer.  Lift the cap for the
+    // duration of this smoke test so the original semantics still apply.
+    uint32_t prev_threshold =
+        common::g_config_value_.chunk_group_size_threshold_;
+    struct Guard {
+        uint32_t prev;
+        ~Guard() { common::g_config_value_.chunk_group_size_threshold_ = prev; }
+    } guard{prev_threshold};
+    common::g_config_value_.chunk_group_size_threshold_ =
+        2ULL * 1024 * 1024 * 1024;
+
     const int device_num = 50;
     const int measurement_num = 50;
     const int max_rows = 100;
@@ -1070,6 +1157,214 @@ TEST_F(TsFileWriterTest, AlignedSealSync_ValueMemoryFirst) {
     ASSERT_EQ(reader.close(), E_OK);
 }
 
+// Regression: write_tablet_aligned() writes the entire time column first and
+// then each value column. With memory-based auto-seal still active, a large
+// STRING value column hits the memory threshold mid-batch (say at row 5),
+// while the INT64 time column does not seal until row page_writer_max_point
+// is reached.  Those divergent seals stamp misaligned page boundaries onto
+// the file and read-back returns wrong values per row.  Suppressing
+// memory-driven seals during the batch should keep all pages count-aligned.
+TEST_F(TsFileWriterTest, AlignedSealSync_TabletLargeStringValueMemoryFirst) {
+    uint32_t prev_pt = g_config_value_.page_writer_max_point_num_;
+    uint32_t prev_mem = g_config_value_.page_writer_max_memory_bytes_;
+    struct Guard {
+        uint32_t pt, mem;
+        ~Guard() {
+            g_config_value_.page_writer_max_point_num_ = pt;
+            g_config_value_.page_writer_max_memory_bytes_ = mem;
+        }
+    } guard{prev_pt, prev_mem};
+    // Big point cap, tiny memory cap: time chunk (INT64 PLAIN, 8B/point) never
+    // hits memory before it reaches the point cap, while the STRING value
+    // chunk crosses the memory threshold within a handful of rows.
+    g_config_value_.page_writer_max_point_num_ = 10000;
+    g_config_value_.page_writer_max_memory_bytes_ = 512;
+
+    std::string device_name = "device_tablet_str";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("s0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("s1", STRING, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("s2", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    const int row_num = 200;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  row_num);
+    char* long_buf = new char[101];
+    memset(long_buf, 'A', 100);
+    long_buf[100] = '\0';
+    common::String str_val(long_buf, 100);
+    for (int i = 0; i < row_num; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1622505600000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        // Sparse string column: every third row is null so we also exercise
+        // the bitmap path through the memory-pressured value page.
+        if (i % 3 != 0) {
+            ASSERT_EQ(tablet.add_value(i, 1u, str_val), E_OK);
+        }
+        ASSERT_EQ(tablet.add_value(i, 2u, static_cast<int64_t>(i * 10)), E_OK);
+    }
+    delete[] long_buf;
+
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    std::string s0("s0"), s1("s1"), s2("s2");
+    std::vector<storage::Path> select_list;
+    select_list.emplace_back(device_name, s0);
+    select_list.emplace_back(device_name, s1);
+    select_list.emplace_back(device_name, s2);
+    storage::QueryExpression* qe =
+        storage::QueryExpression::create(select_list, nullptr);
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    storage::ResultSet* tmp_qds = nullptr;
+    ASSERT_EQ(reader.query(qe, tmp_qds), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp_qds;
+
+    bool has_next = false;
+    int64_t cur_row = 0;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        auto* rec = qds->get_row_record();
+        ASSERT_NE(rec, nullptr);
+        EXPECT_EQ(rec->get_timestamp(), 1622505600000 + cur_row);
+        EXPECT_EQ(field_to_string(rec->get_field(1)), std::to_string(cur_row));
+        EXPECT_EQ(field_to_string(rec->get_field(3)),
+                  std::to_string(cur_row * 10));
+        cur_row++;
+    }
+    EXPECT_EQ(cur_row, row_num);
+    reader.destroy_query_data_set(qds);
+    ASSERT_EQ(reader.close(), E_OK);
+}
+
+// Regression: write_tablet_aligned() used to discard time_write_column_batch
+// errors and keep writing value columns. On an out-of-order tablet that left
+// the time chunk with fewer rows than the value chunks (or with their seal
+// flag still suppressed). The fix propagates the time-column error so no
+// value column is touched and the page seal flags are restored.
+TEST_F(TsFileWriterTest, AlignedTabletTimeBatchOutOfOrderAborts) {
+    std::string device_name = "device_aligned_out_of_order";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    const int row_num = 16;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  row_num);
+    // Non-monotonic timestamps trip TimePageWriter::write_batch's order check.
+    for (int i = 0; i < row_num; ++i) {
+        int64_t ts = (i == row_num - 1) ? 0 : 1000 + i;
+        ASSERT_EQ(tablet.add_timestamp(i, ts), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    EXPECT_NE(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: write_record_aligned used to ignore the time write return
+// value, then unconditionally write each value column.  An out-of-order
+// timestamp would leave the time chunk one row short of every value chunk
+// for the rest of the file.  The fix propagates the time-write error and
+// marks the writer unrecoverable when value-column writes diverge from
+// time.
+TEST_F(TsFileWriterTest, RecordAlignedOutOfOrderDoesNotAdvanceValueColumns) {
+    std::string device_name = "root.dev_aligned_record";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    // First record at ts=1000 — should write cleanly.
+    TsRecord r1(1000, device_name);
+    r1.points_.emplace_back("v0", static_cast<int64_t>(0));
+    r1.points_.emplace_back("v1", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r1), E_OK);
+
+    // Second record at the same timestamp 1000 — time_chunk_writer rejects
+    // it (E_OUT_OF_ORDER per TimePageWriter::write).  The value columns
+    // must not advance.
+    TsRecord r2(1000, device_name);
+    r2.points_.emplace_back("v0", static_cast<int64_t>(99));
+    r2.points_.emplace_back("v1", static_cast<int64_t>(99));
+    EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_OUT_OF_ORDER);
+    // close() must succeed because the failure was caught before any value
+    // write — writer state is still consistent.
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: the aligned bulk-memcpy fast path in AlignedChunkReader only
+// appended bytes to each Vector's value_data without calling add_row_nums().
+// Vector::row_num_ stayed at 0 while TsBlock::row_count_ jumped to N, so
+// fill_trailling_nulls() then overwrote every just-written row as null
+// (visible to the caller as all-null columns).
+TEST_F(TsFileWriterTest, AlignedBulkMemcpyAdvancesVectorRowNum) {
+    std::string device_name = "device_bulk_rownum";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+    const int N = 64;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Read back via TsBlock — confirms the rows are visible.  Under the
+    // bug Vector::row_num_ stayed at 0, fill_trailling_nulls() then
+    // marked every just-written row null; the iterator still reports
+    // them as rows so we check the non-null field for a real value.
+    std::vector<storage::Path> select;
+    std::string s0("v0"), s1("v1");
+    select.emplace_back(device_name, s0);
+    select.emplace_back(device_name, s1);
+    storage::QueryExpression* qe =
+        storage::QueryExpression::create(select, nullptr);
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    storage::ResultSet* tmp = nullptr;
+    ASSERT_EQ(reader.query(qe, tmp), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp;
+    int got = 0;
+    bool has_next = false;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        auto* rec = qds->get_row_record();
+        ASSERT_NE(rec, nullptr);
+        got++;
+    }
+    EXPECT_EQ(got, N);
+    reader.destroy_query_data_set(qds);
+    reader.close();
+}
+
 TEST_F(TsFileWriterTest, WriteAlignedMultiFlush) {
     int measurement_num = 100, row_num = 100;
     std::string device_name = "device";
@@ -1256,4 +1551,145 @@ TEST_F(TsFileWriterTest, WriteTabletDataTypeMismatch) {
     ASSERT_EQ(E_TYPE_NOT_MATCH, tsfile_writer_->write_tablet_aligned(tablet));
     ASSERT_EQ(tsfile_writer_->flush(), E_OK);
     ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: partial-write failures (parallel aligned task failing mid-way,
+// non-aligned column failing after earlier columns advanced, etc.) leave per-
+// column chunk writers out of sync.  The writer latches unrecoverable_ so
+// subsequent flush/close/write must refuse rather than seal a corrupt file
+// whose time and value chunks disagree on row count.  Directly triggering
+// the partial failure deterministically is hard, so this test asserts the
+// downstream contract by flipping the flag through a friend hook.
+namespace storage {
+class TsFileWriterUnrecoverableTest {
+   public:
+    static void mark_unrecoverable(TsFileWriter& w) { w.unrecoverable_ = true; }
+};
+}  // namespace storage
+
+TEST_F(TsFileWriterTest, UnrecoverableLatchRefusesFlushCloseAndWrites) {
+    const std::string device = "root.dev_unrec";
+    std::vector<MeasurementSchema*> reg;
+    reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+    reg.push_back(new MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED));
+    ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), E_OK);
+
+    // Write one good row so a flush attempt would otherwise have data to emit.
+    TsRecord r(1000, device);
+    r.points_.emplace_back("v0", static_cast<int64_t>(0));
+    r.points_.emplace_back("v1", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK);
+
+    // Simulate the post-partial-failure state.
+    storage::TsFileWriterUnrecoverableTest::mark_unrecoverable(*tsfile_writer_);
+
+    // Every public write/flush/close entry point must refuse.
+    EXPECT_EQ(tsfile_writer_->flush(), E_DATA_INCONSISTENCY);
+    EXPECT_EQ(tsfile_writer_->close(), E_DATA_INCONSISTENCY);
+
+    TsRecord r2(1001, device);
+    r2.points_.emplace_back("v0", static_cast<int64_t>(1));
+    r2.points_.emplace_back("v1", static_cast<int64_t>(1));
+    EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_DATA_INCONSISTENCY);
+
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(
+                      std::vector<MeasurementSchema>{
+                          MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED),
+                          MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED)}),
+                  4);
+    for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(tablet.add_timestamp(i, 2000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    EXPECT_EQ(tsfile_writer_->write_tablet_aligned(tablet),
+              E_DATA_INCONSISTENCY);
+    EXPECT_EQ(tsfile_writer_->write_tablet(tablet), E_DATA_INCONSISTENCY);
+}
+
+namespace {
+
+WriteFile* OpenWriteFileFor(const std::string& path) {
+    int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+    flags |= O_BINARY;
+#endif
+    auto* wf = new WriteFile;
+    if (wf->create(path, flags, 0666) != E_OK) {
+        delete wf;
+        return nullptr;
+    }
+    return wf;
+}
+
+void WriteOneAlignedRow(TsFileWriter& w, const std::string& device, int64_t ts,
+                        int64_t value) {
+    std::vector<MeasurementSchema*> reg;
+    reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+    ASSERT_EQ(w.register_aligned_timeseries(device, reg), E_OK);
+    TsRecord r(ts, device);
+    r.points_.emplace_back("v0", value);
+    ASSERT_EQ(w.write_record_aligned(r), E_OK);
+}
+
+}  // namespace
+
+// Writing speed up: TsFileWriter must be reusable across a
+// destroy() + init() cycle.
+//   - 1: TsFileIOWriter::destroy() left chunk_group_meta_list_ and
+//     chunk_group_meta_index_ pointing at meta_allocator_-owned memory that
+//     the next init() then re-armed; the next start_flush_chunk_group()
+//     linear scan would deref freed nodes.
+//   - 2: TsFileWriter::init() did not reset start_file_done_, so
+//     the second file's flush() skipped the magic/version header and
+//     produced a file the reader can't open.
+// This test forces both code paths: destroy(), init() onto a fresh
+// WriteFile, write data, close, then read the second file via the public
+// TsFileReader API.
+TEST_F(TsFileWriterTest, WriterReuseAfterDestroyProducesValidSecondFile) {
+    // First lifecycle uses the fixture-provided writer (already open()'d on
+    // file_name_).  Write one row and close — this flushes the magic +
+    // version into file_name_ and flips start_file_done_ true.
+    WriteOneAlignedRow(*tsfile_writer_, "root.dev_first", 1000, 7);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Second lifecycle: tear down the previous writer state and re-init
+    // against a brand-new file.
+    tsfile_writer_->destroy();
+
+    const std::string second_path = std::string("tsfile_writer_reuse_test_") +
+                                    generate_random_string(10) +
+                                    std::string(".tsfile");
+    remove(second_path.c_str());
+    WriteFile* wf = OpenWriteFileFor(second_path);
+    ASSERT_NE(wf, nullptr);
+    ASSERT_EQ(tsfile_writer_->init(wf), E_OK);
+
+    WriteOneAlignedRow(*tsfile_writer_, "root.dev_second", 2000, 9);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // The second file must start with the TsFile magic + version byte.
+    // The TsFileReader open path mostly indexes from the file tail, so a
+    // missing magic at offset 0 isn't caught by reader.open().  Inspect the
+    // raw header bytes instead — that's exactly what start_file_done_ guards.
+    {
+        std::ifstream in(second_path, std::ios::binary);
+        ASSERT_TRUE(in.is_open());
+        char header[MAGIC_STRING_TSFILE_LEN + 1] = {0};
+        in.read(header, MAGIC_STRING_TSFILE_LEN + 1);
+        EXPECT_EQ(in.gcount(),
+                  static_cast<std::streamsize>(MAGIC_STRING_TSFILE_LEN + 1));
+        EXPECT_EQ(memcmp(header, MAGIC_STRING_TSFILE, MAGIC_STRING_TSFILE_LEN),
+                  0)
+            << "second-file header is missing the TsFile magic — "
+               "start_file_done_ residual from the previous lifecycle";
+        EXPECT_EQ(header[MAGIC_STRING_TSFILE_LEN], VERSION_NUM_BYTE);
+    }
+
+    // wf was passed to init() but init() did not take ownership.
+    delete wf;
+    remove(second_path.c_str());
 }
\ No newline at end of file
diff --git a/cpp/test/writer/value_page_writer_test.cc b/cpp/test/writer/value_page_writer_test.cc
index 07666e189..be04586ee 100644
--- a/cpp/test/writer/value_page_writer_test.cc
+++ b/cpp/test/writer/value_page_writer_test.cc
@@ -106,3 +106,36 @@ TEST_F(ValuePageWriterTest, WritePageHeaderAndData) {
               common::E_OK);
     value_page_writer.destroy_page_data();
 }
+
+// Regression: write_batch used to bump size_ and the page bitmap for every
+// row in the batch *before* encoding the values.  If the value encode failed
+// mid-batch, the page would claim `count` rows had been written even though
+// the encoder stream only held a prefix.  The fix counts valid rows
+// upfront, encodes, and only commits size_ / bitmap when the encode
+// finishes cleanly.  This test exercises the happy path on a mixed-null
+// batch and asserts size_ and statistics agree with the row count — a
+// subsequent code change that re-introduces premature size_ bumping
+// without rolling back on failure would still pass this test, but it
+// guards the encode-then-commit ordering contract against accidental
+// rewrites.
+TEST_F(ValuePageWriterTest, WriteBatchCommitsStateAfterEncode) {
+    ValuePageWriter w;
+    w.init(TSDataType::INT64, TSEncoding::PLAIN, UNCOMPRESSED);
+
+    const uint32_t N = 5;
+    int64_t timestamps[N] = {100, 101, 102, 103, 104};
+    int64_t values[N] = {10, 20, 30, 40, 50};
+    common::BitMap nullmap;
+    ASSERT_EQ(nullmap.init(N), common::E_OK);
+    // bit=1 means null in the tablet bitmap convention.
+    nullmap.set(1);  // row 1 (timestamp 101) is null
+    nullmap.set(3);  // row 3 (timestamp 103) is null
+    ASSERT_EQ(w.write_batch(timestamps, values, nullmap, 0, N), common::E_OK);
+
+    // size_ tracks every row regardless of nullness, statistic only the
+    // non-null subset.  get_point_numer() returns size_ (rows incl. NULLs).
+    EXPECT_EQ(w.get_point_numer(), N);
+    auto* stat = static_cast<Int64Statistic*>(w.get_statistic());
+    ASSERT_NE(stat, nullptr);
+    EXPECT_EQ(stat->count_, 3u);
+}
diff --git a/python/tests/test_tsfile_dataset.py b/python/tests/test_tsfile_dataset.py
index d95d247c1..f58d5117c 100644
--- a/python/tests/test_tsfile_dataset.py
+++ b/python/tests/test_tsfile_dataset.py
@@ -872,10 +872,21 @@ def test_reader_catalog_shares_device_metadata_and_resolves_paths(tmp_path):
 
 
 def test_reader_read_series_by_row_retries_across_native_row_query_boundaries():
+    """read_series_by_row pulls TsBlocks via read_arrow_batch and must keep
+    re-issuing query_table_by_row when the underlying native call stops at
+    an internal block boundary before the caller's window is filled."""
+
+    import pyarrow as pa
+
     class _FakeResultSet:
-        def __init__(self, rows):
-            self._rows = rows
-            self._index = -1
+        def __init__(self, times, values):
+            self._batch = pa.table(
+                {
+                    "time": pa.array(times, type=pa.int64()),
+                    "totalcloudcover": pa.array(values, type=pa.float64()),
+                }
+            )
+            self._delivered = False
 
         def __enter__(self):
             return self
@@ -883,12 +894,11 @@ def __enter__(self):
         def __exit__(self, exc_type, exc_val, exc_tb):
             return False
 
-        def next(self):
-            self._index += 1
-            return self._index < len(self._rows)
-
-        def get_value_by_name(self, name):
-            return self._rows[self._index][name]
+        def read_arrow_batch(self):
+            if self._delivered or self._batch.num_rows == 0:
+                return None
+            self._delivered = True
+            return self._batch
 
     class _FakeNativeReader:
         def __init__(self, timestamps, values, boundary):
@@ -897,28 +907,31 @@ def __init__(self, timestamps, values, boundary):
             self._boundary = boundary
 
         def query_table_by_row(
-            self, table_name, column_names, offset=0, limit=-1, tag_filter=None
+            self,
+            table_name,
+            column_names,
+            offset=0,
+            limit=-1,
+            tag_filter=None,
+            batch_size=0,
         ):
             assert table_name == "pvf"
             assert column_names == ["totalcloudcover"]
             assert tag_filter is None
+            assert batch_size > 0, "row reads should use batch (Arrow) mode"
             if limit < 0:
                 stop = len(self._timestamps)
             else:
                 stop = min(offset + limit, len(self._timestamps))
 
-            # Simulate the current native bug: one row query cannot cross the
-            # next internal boundary, so callers must re-issue from the
+            # Simulate the native quirk where one query stops at the next
+            # internal block boundary; callers must re-issue from the
             # advanced offset to complete a large logical window.
             chunk_stop = min(stop, ((offset // self._boundary) + 1) * self._boundary)
-            rows = [
-                {
-                    "time": int(self._timestamps[idx]),
-                    "totalcloudcover": float(self._values[idx]),
-                }
-                for idx in range(offset, chunk_stop)
-            ]
-            return _FakeResultSet(rows)
+            return _FakeResultSet(
+                self._timestamps[offset:chunk_stop],
+                self._values[offset:chunk_stop],
+            )
 
     reader = object.__new__(TsFileSeriesReader)
     reader._reader = _FakeNativeReader(
diff --git a/python/tsfile/dataset/reader.py b/python/tsfile/dataset/reader.py
index 831926324..2cf0f4a55 100644
--- a/python/tsfile/dataset/reader.py
+++ b/python/tsfile/dataset/reader.py
@@ -378,37 +378,44 @@ def read_series_by_row(
         tag_values = _device_exact_tag_values(table_entry, device_entry)
         tag_filter = _build_exact_tag_filter(tag_values) if tag_values else None
 
-        # Some native row-query paths stop at an internal block boundary even
-        # when the requested window extends further. Re-issue from the advanced
-        # offset until we fill the caller's logical row window or reach EOF.
+        # Pull whole TsBlocks via the Arrow C-Data interface instead of
+        # iterating row-by-row in Python. Each result_set.next() +
+        # get_value_by_name() pair would be a Python<->C round-trip per row
+        # and dominates wall time on long slices; read_arrow_batch() returns
+        # a column-oriented batch in one call and lands directly in numpy.
         timestamp_parts = []
         value_parts = []
         remaining = limit
         next_offset = offset
 
         while remaining > 0:
-            batch_timestamps = []
-            batch_values = []
+            produced_this_call = 0
             with self._reader.query_table_by_row(
                 table_entry.table_name,
                 [field_name],
                 offset=next_offset,
                 limit=remaining,
                 tag_filter=tag_filter,
+                batch_size=65536,
             ) as result_set:
-                while result_set.next():
-                    batch_timestamps.append(result_set.get_value_by_name("time"))
-                    value = result_set.get_value_by_name(field_name)
-                    batch_values.append(np.nan if value is None else float(value))
-
-            if not batch_timestamps:
+                while True:
+                    arrow_table = result_set.read_arrow_batch()
+                    if arrow_table is None:
+                        break
+                    if arrow_table.num_rows == 0:
+                        continue
+                    timestamp_parts.append(arrow_table.column("time").to_numpy())
+                    raw_values = arrow_table.column(field_name).to_numpy(
+                        zero_copy_only=False
+                    )
+                    value_parts.append(np.asarray(raw_values, dtype=np.float64))
+                    produced_this_call += arrow_table.num_rows
+
+            if produced_this_call == 0:
                 break
 
-            timestamp_parts.append(np.asarray(batch_timestamps, dtype=np.int64))
-            value_parts.append(np.asarray(batch_values, dtype=np.float64))
-            read_count = len(batch_timestamps)
-            next_offset += read_count
-            remaining -= read_count
+            next_offset += produced_this_call
+            remaining -= produced_this_call
 
         if not timestamp_parts:
             return np.array([], dtype=np.int64), np.array([], dtype=np.float64)
diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx
index 341a7493d..f94d42053 100644
--- a/python/tsfile/tsfile_reader.pyx
+++ b/python/tsfile/tsfile_reader.pyx
@@ -199,7 +199,9 @@ cdef class ResultSetPy:
         if data_type == TSDataTypePy.INT32:
             return tsfile_result_set_get_value_by_index_int32_t(self.result, index)
         elif data_type == TSDataTypePy.DATE:
-            return parse_int_to_date(tsfile_result_set_get_value_by_index_int64_t(self.result, index))
+            # DATE is physically stored as int32 (yyyymmdd), so read it through
+            # the int32 accessor that matches the underlying storage width.
+            return parse_int_to_date(tsfile_result_set_get_value_by_index_int32_t(self.result, index))
         elif data_type == TSDataTypePy.INT64 or data_type == TSDataTypePy.TIMESTAMP:
             return tsfile_result_set_get_value_by_index_int64_t(self.result, index)
         elif data_type == TSDataTypePy.FLOAT:

From 8d9149589d9e49694451b2f22f24c0b531dabf03 Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Wed, 17 Jun 2026 09:51:57 +0800
Subject: [PATCH 2/9] fix memory leak.

---
 cpp/src/file/tsfile_io_reader.h               |  8 ++++++++
 cpp/src/reader/aligned_chunk_reader.cc        | 14 ++++++++++----
 cpp/src/reader/tsfile_series_scan_iterator.cc | 12 ++++++++++++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h
index db3030419..876a3f785 100644
--- a/cpp/src/file/tsfile_io_reader.h
+++ b/cpp/src/file/tsfile_io_reader.h
@@ -51,6 +51,14 @@ class TsFileIOReader {
         device_node_cache_pa_.init(512, common::MOD_TSFILE_READER);
     }
 
+    // reset() frees a ReadFile created by init(const std::string&) and tears
+    // down the metadata arenas.  Without an explicit destructor the class
+    // relied on every owner calling reset() by hand, which leaks the ReadFile
+    // whenever a TsFileIOReader value goes out of scope directly (e.g. a stack
+    // instance in a test).  reset() is idempotent and skips a borrowed
+    // ReadFile (read_file_created_ == false), so this is safe in all cases.
+    ~TsFileIOReader() { reset(); }
+
     int init(const std::string& file_path);
 
     int init(ReadFile* read_file);
diff --git a/cpp/src/reader/aligned_chunk_reader.cc b/cpp/src/reader/aligned_chunk_reader.cc
index f130b524d..7e2bda41e 100644
--- a/cpp/src/reader/aligned_chunk_reader.cc
+++ b/cpp/src/reader/aligned_chunk_reader.cc
@@ -199,9 +199,15 @@ void AlignedChunkReader::destroy() {
         col->cur_page_header.reset();
         delete col;
     }
-    value_columns_.clear();
+    // This reader is placement-new'd and torn down via destroy() + mem_free
+    // without ever running ~AlignedChunkReader (see
+    // TsFileSeriesScanIterator::destroy), so .clear() would leave these
+    // vectors' backing buffers allocated and unreachable.  swap with an empty
+    // vector to actually release the storage, matching the chunk_pages_ /
+    // page_all_times_ handling above.
+    std::vector<ValueColumnState*>().swap(value_columns_);
     release_current_page_state();
-    per_page_times_.clear();
+    std::vector<std::vector<int64_t>>().swap(per_page_times_);
 #ifdef ENABLE_THREADS
     decode_pool_ = nullptr;  // borrowed, not owned
     for (auto* d : time_decoder_pool_) {
@@ -210,14 +216,14 @@ void AlignedChunkReader::destroy() {
             DecoderFactory::free(d);
         }
     }
-    time_decoder_pool_.clear();
+    std::vector<Decoder*>().swap(time_decoder_pool_);
     for (auto* c : time_compressor_pool_) {
         if (c != nullptr) {
             c->~Compressor();
             CompressorFactory::free(c);
         }
     }
-    time_compressor_pool_.clear();
+    std::vector<Compressor*>().swap(time_compressor_pool_);
 #endif
 }
 
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc
index eb41c1f40..538b00d43 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.cc
+++ b/cpp/src/reader/tsfile_series_scan_iterator.cc
@@ -31,6 +31,18 @@ using namespace common;
 namespace storage {
 
 void TsFileSeriesScanIterator::destroy() {
+    // MultiAlignedTimeseriesIndex is placement-new'd inside
+    // timeseries_index_pa_ (see TsFileIOReader::alloc_multi_ssi).  The arena's
+    // destroy() frees raw memory without running destructors, so its
+    // value_ts_idxs_ std::vector backing buffer would leak.  Release it
+    // explicitly before tearing down the arena.  dynamic_cast is null-safe and
+    // returns nullptr for the single-value / non-aligned index types, which own
+    // no separate heap storage.
+    if (auto* multi =
+            dynamic_cast<MultiAlignedTimeseriesIndex*>(itimeseries_index_)) {
+        std::vector<TimeseriesIndex*>().swap(multi->value_ts_idxs_);
+    }
+    itimeseries_index_ = nullptr;
     timeseries_index_pa_.destroy();
     if (chunk_reader_ != nullptr) {
         // destroy() already runs manual destructors on internal members

From 198cbc4897e3b4c8f49f2dab7e24d7bad032eb9c Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Wed, 17 Jun 2026 12:01:13 +0800
Subject: [PATCH 3/9] fix err.

---
 cpp/src/file/tsfile_io_reader.h | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h
index 876a3f785..0073603fb 100644
--- a/cpp/src/file/tsfile_io_reader.h
+++ b/cpp/src/file/tsfile_io_reader.h
@@ -51,13 +51,24 @@ class TsFileIOReader {
         device_node_cache_pa_.init(512, common::MOD_TSFILE_READER);
     }
 
-    // reset() frees a ReadFile created by init(const std::string&) and tears
-    // down the metadata arenas.  Without an explicit destructor the class
-    // relied on every owner calling reset() by hand, which leaks the ReadFile
-    // whenever a TsFileIOReader value goes out of scope directly (e.g. a stack
-    // instance in a test).  reset() is idempotent and skips a borrowed
-    // ReadFile (read_file_created_ == false), so this is safe in all cases.
-    ~TsFileIOReader() { reset(); }
+    // Free only the ReadFile we own (created by init(const std::string&)).
+    // Without an explicit destructor that raw pointer leaks whenever a
+    // TsFileIOReader value goes out of scope without an explicit reset() (e.g.
+    // a stack instance in a test).  We deliberately do NOT call reset() here:
+    // reset() also runs tsfile_meta_page_arena_.destroy(), which would free the
+    // arena that tsfile_meta_ lives in *before* the implicit ~TsFileMeta member
+    // destructor runs, leaving its arena-allocated MetaIndexNode / shared_ptr
+    // graph dangling (use-after-free / crash).  The arenas and TsFileMeta clean
+    // themselves up correctly via member destruction order (tsfile_meta_ is
+    // destroyed before its backing arena).  An owner that already called
+    // reset() leaves read_file_ == nullptr, so this never double-frees.
+    ~TsFileIOReader() {
+        if (read_file_created_ && read_file_ != nullptr) {
+            read_file_->destroy();
+            delete read_file_;
+            read_file_ = nullptr;
+        }
+    }
 
     int init(const std::string& file_path);
 

From a3c6b77e167f83258046fc6e8244d245509bdbb0 Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Fri, 26 Jun 2026 07:52:33 +0800
Subject: [PATCH 4/9] fix comment.

---
 cpp/src/common/allocator/byte_stream.h        |  38 +-
 cpp/src/common/global.cc                      |  16 +-
 cpp/src/common/path.h                         |   3 +
 cpp/src/common/statistic.h                    |  10 +-
 cpp/src/common/thread_pool.h                  |   9 +-
 cpp/src/compress/uncompressed_compressor.h    |  11 +
 cpp/src/cwrapper/tsfile_cwrapper.cc           |  20 +-
 cpp/src/encoding/decoder.h                    |  26 +-
 cpp/src/encoding/plain_encoder.h              |  44 +-
 cpp/src/encoding/ts2diff_decoder.h            |   6 +
 cpp/src/file/tsfile_io_reader.cc              |   8 +-
 cpp/src/file/tsfile_io_writer.cc              |  16 +-
 cpp/src/file/tsfile_io_writer.h               |  18 +-
 cpp/src/reader/aligned_chunk_reader.cc        | 548 +++---------------
 cpp/src/reader/aligned_chunk_reader.h         |  21 +-
 .../block/single_device_tsblock_reader.cc     |  17 +-
 cpp/src/reader/tsfile_series_scan_iterator.cc |   9 +-
 cpp/src/writer/time_chunk_writer.h            |   4 +-
 cpp/src/writer/tsfile_writer.cc               |  11 +
 cpp/src/writer/value_chunk_writer.cc          |   6 +-
 cpp/test/common/tsfile_common_test.cc         |  11 +-
 .../tsfile_table_query_by_row_test.cc         |   2 +-
 22 files changed, 290 insertions(+), 564 deletions(-)

diff --git a/cpp/src/common/allocator/byte_stream.h b/cpp/src/common/allocator/byte_stream.h
index ad8dbb90d..15f15b798 100644
--- a/cpp/src/common/allocator/byte_stream.h
+++ b/cpp/src/common/allocator/byte_stream.h
@@ -23,6 +23,9 @@
 #include <common/constant/tsfile_constant.h>
 #include <stdio.h>
 #include <stdlib.h>
+#ifdef _MSC_VER
+#include <intrin.h>  // _BitScanReverse
+#endif
 
 #include <atomic>
 #include <iostream>
@@ -40,7 +43,7 @@ namespace common {
 // non-atomic mode we still go through the atomic interface but with
 // memory_order_relaxed, which on x86/ARM compiles to a plain load/store.
 // std::atomic<T> is non-copyable, so neither is OptionalAtomic; existing
-// callers either construct in place or use shallow_clone_from / store.
+// callers either construct in place or use store.
 template <typename T>
 class OptionalAtomic {
    public:
@@ -235,18 +238,23 @@ FORCE_INLINE double bytes_to_double(uint8_t bytes[8]) {
 // Round n up to the next power of two (>=1). Used to normalize ByteStream
 // page sizes so that `& page_mask_` is equivalent to `% page_size_`.
 // Values above the largest power-of-two that fits in uint32_t are clamped to
-// 0x80000000 — the previous `while (ps < n) ps <<= 1` would shift past 2^31
-// and overflow to 0, looping forever.
+// 0x80000000 — a naive `while (ps < n) ps <<= 1` would shift past 2^31 and
+// overflow to 0, looping forever.
+//
+// Derived from the index of the highest set bit of (n-1): the next power of
+// two >= n is 1 << (bits needed to represent n-1).  The two guards keep the
+// bit-scan input in [1, 2^31-1] where it is well-defined (clz(0) is UB), so
+// the shift amount stays in [1, 31] and never hits the `1u << 32` UB.
 FORCE_INLINE uint32_t round_up_pow2(uint32_t n) {
     if (n <= 1) return 1;
     if (n > 0x80000000u) return 0x80000000u;
-    uint32_t v = n - 1;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    return v + 1;
+#if defined(_MSC_VER)
+    unsigned long idx;
+    _BitScanReverse(&idx, n - 1);
+    return 1u << (idx + 1);
+#else
+    return 1u << (32 - __builtin_clz(n - 1));
+#endif
 }
 
 // auto extend buffer for serialization
@@ -365,16 +373,6 @@ class ByteStream {
         read_pos_ = 0;
     }
 
-    // never used TODO
-    void shallow_clone_from(ByteStream& other) {
-        this->page_size_ = other.page_size_;
-        this->page_mask_ = other.page_mask_;
-        this->mid_ = other.mid_;
-        this->head_.store(other.head_.load());
-        this->tail_.store(other.tail_.load());
-        this->total_size_.store(other.total_size_.load());
-    }
-
     FORCE_INLINE uint64_t total_size() const { return total_size_.load(); }
     FORCE_INLINE uint64_t read_pos() const { return read_pos_; };
     // Sum of bytes physically allocated for this stream's pages.  For a
diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc
index cc6c5117f..ac51dd8b2 100644
--- a/cpp/src/common/global.cc
+++ b/cpp/src/common/global.cc
@@ -64,14 +64,16 @@ void init_config_value() {
     g_config_value_.float_encoding_type_ = GORILLA;
     g_config_value_.double_encoding_type_ = GORILLA;
     g_config_value_.string_encoding_type_ = PLAIN;
-    // Pick the strongest compressor that was actually compiled in. Gating on
-    // ENABLE_LZ4 while setting SNAPPY (the original code) would request a
-    // compressor that the factory can't produce when the build disables
-    // Snappy, returning nullptr at write time.
-#ifdef ENABLE_SNAPPY
-    g_config_value_.default_compression_type_ = SNAPPY;
-#elif defined(ENABLE_LZ4)
+    // Default compression is LZ4, matching the Java reference implementation
+    // (TSFileConfig.compressor) and the previous C++ default; LZ4 generally
+    // matches or beats Snappy on both ratio and decompression speed.  Fall
+    // back to whatever was actually compiled in so the factory can always
+    // produce the chosen compressor (an earlier revision gated on ENABLE_LZ4
+    // but set SNAPPY, returning nullptr at write time when Snappy was off).
+#ifdef ENABLE_LZ4
     g_config_value_.default_compression_type_ = LZ4;
+#elif defined(ENABLE_SNAPPY)
+    g_config_value_.default_compression_type_ = SNAPPY;
 #else
     g_config_value_.default_compression_type_ = UNCOMPRESSED;
 #endif
diff --git a/cpp/src/common/path.h b/cpp/src/common/path.h
index c176d93db..f09b8c316 100644
--- a/cpp/src/common/path.h
+++ b/cpp/src/common/path.h
@@ -76,6 +76,9 @@ struct Path {
                     full_path_ =
                         device_id_->get_device_name() + "." + measurement_;
                 } else {
+                    // Single-node path (no separator): there is no device
+                    // prefix, so the lone token is the measurement and the
+                    // device id is left empty.
                     full_path_ = path_sc;
                     device_id_ = std::make_shared<StringArrayDeviceID>();
                     measurement_ = path_sc;
diff --git a/cpp/src/common/statistic.h b/cpp/src/common/statistic.h
index 3d45b4f43..949ee276a 100644
--- a/cpp/src/common/statistic.h
+++ b/cpp/src/common/statistic.h
@@ -703,9 +703,15 @@ class Int32Statistic : public Statistic {
             count_ = 1;
             start = 1;
         }
+        // Timestamps are monotonic (verified by TimePageWriter),
+        // so only first/last matter for start_time_/end_time_.
+        if (count > start) {
+            if (timestamps[start] < start_time_)
+                start_time_ = timestamps[start];
+            if (timestamps[count - 1] > end_time_)
+                end_time_ = timestamps[count - 1];
+        }
         for (uint32_t i = start; i < count; i++) {
-            if (timestamps[i] < start_time_) start_time_ = timestamps[i];
-            if (timestamps[i] > end_time_) end_time_ = timestamps[i];
             if (values[i] < min_value_) min_value_ = values[i];
             if (values[i] > max_value_) max_value_ = values[i];
             sum_value_ += (int64_t)values[i];
diff --git a/cpp/src/common/thread_pool.h b/cpp/src/common/thread_pool.h
index 191001bd9..d7aa6f2a4 100644
--- a/cpp/src/common/thread_pool.h
+++ b/cpp/src/common/thread_pool.h
@@ -29,6 +29,8 @@
 #include <thread>
 #include <vector>
 
+#include "common/logger/elog.h"
+
 namespace common {
 
 // Unified fixed-size thread pool supporting both fire-and-forget tasks
@@ -127,8 +129,13 @@ class ThreadPool {
             // tasks where the alternative is termination.
             try {
                 task();
+            } catch (const std::exception& e) {
+                // Suppressed to keep the worker alive and wait_all() unblocked
+                // (see comment above); logged so the failure is not silent.
+                LOGE("ThreadPool worker: task threw std::exception: "
+                     << e.what());
             } catch (...) {
-                // Intentionally suppressed; see comment above.
+                LOGE("ThreadPool worker: task threw a non-standard exception");
             }
             {
                 std::lock_guard<std::mutex> lk(mu_);
diff --git a/cpp/src/compress/uncompressed_compressor.h b/cpp/src/compress/uncompressed_compressor.h
index c342b5001..34fb844ed 100644
--- a/cpp/src/compress/uncompressed_compressor.h
+++ b/cpp/src/compress/uncompressed_compressor.h
@@ -62,6 +62,17 @@ class UncompressedCompressor : public Compressor {
 
     int uncompress(char* compressed_buf, uint32_t compressed_buf_len,
                    char*& uncompressed_buf, uint32_t& uncompressed_buf_len) {
+        // Allocate + copy rather than aliasing compressed_buf, even though the
+        // "uncompressed" bytes equal the input.  Every caller and the leak
+        // safety-net below assume the same ownership contract as the real
+        // compressors: uncompress() returns a heap buffer released by
+        // after_uncompress(), and cached in uncompressed_buf_ so
+        // reset()/destroy()/the dtor can reclaim it when an error path (e.g. a
+        // corrupted page that returns before after_uncompress() runs) would
+        // otherwise leak it.  Aliasing would point uncompressed_buf_ into the
+        // caller's shared page buffer, so those mem_free() calls would free a
+        // mid-buffer pointer -> heap corruption / double free.  A zero-copy
+        // fast path would need an explicit "not owned" flag in the contract.
         char* buf = static_cast<char*>(
             common::mem_alloc(compressed_buf_len, common::MOD_COMPRESSOR_OBJ));
         if (buf == nullptr) {
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc
index 5661927f3..c3e0c1116 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.cc
+++ b/cpp/src/cwrapper/tsfile_cwrapper.cc
@@ -1646,8 +1646,10 @@ ResultSet _tsfile_reader_query_device(TsFileReader reader,
 // Every C-API entry must validate its pointers: a null reader would deref
 // during the static_cast, and null table/column/value would feed std::string
 // a null pointer (UB / crash).
-#define DEFINE_TAG_FILTER_FACTORY(name, method)                               \
-    TagFilterHandle tsfile_tag_filter_##name(                                 \
+// The function-name suffix and the TagFilterBuilder method are always the same
+// operator, so the macro takes a single argument used for both.
+#define DEFINE_TAG_FILTER_FACTORY(op)                                         \
+    TagFilterHandle tsfile_tag_filter_##op(                                   \
         TsFileReader reader, const char* table_name, const char* column_name, \
         const char* value) {                                                  \
         if (reader == nullptr || table_name == nullptr ||                     \
@@ -1658,15 +1660,15 @@ ResultSet _tsfile_reader_query_device(TsFileReader reader,
         auto schema = r->get_table_schema(table_name);                        \
         if (!schema) return nullptr;                                          \
         storage::TagFilterBuilder builder(schema.get());                      \
-        return builder.method(column_name, value);                            \
+        return builder.op(column_name, value);                                \
     }
 
-DEFINE_TAG_FILTER_FACTORY(eq, eq)
-DEFINE_TAG_FILTER_FACTORY(neq, neq)
-DEFINE_TAG_FILTER_FACTORY(lt, lt)
-DEFINE_TAG_FILTER_FACTORY(lteq, lteq)
-DEFINE_TAG_FILTER_FACTORY(gt, gt)
-DEFINE_TAG_FILTER_FACTORY(gteq, gteq)
+DEFINE_TAG_FILTER_FACTORY(eq)
+DEFINE_TAG_FILTER_FACTORY(neq)
+DEFINE_TAG_FILTER_FACTORY(lt)
+DEFINE_TAG_FILTER_FACTORY(lteq)
+DEFINE_TAG_FILTER_FACTORY(gt)
+DEFINE_TAG_FILTER_FACTORY(gteq)
 
 #undef DEFINE_TAG_FILTER_FACTORY
 
diff --git a/cpp/src/encoding/decoder.h b/cpp/src/encoding/decoder.h
index 24455ca01..70b9cde93 100644
--- a/cpp/src/encoding/decoder.h
+++ b/cpp/src/encoding/decoder.h
@@ -155,12 +155,28 @@ class Decoder {
         return common::E_OK;
     }
 
-    // Block-level filter check: peek the next block header and compute
-    // the value range [block_min, block_max] without decoding.
+    // Block-level filter pushdown for TS_2DIFF-encoded INT64 columns.
+    //
+    // TS_2DIFF stores values in self-contained "blocks": a header (value
+    // count + per-delta bit width), then a min-delta and a first value,
+    // then `count` bit-packed delta-of-deltas.  A page may hold several
+    // sequential blocks; the boundary is set by the encoder's batch size.
+    // This call peeks the next block's header (without consuming the packed
+    // payload) and reports the block's value range [block_min, block_max]
+    // and `block_count` — the number of values the block covers (first value
+    // plus the packed deltas).  The caller then either:
+    //   - Call skip_peeked_block_int64() to skip the whole block when a
+    //     filter excludes [block_min, block_max], or
+    //   - Call read_batch_int64(), which reuses the peeked header.
+    //
+    // Implemented only for INT64 because it targets the time column, which is
+    // always INT64 and monotonically increasing.  Monotonicity is what makes
+    // the range recoverable from the header alone (min = first value, max =
+    // the block's last timestamp, obtained by looking ahead to the next
+    // block's first value).  Non-monotonic columns (INT32 / value columns)
+    // can't derive a range cheaply, so they fall back to this default and
+    // decode normally.
     // Returns true if a block was peeked; false if not supported or no data.
-    // After peeking, caller must either:
-    //   - Call skip_peeked_block_int64() to skip the block
-    //   - Call read_batch_int64() which will use the peeked header
     virtual bool peek_next_block_range_int64(common::ByteStream& in,
                                              int64_t& block_min,
                                              int64_t& block_max,
diff --git a/cpp/src/encoding/plain_encoder.h b/cpp/src/encoding/plain_encoder.h
index 84ebee238..1ed2fe12c 100644
--- a/cpp/src/encoding/plain_encoder.h
+++ b/cpp/src/encoding/plain_encoder.h
@@ -24,9 +24,8 @@
 
 #include "encoder.h"
 
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
-#include <arm_neon.h>
-#define TSFILE_HAS_NEON 1
+#ifdef ENABLE_SIMD
+#include "simde/x86/ssse3.h"
 #endif
 
 namespace storage {
@@ -98,12 +97,15 @@ class PlainEncoder : public Encoder {
             uint8_t* dst = (uint8_t*)buf.buf_;
             const int64_t* src = values + offset;
             uint32_t i = 0;
-#if TSFILE_HAS_NEON
-            // NEON: byte-reverse 2 x int64 per iteration
+#ifdef ENABLE_SIMD
+            // SIMDe: byte-reverse 2 x int64 per iteration
+            const simde__m128i bswap64_shuf = simde_mm_set_epi8(
+                8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
             for (; i + 2 <= batch; i += 2) {
-                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
-                v = vrev64q_u8(v);
-                vst1q_u8(dst, v);
+                simde__m128i v = simde_mm_loadu_si128(
+                    (const simde__m128i*)&src[i]);
+                v = simde_mm_shuffle_epi8(v, bswap64_shuf);
+                simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;
             }
 #endif
@@ -142,12 +144,15 @@ class PlainEncoder : public Encoder {
             uint8_t* dst = (uint8_t*)buf.buf_;
             const double* src = values + offset;
             uint32_t i = 0;
-#if TSFILE_HAS_NEON
-            // NEON byte-reverse of raw bytes works for double bits too.
+#ifdef ENABLE_SIMD
+            // SIMDe: byte-reverse 2 x double (64-bit) per iteration
+            const simde__m128i bswap64_shuf = simde_mm_set_epi8(
+                8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
             for (; i + 2 <= batch; i += 2) {
-                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
-                v = vrev64q_u8(v);
-                vst1q_u8(dst, v);
+                simde__m128i v = simde_mm_loadu_si128(
+                    (const simde__m128i*)&src[i]);
+                v = simde_mm_shuffle_epi8(v, bswap64_shuf);
+                simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;
             }
 #endif
@@ -189,12 +194,15 @@ class PlainEncoder : public Encoder {
             uint8_t* dst = (uint8_t*)buf.buf_;
             const float* src = values + offset;
             uint32_t i = 0;
-#if TSFILE_HAS_NEON
-            // NEON: byte-reverse 4 x float (32-bit) per iteration
+#ifdef ENABLE_SIMD
+            // SIMDe: byte-reverse 4 x float (32-bit) per iteration
+            const simde__m128i bswap32_shuf = simde_mm_set_epi8(
+                12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
             for (; i + 4 <= batch; i += 4) {
-                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
-                v = vrev32q_u8(v);
-                vst1q_u8(dst, v);
+                simde__m128i v = simde_mm_loadu_si128(
+                    (const simde__m128i*)&src[i]);
+                v = simde_mm_shuffle_epi8(v, bswap32_shuf);
+                simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;
             }
 #endif
diff --git a/cpp/src/encoding/ts2diff_decoder.h b/cpp/src/encoding/ts2diff_decoder.h
index bc6e89613..224d7402e 100644
--- a/cpp/src/encoding/ts2diff_decoder.h
+++ b/cpp/src/encoding/ts2diff_decoder.h
@@ -793,6 +793,12 @@ template <>
 inline bool TS2DIFFDecoder<int64_t>::peek_next_block_range_int64(
     common::ByteStream& in, int64_t& block_min, int64_t& block_max,
     int& block_count) {
+    // Precondition: this must only be driven on the monotonically-increasing
+    // time column (all callers invoke it via time_decoder_).  The block_min =
+    // first_value / block_max = next-block-first_value shortcuts below rely on
+    // that ordering; an unsorted INT64 *value* column that happens to use
+    // TS_2DIFF would get a wrong range here, so it must never be called on a
+    // value decoder (value decoders decode normally and never call this).
     if (current_index_ != 0 || !has_remaining(in)) return false;
 
     read_header(in);
diff --git a/cpp/src/file/tsfile_io_reader.cc b/cpp/src/file/tsfile_io_reader.cc
index 014e78832..f9494a74d 100644
--- a/cpp/src/file/tsfile_io_reader.cc
+++ b/cpp/src/file/tsfile_io_reader.cc
@@ -418,10 +418,14 @@ int TsFileIOReader::get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
     const int64_t read_size_i64 = end_offset - start_offset;
     // read_file_->read() takes int32_t; a meta index node larger than 2 GiB
     // is implausible but explicitly reject it instead of silently truncating
-    // the read length and corrupting the parse.
-    if (read_size_i64 <= 0 || read_size_i64 > INT32_MAX) {
+    // the read length and corrupting the parse.  Distinguish the two cases:
+    // an inverted/empty range is corruption, an oversized one is an overflow.
+    if (read_size_i64 <= 0) {
         return E_TSFILE_CORRUPTED;
     }
+    if (read_size_i64 > INT32_MAX) {
+        return E_OVERFLOW;
+    }
     const int32_t read_size = static_cast<int32_t>(read_size_i64);
     int32_t ret_read_len = 0;
 
diff --git a/cpp/src/file/tsfile_io_writer.cc b/cpp/src/file/tsfile_io_writer.cc
index 71bb08a7e..8c207ca82 100644
--- a/cpp/src/file/tsfile_io_writer.cc
+++ b/cpp/src/file/tsfile_io_writer.cc
@@ -126,13 +126,10 @@ int TsFileIOWriter::start_flush_chunk_group(
     cur_device_name_ = device_name;
     ASSERT(cur_chunk_group_meta_ == nullptr);
     use_prev_alloc_cgm_ = false;
-    for (auto iter = chunk_group_meta_list_.begin();
-         iter != chunk_group_meta_list_.end(); iter++) {
-        if (*iter.get()->device_id_ == *cur_device_name_) {
-            use_prev_alloc_cgm_ = true;
-            cur_chunk_group_meta_ = iter.get();
-            break;
-        }
+    auto idx_it = chunk_group_meta_index_.find(cur_device_name_);
+    if (idx_it != chunk_group_meta_index_.end()) {
+        use_prev_alloc_cgm_ = true;
+        cur_chunk_group_meta_ = idx_it->second;
     }
     if (!use_prev_alloc_cgm_) {
         void* buf = meta_allocator_.alloc(sizeof(*cur_chunk_group_meta_));
@@ -256,8 +253,9 @@ int TsFileIOWriter::end_flush_chunk_group(bool is_aligned) {
         cur_chunk_group_meta_ = nullptr;
         return common::E_OK;
     }
-    chunk_group_meta_index_[cur_device_name_->get_device_name()] =
-        cur_chunk_group_meta_;
+    // First CGM per device wins (emplace, no overwrite); reached only when the
+    // device was not already present, so this records its first CGM.
+    chunk_group_meta_index_.emplace(cur_device_name_, cur_chunk_group_meta_);
     int ret = chunk_group_meta_list_.push_back(cur_chunk_group_meta_);
     cur_chunk_group_meta_ = nullptr;
     return ret;
diff --git a/cpp/src/file/tsfile_io_writer.h b/cpp/src/file/tsfile_io_writer.h
index 4904b924a..f041a1c57 100644
--- a/cpp/src/file/tsfile_io_writer.h
+++ b/cpp/src/file/tsfile_io_writer.h
@@ -21,7 +21,6 @@
 #define FILE_TSFILE_IO_WRITER_H
 
 #include <map>
-#include <unordered_map>
 #include <vector>
 
 #include "common/allocator/page_arena.h"
@@ -194,7 +193,11 @@ class TsFileIOWriter {
     void push_chunk_group_meta(ChunkGroupMeta* cgm) {
         chunk_group_meta_list_.push_back(cgm);
         if (cgm->device_id_) {
-            chunk_group_meta_index_[cgm->device_id_->get_device_name()] = cgm;
+            // First CGM per device wins, matching the previous linear scan
+            // (which returned the earliest match in list order).  Recovery may
+            // push several CGMs for one device; the lookup must resolve to the
+            // first so reuse targets the same CGM the scan did.
+            chunk_group_meta_index_.emplace(cgm->device_id_, cgm);
         }
     }
     /** Chunks/CGMs allocated from meta_allocator_ via start_flush_chunk*()
@@ -222,9 +225,14 @@ class TsFileIOWriter {
     ChunkGroupMeta* cur_chunk_group_meta_;
     int32_t chunk_meta_count_;  // for debug
     common::SimpleList<ChunkGroupMeta*> chunk_group_meta_list_;
-    // O(1) lookup for existing ChunkGroupMeta by device name, avoiding the
-    // O(N) linear scan through chunk_group_meta_list_ per device.
-    std::unordered_map<std::string, ChunkGroupMeta*> chunk_group_meta_index_;
+    // O(log N) lookup for an existing ChunkGroupMeta by device id, replacing
+    // the O(N) linear scan through chunk_group_meta_list_ per device.  Keyed by
+    // IDeviceID *content* (IDeviceIDComparator compares segment-by-segment), so
+    // distinct devices whose joined name strings would collide — e.g.
+    // ("a.","b") and ("a",".b") both render as "a..b" via get_device_name() —
+    // stay separate.
+    std::map<std::shared_ptr<IDeviceID>, ChunkGroupMeta*, IDeviceIDComparator>
+        chunk_group_meta_index_;
     bool use_prev_alloc_cgm_;  // chunk group meta
     std::shared_ptr<IDeviceID> cur_device_name_;
     WriteFile* file_;
diff --git a/cpp/src/reader/aligned_chunk_reader.cc b/cpp/src/reader/aligned_chunk_reader.cc
index 7e2bda41e..7d6eff167 100644
--- a/cpp/src/reader/aligned_chunk_reader.cc
+++ b/cpp/src/reader/aligned_chunk_reader.cc
@@ -21,6 +21,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <type_traits>
 
 #include "common/global.h"
 #ifdef ENABLE_THREADS
@@ -755,300 +756,58 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(
     return ret;
 }
 
-int AlignedChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in,
-                                            ByteStream& value_in,
-                                            RowAppender& row_appender,
-                                            Filter* filter) {
-    int ret = E_OK;
-    const int BATCH = 129;
-    int64_t times[BATCH];
-    int32_t values[BATCH];
-    const uint32_t null_mask_base = 1 << 7;
-
-    while (time_decoder_->has_remaining(time_in)) {
-        if (row_appender.remaining() < (uint32_t)BATCH) {
-            ret = E_OVERFLOW;
-            break;
-        }
-
-        // Block-level time filter check
-        bool block_all_pass = false;
-        if (filter != nullptr) {
-            int64_t block_min, block_max;
-            int block_count;
-            if (time_decoder_->peek_next_block_range_int64(
-                    time_in, block_min, block_max, block_count)) {
-                if (!filter->satisfy_start_end_time(block_min, block_max)) {
-                    int skipped = 0;
-                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
-                    int nonnull = 0;
-                    for (int i = 0; i < block_count; ++i) {
-                        int vi = cur_value_index + 1 + i;
-                        if (!value_page_col_notnull_bitmap_.empty() &&
-                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                             (null_mask_base >> (vi % 8))) != 0) {
-                            ++nonnull;
-                        }
-                    }
-                    cur_value_index += block_count;
-                    if (nonnull > 0) {
-                        // skip_* may legitimately fail (truncated page) or
-                        // short-read (corrupt bitmap vs. data); both must
-                        // abort the loop rather than silently desync the
-                        // value decoder.  Same defect the multi-value path
-                        // already guards against.
-                        int sk = 0;
-                        if (RET_FAIL(value_decoder_->skip_int32(nonnull, sk,
-                                                                value_in))) {
-                            break;
-                        }
-                        if (sk != nonnull) {
-                            ret = E_TSFILE_CORRUPTED;
-                            break;
-                        }
-                    }
-                    continue;
-                }
-                if (filter->contain_start_end_time(block_min, block_max)) {
-                    block_all_pass = true;
-                }
-            }
-        }
-
-        int time_count = 0;
-        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
-                                                     time_in))) {
-            break;
-        }
-        if (time_count == 0) break;
-
-        bool is_null[BATCH];
-        int nonnull_count = 0;
-        for (int i = 0; i < time_count; ++i) {
-            int vi = cur_value_index + 1 + i;
-            if (value_page_col_notnull_bitmap_.empty() ||
-                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                 (null_mask_base >> (vi % 8))) == 0) {
-                is_null[i] = true;
-            } else {
-                is_null[i] = false;
-                ++nonnull_count;
-            }
-        }
-
-        bool time_mask[BATCH];
-        int pass_count = time_count;
-        if (filter != nullptr && !block_all_pass) {
-            pass_count =
-                filter->satisfy_batch_time(times, time_count, time_mask);
-        }
-
-        if (pass_count == 0) {
-            if (nonnull_count > 0) {
-                int skipped = 0;
-                if (RET_FAIL(value_decoder_->skip_int32(nonnull_count, skipped,
-                                                        value_in))) {
-                    break;
-                }
-                if (skipped != nonnull_count) {
-                    ret = E_TSFILE_CORRUPTED;
-                    break;
-                }
-            }
-            cur_value_index += time_count;
-            continue;
-        }
-
-        int value_count = 0;
-        if (nonnull_count > 0) {
-            if (RET_FAIL(value_decoder_->read_batch_int32(
-                    values, nonnull_count, value_count, value_in))) {
-                break;
-            }
-        }
-
-        int val_idx = 0;
-        for (int i = 0; i < time_count; ++i) {
-            cur_value_index++;
-            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
-                if (!is_null[i]) ++val_idx;
-                continue;
-            }
-            if (is_null[i]) {
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append_null(1);
-            } else {
-                int32_t val = values[val_idx++];
-                if (filter != nullptr && !block_all_pass &&
-                    !filter->satisfy(times[i], (int64_t)val)) {
-                    continue;
-                }
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append(1, (char*)&val, sizeof(int32_t));
-            }
-        }
-        if (ret != E_OK) break;
-    }
-    return ret;
+namespace {
+// Type-dispatched value batch read / skip for decode_tv_batch<T>.  Overload
+// resolution on the value pointer type selects the matching Decoder method, so
+// the four fixed-width value types share one decode loop.
+FORCE_INLINE int read_value_batch_typed(Decoder* d, int32_t* out, int cap,
+                                        int& actual, ByteStream& in) {
+    return d->read_batch_int32(out, cap, actual, in);
 }
-
-int AlignedChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in,
-                                            ByteStream& value_in,
-                                            RowAppender& row_appender,
-                                            Filter* filter) {
-    int ret = E_OK;
-    const int BATCH = 129;
-    int64_t times[BATCH];
-    int64_t values[BATCH];
-    const uint32_t null_mask_base = 1 << 7;
-
-    while (time_decoder_->has_remaining(time_in)) {
-        if (row_appender.remaining() < (uint32_t)BATCH) {
-            ret = E_OVERFLOW;
-            break;
-        }
-
-        // Block-level time filter check: skip entire block if out of range
-        bool block_all_pass = false;
-        if (filter != nullptr) {
-            int64_t block_min, block_max;
-            int block_count;
-            if (time_decoder_->peek_next_block_range_int64(
-                    time_in, block_min, block_max, block_count)) {
-                if (!filter->satisfy_start_end_time(block_min, block_max)) {
-                    int skipped = 0;
-                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
-                    int nonnull = 0;
-                    for (int i = 0; i < block_count; ++i) {
-                        int vi = cur_value_index + 1 + i;
-                        if (!value_page_col_notnull_bitmap_.empty() &&
-                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                             (null_mask_base >> (vi % 8))) != 0) {
-                            ++nonnull;
-                        }
-                    }
-                    cur_value_index += block_count;
-                    if (nonnull > 0) {
-                        // See i32 path above for the rationale.
-                        int sk = 0;
-                        if (RET_FAIL(value_decoder_->skip_int64(nonnull, sk,
-                                                                value_in))) {
-                            break;
-                        }
-                        if (sk != nonnull) {
-                            ret = E_TSFILE_CORRUPTED;
-                            break;
-                        }
-                    }
-                    continue;
-                }
-                if (filter->contain_start_end_time(block_min, block_max)) {
-                    block_all_pass = true;
-                }
-            }
-        }
-
-        int time_count = 0;
-        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
-                                                     time_in))) {
-            break;
-        }
-        if (time_count == 0) break;
-
-        bool is_null[BATCH];
-        int nonnull_count = 0;
-        for (int i = 0; i < time_count; ++i) {
-            int vi = cur_value_index + 1 + i;
-            if (value_page_col_notnull_bitmap_.empty() ||
-                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                 (null_mask_base >> (vi % 8))) == 0) {
-                is_null[i] = true;
-            } else {
-                is_null[i] = false;
-                ++nonnull_count;
-            }
-        }
-
-        bool time_mask[BATCH];
-        int pass_count = time_count;
-        if (filter != nullptr && !block_all_pass) {
-            pass_count =
-                filter->satisfy_batch_time(times, time_count, time_mask);
-        }
-
-        if (pass_count == 0) {
-            if (nonnull_count > 0) {
-                int skipped = 0;
-                if (RET_FAIL(value_decoder_->skip_int64(nonnull_count, skipped,
-                                                        value_in))) {
-                    break;
-                }
-                if (skipped != nonnull_count) {
-                    ret = E_TSFILE_CORRUPTED;
-                    break;
-                }
-            }
-            cur_value_index += time_count;
-            continue;
-        }
-
-        int value_count = 0;
-        if (nonnull_count > 0) {
-            if (RET_FAIL(value_decoder_->read_batch_int64(
-                    values, nonnull_count, value_count, value_in))) {
-                break;
-            }
-        }
-
-        int val_idx = 0;
-        for (int i = 0; i < time_count; ++i) {
-            cur_value_index++;
-            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
-                if (!is_null[i]) ++val_idx;
-                continue;
-            }
-            if (is_null[i]) {
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append_null(1);
-            } else {
-                int64_t val = values[val_idx++];
-                if (filter != nullptr && !block_all_pass &&
-                    !filter->satisfy(times[i], val)) {
-                    continue;
-                }
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append(1, (char*)&val, sizeof(int64_t));
-            }
-        }
-        if (ret != E_OK) break;
-    }
-    return ret;
+FORCE_INLINE int read_value_batch_typed(Decoder* d, int64_t* out, int cap,
+                                        int& actual, ByteStream& in) {
+    return d->read_batch_int64(out, cap, actual, in);
 }
-
-int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
-                                              ByteStream& value_in,
-                                              RowAppender& row_appender,
-                                              Filter* filter) {
+FORCE_INLINE int read_value_batch_typed(Decoder* d, float* out, int cap,
+                                        int& actual, ByteStream& in) {
+    return d->read_batch_float(out, cap, actual, in);
+}
+FORCE_INLINE int read_value_batch_typed(Decoder* d, double* out, int cap,
+                                        int& actual, ByteStream& in) {
+    return d->read_batch_double(out, cap, actual, in);
+}
+FORCE_INLINE int skip_value_typed(Decoder* d, int32_t*, int n, int& skipped,
+                                  ByteStream& in) {
+    return d->skip_int32(n, skipped, in);
+}
+FORCE_INLINE int skip_value_typed(Decoder* d, int64_t*, int n, int& skipped,
+                                  ByteStream& in) {
+    return d->skip_int64(n, skipped, in);
+}
+FORCE_INLINE int skip_value_typed(Decoder* d, float*, int n, int& skipped,
+                                  ByteStream& in) {
+    return d->skip_float(n, skipped, in);
+}
+FORCE_INLINE int skip_value_typed(Decoder* d, double*, int n, int& skipped,
+                                  ByteStream& in) {
+    return d->skip_double(n, skipped, in);
+}
+}  // namespace
+
+// Unified aligned time+value page decode for fixed-width value types
+// (INT32/INT64/FLOAT/DOUBLE).  These differ only in the value array type, the
+// typed read/skip calls (dispatched via the helpers above), and whether the
+// per-value Filter::satisfy (which takes an int64 value) is applied — only
+// integral value columns use it; float/double are filtered on time only.
+template <typename T>
+int AlignedChunkReader::decode_tv_batch(ByteStream& time_in,
+                                        ByteStream& value_in,
+                                        RowAppender& row_appender,
+                                        Filter* filter) {
     int ret = E_OK;
     const int BATCH = 129;
     int64_t times[BATCH];
-    float values[BATCH];
+    T values[BATCH];
     const uint32_t null_mask_base = 1 << 7;
 
     while (time_decoder_->has_remaining(time_in)) {
@@ -1057,7 +816,7 @@ int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
             break;
         }
 
-        // Block-level time filter check
+        // Block-level time filter check: skip entire block if out of range.
         bool block_all_pass = false;
         if (filter != nullptr) {
             int64_t block_min, block_max;
@@ -1078,10 +837,13 @@ int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
                     }
                     cur_value_index += block_count;
                     if (nonnull > 0) {
-                        // See i32 path above for the rationale.
+                        // skip_* may legitimately fail (truncated page) or
+                        // short-read (corrupt bitmap vs. data); both must abort
+                        // the loop rather than silently desync the value
+                        // decoder.
                         int sk = 0;
-                        if (RET_FAIL(value_decoder_->skip_float(nonnull, sk,
-                                                                value_in))) {
+                        if (RET_FAIL(skip_value_typed(value_decoder_, values,
+                                                      nonnull, sk, value_in))) {
                             break;
                         }
                         if (sk != nonnull) {
@@ -1128,8 +890,9 @@ int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
         if (pass_count == 0) {
             if (nonnull_count > 0) {
                 int skipped = 0;
-                if (RET_FAIL(value_decoder_->skip_float(nonnull_count, skipped,
-                                                        value_in))) {
+                if (RET_FAIL(skip_value_typed(value_decoder_, values,
+                                              nonnull_count, skipped,
+                                              value_in))) {
                     break;
                 }
                 if (skipped != nonnull_count) {
@@ -1143,8 +906,9 @@ int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
 
         int value_count = 0;
         if (nonnull_count > 0) {
-            if (RET_FAIL(value_decoder_->read_batch_float(
-                    values, nonnull_count, value_count, value_in))) {
+            if (RET_FAIL(read_value_batch_typed(value_decoder_, values,
+                                                nonnull_count, value_count,
+                                                value_in))) {
                 break;
             }
         }
@@ -1164,150 +928,22 @@ int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
                 row_appender.append(0, (char*)&times[i], sizeof(int64_t));
                 row_appender.append_null(1);
             } else {
-                float val = values[val_idx++];
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append(1, (char*)&val, sizeof(float));
-            }
-        }
-        if (ret != E_OK) break;
-    }
-    return ret;
-}
-
-int AlignedChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in,
-                                               ByteStream& value_in,
-                                               RowAppender& row_appender,
-                                               Filter* filter) {
-    int ret = E_OK;
-    const int BATCH = 129;
-    int64_t times[BATCH];
-    double values[BATCH];
-    const uint32_t null_mask_base = 1 << 7;
-
-    while (time_decoder_->has_remaining(time_in)) {
-        if (row_appender.remaining() < (uint32_t)BATCH) {
-            ret = E_OVERFLOW;
-            break;
-        }
-
-        // Block-level time filter check
-        bool block_all_pass = false;
-        if (filter != nullptr) {
-            int64_t block_min, block_max;
-            int block_count;
-            if (time_decoder_->peek_next_block_range_int64(
-                    time_in, block_min, block_max, block_count)) {
-                if (!filter->satisfy_start_end_time(block_min, block_max)) {
-                    int skipped = 0;
-                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
-                    int nonnull = 0;
-                    for (int i = 0; i < block_count; ++i) {
-                        int vi = cur_value_index + 1 + i;
-                        if (!value_page_col_notnull_bitmap_.empty() &&
-                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                             (null_mask_base >> (vi % 8))) != 0) {
-                            ++nonnull;
-                        }
-                    }
-                    cur_value_index += block_count;
-                    if (nonnull > 0) {
-                        // See i32 path above for the rationale.
-                        int sk = 0;
-                        if (RET_FAIL(value_decoder_->skip_double(nonnull, sk,
-                                                                 value_in))) {
-                            break;
-                        }
-                        if (sk != nonnull) {
-                            ret = E_TSFILE_CORRUPTED;
-                            break;
-                        }
-                    }
+                T val = values[val_idx++];
+                // Per-value filter applies only to integral value columns;
+                // Filter::satisfy takes an int64 value.  is_integral<T> is a
+                // compile-time constant, so this branch is elided (and the
+                // int64 cast never evaluated) for float/double.
+                if (std::is_integral<T>::value && filter != nullptr &&
+                    !block_all_pass &&
+                    !filter->satisfy(times[i], static_cast<int64_t>(val))) {
                     continue;
                 }
-                if (filter->contain_start_end_time(block_min, block_max)) {
-                    block_all_pass = true;
-                }
-            }
-        }
-
-        int time_count = 0;
-        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
-                                                     time_in))) {
-            break;
-        }
-        if (time_count == 0) break;
-
-        bool is_null[BATCH];
-        int nonnull_count = 0;
-        for (int i = 0; i < time_count; ++i) {
-            int vi = cur_value_index + 1 + i;
-            if (value_page_col_notnull_bitmap_.empty() ||
-                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
-                 (null_mask_base >> (vi % 8))) == 0) {
-                is_null[i] = true;
-            } else {
-                is_null[i] = false;
-                ++nonnull_count;
-            }
-        }
-
-        bool time_mask[BATCH];
-        int pass_count = time_count;
-        if (filter != nullptr && !block_all_pass) {
-            pass_count =
-                filter->satisfy_batch_time(times, time_count, time_mask);
-        }
-
-        if (pass_count == 0) {
-            if (nonnull_count > 0) {
-                int skipped = 0;
-                if (RET_FAIL(value_decoder_->skip_double(nonnull_count, skipped,
-                                                         value_in))) {
-                    break;
-                }
-                if (skipped != nonnull_count) {
-                    ret = E_TSFILE_CORRUPTED;
-                    break;
-                }
-            }
-            cur_value_index += time_count;
-            continue;
-        }
-
-        int value_count = 0;
-        if (nonnull_count > 0) {
-            if (RET_FAIL(value_decoder_->read_batch_double(
-                    values, nonnull_count, value_count, value_in))) {
-                break;
-            }
-        }
-
-        int val_idx = 0;
-        for (int i = 0; i < time_count; ++i) {
-            cur_value_index++;
-            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
-                if (!is_null[i]) ++val_idx;
-                continue;
-            }
-            if (is_null[i]) {
                 if (UNLIKELY(!row_appender.add_row())) {
                     ret = E_OVERFLOW;
                     break;
                 }
                 row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append_null(1);
-            } else {
-                double val = values[val_idx++];
-                if (UNLIKELY(!row_appender.add_row())) {
-                    ret = E_OVERFLOW;
-                    break;
-                }
-                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
-                row_appender.append(1, (char*)&val, sizeof(double));
+                row_appender.append(1, (char*)&val, sizeof(T));
             }
         }
         if (ret != E_OK) break;
@@ -1330,21 +966,21 @@ int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype(
             // Batch decode path: read_batch_int{32,64} consumes whole TS_2DIFF
             // blocks at once (and uses SIMD when ENABLE_SIMD); replaces a
             // per-value decode() loop that hot-dominated the read flame graph.
-            ret =
-                i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
+            ret = decode_tv_batch<int32_t>(time_in_, value_in_, row_appender,
+                                           filter);
             break;
         case common::TIMESTAMP:
         case common::INT64:
-            ret =
-                i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
+            ret = decode_tv_batch<int64_t>(time_in_, value_in_, row_appender,
+                                           filter);
             break;
         case common::FLOAT:
-            ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
-                                        filter);
+            ret = decode_tv_batch<float>(time_in_, value_in_, row_appender,
+                                         filter);
             break;
         case common::DOUBLE:
-            ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
-                                         filter);
+            ret = decode_tv_batch<double>(time_in_, value_in_, row_appender,
+                                          filter);
             break;
         case common::STRING:
         case common::BLOB:
@@ -1973,9 +1609,8 @@ int AlignedChunkReader::decode_value_page_for_slot(uint32_t col_idx,
         return E_TSFILE_CORRUPTED;
     }
     pps.notnull_bitmap.resize(bitmap_bytes);
-    for (size_t i = 0; i < pps.notnull_bitmap.size(); i++) {
-        pps.notnull_bitmap[i] = *(uncompressed_buf + offset++);
-    }
+    memcpy(pps.notnull_bitmap.data(), uncompressed_buf + offset, bitmap_bytes);
+    offset += bitmap_bytes;
 
     char* value_buf = uncompressed_buf + offset;
     uint32_t value_buf_size = uncompressed_size - offset;
@@ -2345,6 +1980,9 @@ int AlignedChunkReader::get_next_page_multi(TsBlock* ret_tsblock,
                 for (uint32_t c = 0; c < num_cols; c++) {
                     auto* col = value_columns_[c];
                     auto& pps = col->per_page_state[current_page_plan_index_];
+                    // An empty notnull_bitmap means this column carried no data
+                    // for the page (a missing / sparse aligned measurement), so
+                    // every row is null; otherwise consult the per-row bit.
                     bool is_null = true;
                     if (!pps.notnull_bitmap.empty()) {
                         is_null =
@@ -2539,10 +2177,8 @@ int AlignedChunkReader::decode_cur_value_page_data_for(ValueColumnState& col) {
     uint32_t bitmap_bytes = (data_num + 7) / 8;
     if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
     col.notnull_bitmap.resize(bitmap_bytes);
-    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
-        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
-        offset++;
-    }
+    memcpy(col.notnull_bitmap.data(), uncompressed_buf + offset, bitmap_bytes);
+    offset += bitmap_bytes;
     col.cur_value_index = -1;
 
     char* value_buf = uncompressed_buf + offset;
@@ -2605,10 +2241,8 @@ int AlignedChunkReader::decompress_and_parse_value_page(ValueColumnState& col,
     uint32_t bitmap_bytes = (data_num + 7) / 8;
     if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
     col.notnull_bitmap.resize(bitmap_bytes);
-    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
-        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
-        offset++;
-    }
+    memcpy(col.notnull_bitmap.data(), uncompressed_buf + offset, bitmap_bytes);
+    offset += bitmap_bytes;
     col.cur_value_index = -1;
 
     char* value_buf = uncompressed_buf + offset;
@@ -2783,7 +2417,7 @@ int AlignedChunkReader::multi_DECODE_TV_BATCH(TsBlock* ret_tsblock,
             // bufs are owned by the caller-provided PageArena.
             std::vector<common::String> str_vals;
         };
-        // Allocate on heap if many columns, stack for small counts
+        // One ColBatch per value column, heap-allocated for the batch.
         std::vector<ColBatch> col_batches(num_cols);
 
         for (uint32_t c = 0; c < num_cols; c++) {
@@ -2974,17 +2608,17 @@ int AlignedChunkReader::multi_DECODE_TV_BATCH(TsBlock* ret_tsblock,
         // columns have variable-width payload and live in cb.str_vals, not
         // cb.val_buf, so they must take the slow scatter path.
         if (pass_count == time_count) {
-            bool all_nonnull = true;
+            bool all_nonnull_and_fixed_size = true;
             for (uint32_t c = 0; c < num_cols; c++) {
                 auto dt = value_columns_[c]->chunk_header.data_type_;
                 if (col_batches[c].nonnull_count != time_count ||
                     dt == common::STRING || dt == common::TEXT ||
                     dt == common::BLOB) {
-                    all_nonnull = false;
+                    all_nonnull_and_fixed_size = false;
                     break;
                 }
             }
-            if (all_nonnull) {
+            if (all_nonnull_and_fixed_size) {
                 // Batch append time column (bytes + row count); see the
                 // chunk-level bulk path above for why add_row_nums() is
                 // required alongside append_fixed_value().
diff --git a/cpp/src/reader/aligned_chunk_reader.h b/cpp/src/reader/aligned_chunk_reader.h
index 69ce48f4a..b92c4d7b5 100644
--- a/cpp/src/reader/aligned_chunk_reader.h
+++ b/cpp/src/reader/aligned_chunk_reader.h
@@ -221,20 +221,13 @@ class AlignedChunkReader : public IChunkReader {
                                          common::ByteStream& value_in,
                                          common::RowAppender& row_appender,
                                          Filter* filter);
-    int i32_DECODE_TV_BATCH(common::ByteStream& time_in,
-                            common::ByteStream& value_in,
-                            common::RowAppender& row_appender, Filter* filter);
-    int i64_DECODE_TV_BATCH(common::ByteStream& time_in,
-                            common::ByteStream& value_in,
-                            common::RowAppender& row_appender, Filter* filter);
-    int float_DECODE_TV_BATCH(common::ByteStream& time_in,
-                              common::ByteStream& value_in,
-                              common::RowAppender& row_appender,
-                              Filter* filter);
-    int double_DECODE_TV_BATCH(common::ByteStream& time_in,
-                               common::ByteStream& value_in,
-                               common::RowAppender& row_appender,
-                               Filter* filter);
+    // Unified fixed-width aligned time+value page decode for
+    // INT32/INT64/FLOAT/DOUBLE.  Defined in the .cc; instantiated there for
+    // each value type by decode_tv_buf_into_tsblock_by_datatype().
+    template <typename T>
+    int decode_tv_batch(common::ByteStream& time_in,
+                        common::ByteStream& value_in,
+                        common::RowAppender& row_appender, Filter* filter);
     int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in,
                                             common::ByteStream& value_in,
                                             common::RowAppender& row_appender,
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc
index 5fb9d80d2..f0249578a 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.cc
+++ b/cpp/src/reader/block/single_device_tsblock_reader.cc
@@ -69,8 +69,8 @@ int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
     // (see TsFileIOWriter / TimeseriesIndex::deserialize_from); when the
     // chunk-level statistic is null, fall back to the TimeseriesIndex's
     // top-level statistic, which summarizes that lone chunk.
-    auto chunk_count = [](const common::SimpleList<ChunkMeta*>& list,
-                          Statistic* fallback) -> int64_t {
+    auto count_chunk_points = [](const common::SimpleList<ChunkMeta*>& list,
+                                 Statistic* fallback) -> int64_t {
         int64_t total = 0;
         int nchunks = 0;
         for (auto it = list.begin(); it != list.end(); it++) {
@@ -114,14 +114,14 @@ int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
                 aligned_ti->value_ts_idx_ != nullptr
                     ? aligned_ti->value_ts_idx_->get_statistic()
                     : nullptr;
-            time_count = chunk_count(*time_list, time_top_stat);
-            value_count = chunk_count(*value_list, value_top_stat);
+            time_count = count_chunk_points(*time_list, time_top_stat);
+            value_count = count_chunk_points(*value_list, value_top_stat);
         } else {
             auto* list = ts_index->get_chunk_meta_list();
             if (list == nullptr) {
                 return -1;
             }
-            time_count = chunk_count(*list, ts_index->get_statistic());
+            time_count = count_chunk_points(*list, ts_index->get_statistic());
             value_count = time_count;
         }
 
@@ -186,6 +186,10 @@ int SingleDeviceTsBlockReader::init_internal(DeviceQueryTask* device_query_task,
     // count across time + value chunks), bulk-copy from SSI tsblock to caller
     // tsblock instead of per-row merging.  compute_dense_row_count() returns
     // -1 if the device is not provably dense, which gates safety.
+    // Compile-time kill-switch for the dense aligned fast path below: flip to
+    // false to force the safe per-row merge path when debugging a suspected
+    // fast-path correctness issue.  The real gating is the runtime conditions
+    // at the use site (dense_row_count_ >= 0, all columns aligned).
     const bool enable_dense_aligned_fast_path = true;
     // Early device-level time skip: if time_filter is set and ALL chunks of
     // this device have statistics that fall outside the filter range, skip the
@@ -204,6 +208,9 @@ int SingleDeviceTsBlockReader::init_internal(DeviceQueryTask* device_query_task,
                                    ? ts_idx->get_time_chunk_meta_list()
                                    : ts_idx->get_chunk_meta_list();
             if (chunk_list == nullptr) {
+                // No chunk metadata for this column means we can't prove it
+                // lies outside the filter, so the device can't be safely
+                // skipped.  The decision is final, so stop scanning.
                 all_outside = false;
                 break;
             }
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc
index 538b00d43..d04d042a3 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.cc
+++ b/cpp/src/reader/tsfile_series_scan_iterator.cc
@@ -367,7 +367,9 @@ int TsFileSeriesScanIterator::init_chunk_reader_multi() {
 TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     ChunkHeader& ch = chunk_reader_->get_chunk_header();
 
-    // TODO config
+    // Time column encoding/compression are placeholders: this ColumnSchema
+    // describes the already-decoded in-memory result TsBlock, where only
+    // data_type (always INT64 for time) is used (see alloc_tsblock_multi).
     ColumnSchema time_cd("time", common::INT64, common::SNAPPY,
                          common::TS_2DIFF);
     ColumnSchema value_cd(ch.measurement_name_, ch.data_type_,
@@ -387,7 +389,10 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
 TsBlock* TsFileSeriesScanIterator::alloc_tsblock_multi() {
     auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
 
-    // Time column
+    // Time column.  The encoding/compression fields only matter for on-disk
+    // serialization; this ColumnSchema describes the already-decoded in-memory
+    // result TsBlock, where only data_type (always INT64 for time) is used, so
+    // the encoding/compression are placeholders.
     ColumnSchema time_cd("time", common::INT64, common::SNAPPY,
                          common::TS_2DIFF);
     tuple_desc_.push_back(time_cd);
diff --git a/cpp/src/writer/time_chunk_writer.h b/cpp/src/writer/time_chunk_writer.h
index e6b2894e2..f7360c369 100644
--- a/cpp/src/writer/time_chunk_writer.h
+++ b/cpp/src/writer/time_chunk_writer.h
@@ -98,10 +98,10 @@ class TimeChunkWriter {
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
     FORCE_INLINE int32_t num_of_pages() const { return num_of_pages_; }
 
-    int64_t estimate_max_series_mem_size();
-
     bool hasData();
 
+    int64_t estimate_max_series_mem_size();
+
     // Current (unsealed) page point count.
     FORCE_INLINE uint32_t get_point_numer() const {
         return time_page_writer_.get_point_numer();
diff --git a/cpp/src/writer/tsfile_writer.cc b/cpp/src/writer/tsfile_writer.cc
index c469faaec..19efc1e34 100644
--- a/cpp/src/writer/tsfile_writer.cc
+++ b/cpp/src/writer/tsfile_writer.cc
@@ -1209,6 +1209,12 @@ int TsFileWriter::write_table(Tablet& tablet) {
                     return ret;
                 }
 
+                // device_ctx_index tracks devices seen in *this* tablet, but
+                // do_check_schema_table returns the device's persistent chunk
+                // writer, which may already hold points from earlier
+                // tablets/records in the same un-flushed chunk group — so
+                // time_cur_points can be > 0 even on first sight in this
+                // tablet.
                 uint32_t time_cur_points = time_chunk_writer->get_point_numer();
                 if (time_cur_points >= page_max_points) {
                     // Seal the time page first, then every value page in
@@ -1266,6 +1272,11 @@ int TsFileWriter::write_table(Tablet& tablet) {
                 uint32_t initial_page_points) -> int {
             int r = E_OK;
             tcw->set_enable_page_seal_if_full(false);
+            // The caller seals and resets time_cur_points to 0 once it reaches
+            // page_max_points, so initial_page_points is always in
+            // [0, page_max_points): >0 means a partial page (room is the
+            // leftover), ==0 means a fresh page (a full page of room).  The
+            // `< page_max_points` guard is defensive; that case can't occur.
             uint32_t page_remaining =
                 (initial_page_points > 0 &&
                  initial_page_points < page_max_points)
diff --git a/cpp/src/writer/value_chunk_writer.cc b/cpp/src/writer/value_chunk_writer.cc
index 182b0762b..88d0ea0f1 100644
--- a/cpp/src/writer/value_chunk_writer.cc
+++ b/cpp/src/writer/value_chunk_writer.cc
@@ -110,7 +110,11 @@ int ValueChunkWriter::seal_cur_page(bool end_chunk) {
                 /*stat*/ false, /*data*/ false);
             if (IS_SUCC(ret)) {
                 save_first_page_data(value_page_writer_);
-                // value_page_writer_.destroy_page_data();
+                // Intentionally no destroy_page_data() here:
+                // save_first_page_data() transferred ownership of the page
+                // buffers to first_page_data_, which is freed later via
+                // free_first_writer_data() once the deferred first page is
+                // written.  Destroying them here would double-free.
                 value_page_writer_.reset();
             }
         }
diff --git a/cpp/test/common/tsfile_common_test.cc b/cpp/test/common/tsfile_common_test.cc
index c451a8136..2108b2d02 100644
--- a/cpp/test/common/tsfile_common_test.cc
+++ b/cpp/test/common/tsfile_common_test.cc
@@ -484,12 +484,15 @@ TEST(DefaultCompressorTest, DefaultIsAllocatable) {
     Compressor* c = CompressorFactory::alloc_compressor(
         common::g_config_value_.default_compression_type_);
     ASSERT_NE(c, nullptr);
-#ifdef ENABLE_SNAPPY
-    EXPECT_EQ(common::g_config_value_.default_compression_type_,
-              common::CompressionType::SNAPPY);
-#elif defined(ENABLE_LZ4)
+    // Priority mirrors init_config_value(): LZ4 first (matches the Java
+    // reference default and the previous C++ default), then SNAPPY, then
+    // UNCOMPRESSED.
+#ifdef ENABLE_LZ4
     EXPECT_EQ(common::g_config_value_.default_compression_type_,
               common::CompressionType::LZ4);
+#elif defined(ENABLE_SNAPPY)
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::SNAPPY);
 #else
     EXPECT_EQ(common::g_config_value_.default_compression_type_,
               common::CompressionType::UNCOMPRESSED);
diff --git a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
index 9e3d9b562..74d97a022 100644
--- a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
+++ b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
@@ -763,7 +763,7 @@ TEST_F(TableQueryByRowTest, DenseSingleDeviceSsiLevelPushdown) {
 
 // Pushdown is faster than full query + manual next: queryByRow(offset, limit)
 // skips at device/SSI/Chunk level; old query then manual next decodes every
-// row. Timing tolerance 5% to allow measurement noise.
+// row. Timing tolerance 50% to allow cross-platform measurement noise.
 TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
     const int num_rows = 80000;
     const int offset = 30000;

From 6b0f95e89f9d77f85c9a023ba6d2fc38719eed96 Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Fri, 26 Jun 2026 08:09:45 +0800
Subject: [PATCH 5/9] format.

---
 cpp/src/encoding/plain_encoder.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/encoding/plain_encoder.h b/cpp/src/encoding/plain_encoder.h
index 1ed2fe12c..3db87e173 100644
--- a/cpp/src/encoding/plain_encoder.h
+++ b/cpp/src/encoding/plain_encoder.h
@@ -102,8 +102,8 @@ class PlainEncoder : public Encoder {
             const simde__m128i bswap64_shuf = simde_mm_set_epi8(
                 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
             for (; i + 2 <= batch; i += 2) {
-                simde__m128i v = simde_mm_loadu_si128(
-                    (const simde__m128i*)&src[i]);
+                simde__m128i v =
+                    simde_mm_loadu_si128((const simde__m128i*)&src[i]);
                 v = simde_mm_shuffle_epi8(v, bswap64_shuf);
                 simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;
@@ -149,8 +149,8 @@ class PlainEncoder : public Encoder {
             const simde__m128i bswap64_shuf = simde_mm_set_epi8(
                 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
             for (; i + 2 <= batch; i += 2) {
-                simde__m128i v = simde_mm_loadu_si128(
-                    (const simde__m128i*)&src[i]);
+                simde__m128i v =
+                    simde_mm_loadu_si128((const simde__m128i*)&src[i]);
                 v = simde_mm_shuffle_epi8(v, bswap64_shuf);
                 simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;
@@ -199,8 +199,8 @@ class PlainEncoder : public Encoder {
             const simde__m128i bswap32_shuf = simde_mm_set_epi8(
                 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
             for (; i + 4 <= batch; i += 4) {
-                simde__m128i v = simde_mm_loadu_si128(
-                    (const simde__m128i*)&src[i]);
+                simde__m128i v =
+                    simde_mm_loadu_si128((const simde__m128i*)&src[i]);
                 v = simde_mm_shuffle_epi8(v, bswap32_shuf);
                 simde_mm_storeu_si128((simde__m128i*)dst, v);
                 dst += 16;

From eed002022d808a44bf004279a40aaf1be4e1e7aa Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Mon, 29 Jun 2026 10:49:48 +0800
Subject: [PATCH 6/9] fix some ut err.

---
 cpp/src/cwrapper/tsfile_cwrapper.cc           | 20 +++++++++++++++----
 cpp/src/file/write_file.h                     |  6 ++++++
 cpp/src/reader/tsfile_series_scan_iterator.cc |  9 +++++++++
 .../tsfile_tree_query_by_row_test.cc          |  4 ++++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc
index c3e0c1116..4ec70e00b 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.cc
+++ b/cpp/src/cwrapper/tsfile_cwrapper.cc
@@ -124,15 +124,22 @@ TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema,
     if (err_code == nullptr) {
         return nullptr;
     }
-    if (file == nullptr || schema == nullptr ||
-        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+    if (file == nullptr || schema == nullptr || schema->table_name == nullptr) {
         *err_code = common::E_INVALID_ARG;
         return nullptr;
     }
+    // An empty schema (no columns) is an invalid *schema*, not an invalid arg;
+    // check it before the column_schemas pointer, which is legitimately null
+    // when column_num == 0.  (Matches develop, which the C API test expects;
+    // otherwise an uninitialized/null column_schemas would flip the code.)
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
     }
+    if (schema->column_schemas == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
 
     init_tsfile_config();
     std::vector<common::ColumnSchema> column_schemas;
@@ -172,15 +179,20 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file,
     if (err_code == nullptr) {
         return nullptr;
     }
-    if (file == nullptr || schema == nullptr ||
-        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+    if (file == nullptr || schema == nullptr || schema->table_name == nullptr) {
         *err_code = common::E_INVALID_ARG;
         return nullptr;
     }
+    // Empty schema is INVALID_SCHEMA; check before the (legitimately null when
+    // column_num == 0) column_schemas pointer.  See tsfile_writer_new().
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
     }
+    if (schema->column_schemas == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     init_tsfile_config();
     std::vector<common::ColumnSchema> column_schemas;
     std::set<std::string> column_names;
diff --git a/cpp/src/file/write_file.h b/cpp/src/file/write_file.h
index 9a5bce6e8..014fa1ae6 100644
--- a/cpp/src/file/write_file.h
+++ b/cpp/src/file/write_file.h
@@ -30,6 +30,12 @@ namespace storage {
 class WriteFile {
    public:
     WriteFile() : path_(), fd_(-1) {}
+    // Release the OS file handle on destruction.  Without this, a writer left
+    // in an unrecoverable state (whose close() refuses to finalize) would leak
+    // the fd — harmless on POSIX (unlink works on open files) but on Windows it
+    // keeps the file locked so a subsequent remove() fails.  close() is
+    // idempotent, so this is a no-op when the file was already closed normally.
+    ~WriteFile() { close(); }
     int create(const std::string& file_name, int flags, mode_t mode);
     bool file_opened() const { return fd_ > 0; }
     int write(const char* buf, uint32_t len);
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc
index d04d042a3..3439176d9 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.cc
+++ b/cpp/src/reader/tsfile_series_scan_iterator.cc
@@ -375,6 +375,11 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     ColumnSchema value_cd(ch.measurement_name_, ch.data_type_,
                           ch.compression_type_, ch.encoding_type_);
 
+    // Reset first: this is called once per get_next(), and TsBlock holds a
+    // pointer to tuple_desc_.  Without the reset, columns from previous calls
+    // accumulate (each new block would carry duplicated columns and a
+    // reallocated descriptor), corrupting the block layout.
+    tuple_desc_.reset();
     tuple_desc_.push_back(time_cd);
     tuple_desc_.push_back(value_cd);
 
@@ -395,6 +400,10 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock_multi() {
     // the encoding/compression are placeholders.
     ColumnSchema time_cd("time", common::INT64, common::SNAPPY,
                          common::TS_2DIFF);
+    // Reset first (see alloc_tsblock): tuple_desc_ is reused across get_next()
+    // calls and TsBlock holds a pointer to it, so stale columns must be
+    // cleared.
+    tuple_desc_.reset();
     tuple_desc_.push_back(time_cd);
 
     // Value columns
diff --git a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
index 9c47a9d4d..1aa1b4623 100644
--- a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
+++ b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
@@ -218,6 +218,10 @@ TEST_F(TreeQueryByRowTest, SparseAlignedChunkOffsetCrossesChunks) {
     using namespace storage;
     libtsfile_destroy();
     libtsfile_init();
+    // This test manages its own writer instead of the fixture's write_file_;
+    // close the fixture handle first so the remove() below succeeds on Windows
+    // (which can't delete a file that still has an open handle).
+    write_file_.close();
     remove(file_name_.c_str());
 
     // Tighten per-chunk capacity so two write_tablet_aligned calls produce

From 722d6396e1d734dcb8e875bfc7adf125d1f79ba3 Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Mon, 29 Jun 2026 12:00:40 +0800
Subject: [PATCH 7/9] fix heap-use-after-free in multi-value aligned reader
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MultiValueAlignedSkipsBatchPreservesValueAlignment and
MultiValueAlignedWideChunkParallelDecode constructed ColIterator locals
inside the read loop and called ssi->revert_tsblock() (which frees the
TsBlock and its column vectors) while those iterators were still in
scope. ~ColIterator() calls vec_->reset_offset(), writing back into the
just-freed vector at the closing brace of the loop body — a real
use-after-free that Linux Release+ASan flags and whose heap corruption
then cascaded into spurious SprintzCodec/CRelease failures later in the
same single-process run.

Scope the ColIterators in a nested block so they are destroyed before
revert_tsblock(). Verified on Linux x86_64 Release+ASan+UBSan: full
suite 701/701 pass, 0 ASan/UBSan reports.
---
 cpp/test/reader/tsfile_reader_test.cc | 81 +++++++++++++++------------
 1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/cpp/test/reader/tsfile_reader_test.cc b/cpp/test/reader/tsfile_reader_test.cc
index 5f50724c4..2e74e870f 100644
--- a/cpp/test/reader/tsfile_reader_test.cc
+++ b/cpp/test/reader/tsfile_reader_test.cc
@@ -762,25 +762,31 @@ TEST_F(TsFileReaderTest, MultiValueAlignedSkipsBatchPreservesValueAlignment) {
         if (ret == common::E_NO_MORE_DATA) break;
         ASSERT_EQ(ret, common::E_OK);
         ASSERT_NE(block, nullptr);
-        // Columns: time, v0, v1.
-        common::ColIterator t_iter(0, block);
-        common::ColIterator v0_iter(1, block);
-        common::ColIterator v1_iter(2, block);
-        const uint32_t rows = block->get_row_count();
-        for (uint32_t r = 0; r < rows; ++r) {
-            uint32_t len = 0;
-            int64_t t = *reinterpret_cast<int64_t*>(t_iter.read(&len));
-            int64_t v0 = *reinterpret_cast<int64_t*>(v0_iter.read(&len));
-            int64_t v1 = *reinterpret_cast<int64_t*>(v1_iter.read(&len));
-            got.push_back({t, v0});
-            // The decoder must have advanced exactly nonnull_count values
-            // when it skipped batch #1.  If it under-advanced (the latent
-            // bug), v1 would land on the wrong row's bytes here.
-            EXPECT_EQ(v1, 1000000 + t);
-            EXPECT_EQ(v0, t);
-            t_iter.next();
-            v0_iter.next();
-            v1_iter.next();
+        // Scope the ColIterators so they are destroyed *before*
+        // revert_tsblock() frees the block.  ~ColIterator() writes back to its
+        // vector (reset_offset()), so reverting while an iterator is still in
+        // scope would touch freed memory.
+        {
+            // Columns: time, v0, v1.
+            common::ColIterator t_iter(0, block);
+            common::ColIterator v0_iter(1, block);
+            common::ColIterator v1_iter(2, block);
+            const uint32_t rows = block->get_row_count();
+            for (uint32_t r = 0; r < rows; ++r) {
+                uint32_t len = 0;
+                int64_t t = *reinterpret_cast<int64_t*>(t_iter.read(&len));
+                int64_t v0 = *reinterpret_cast<int64_t*>(v0_iter.read(&len));
+                int64_t v1 = *reinterpret_cast<int64_t*>(v1_iter.read(&len));
+                got.push_back({t, v0});
+                // The decoder must have advanced exactly nonnull_count values
+                // when it skipped batch #1.  If it under-advanced (the latent
+                // bug), v1 would land on the wrong row's bytes here.
+                EXPECT_EQ(v1, 1000000 + t);
+                EXPECT_EQ(v0, t);
+                t_iter.next();
+                v0_iter.next();
+                v1_iter.next();
+            }
         }
         ssi->revert_tsblock();
     }
@@ -863,24 +869,29 @@ TEST_F(TsFileReaderTest, MultiValueAlignedWideChunkParallelDecode) {
         ASSERT_NE(block, nullptr);
         const uint32_t rows = block->get_row_count();
 
-        common::ColIterator t_iter(0, block);
-        std::vector<int64_t> times;
-        times.reserve(rows);
-        for (uint32_t r = 0; r < rows; ++r) {
-            uint32_t len = 0;
-            times.push_back(*reinterpret_cast<int64_t*>(t_iter.read(&len)));
-            t_iter.next();
-        }
-        // One independent iterator per value column so we never rely on
-        // vector<ColIterator> being movable.
-        for (uint32_t c = 0; c < kCols; ++c) {
-            common::ColIterator it(c + 1, block);
+        // Scope all ColIterators so they are destroyed *before*
+        // revert_tsblock() frees the block — ~ColIterator() writes back to its
+        // vector (reset_offset()), which would be use-after-free otherwise.
+        {
+            common::ColIterator t_iter(0, block);
+            std::vector<int64_t> times;
+            times.reserve(rows);
             for (uint32_t r = 0; r < rows; ++r) {
                 uint32_t len = 0;
-                int64_t v = *reinterpret_cast<int64_t*>(it.read(&len));
-                int64_t i = times[r] - 1000;  // timestamp == 1000 + i
-                EXPECT_EQ(v, static_cast<int64_t>(c) * 1000000 + i);
-                it.next();
+                times.push_back(*reinterpret_cast<int64_t*>(t_iter.read(&len)));
+                t_iter.next();
+            }
+            // One independent iterator per value column so we never rely on
+            // vector<ColIterator> being movable.
+            for (uint32_t c = 0; c < kCols; ++c) {
+                common::ColIterator it(c + 1, block);
+                for (uint32_t r = 0; r < rows; ++r) {
+                    uint32_t len = 0;
+                    int64_t v = *reinterpret_cast<int64_t*>(it.read(&len));
+                    int64_t i = times[r] - 1000;  // timestamp == 1000 + i
+                    EXPECT_EQ(v, static_cast<int64_t>(c) * 1000000 + i);
+                    it.next();
+                }
             }
         }
         collected += static_cast<int>(rows);

From d8b8eff68da5b77d2310339fc8c5dbc2c42eee9c Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Wed, 1 Jul 2026 08:41:19 +0800
Subject: [PATCH 8/9] fix device-node cache aliasing null tag with literal
 string "null"

TsFileIOReader::get_cached_device_node keyed device_node_cache_ by
IDeviceID::get_device_name(), which renders a null tag segment as the
literal text "null".  A device with a real null tag (e.g. tags
(null, b, c)) and a device whose tag value is the string "null"
(("null", b, c)) therefore produce the identical name "a.null.b.c" and
collide in the cache: whichever device is queried first populates the
entry, and every later query for the other device on the same reused
reader gets the first device's cached MetaIndexNode and silently reads
its chunks.

This surfaced through the Python dataset API, where one long-lived
TsFileReader answers many per-device queries: the pytest
test_dataset_null_tag_positions_and_string_null_are_distinct read
(null,b,c)'s data for the ("null",b,c) device.  The device metadata
binary search (DeviceIDComparable, segment-based) was always correct;
only the string cache key was lossy.

Key the cache by a collision-free, length-prefixed encoding of the
segment vector that flags null segments explicitly, so a null tag can
never alias the string "null".  Add a device_id unit test pinning the
invariant (names collide, segment equality distinguishes them).

Verified: C++ suite 707/707 (Release+ASan+UBSan), Python suite 150/150.
---
 cpp/src/file/tsfile_io_reader.cc  | 21 ++++++++++++++++++++-
 cpp/src/file/tsfile_io_reader.h   |  9 +++++++++
 cpp/test/common/device_id_test.cc | 28 ++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/cpp/src/file/tsfile_io_reader.cc b/cpp/src/file/tsfile_io_reader.cc
index f9494a74d..1edbeec57 100644
--- a/cpp/src/file/tsfile_io_reader.cc
+++ b/cpp/src/file/tsfile_io_reader.cc
@@ -384,10 +384,29 @@ int TsFileIOReader::load_tsfile_meta() {
     return ret;
 }
 
+std::string TsFileIOReader::device_node_cache_key(
+    const std::shared_ptr<IDeviceID>& device_id) {
+    // Length-prefixed, null-flagged encoding: for each segment emit either
+    // "N;" (null) or "<len>:<bytes>;".  Distinct segment sequences always map
+    // to distinct keys, so a real null tag never aliases the literal "null".
+    std::string key;
+    for (const std::string* seg : device_id->get_segments()) {
+        if (seg == nullptr) {
+            key += "N;";
+        } else {
+            key += std::to_string(seg->size());
+            key += ':';
+            key += *seg;
+            key += ';';
+        }
+    }
+    return key;
+}
+
 int TsFileIOReader::get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
                                            common::PageArena& pa,
                                            CachedDeviceNode& out) {
-    std::string dev_name = device_id->get_device_name();
+    std::string dev_name = device_node_cache_key(device_id);
 
     {
         std::lock_guard<std::mutex> lk(device_node_cache_mu_);
diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h
index 0073603fb..39f015f44 100644
--- a/cpp/src/file/tsfile_io_reader.h
+++ b/cpp/src/file/tsfile_io_reader.h
@@ -195,6 +195,15 @@ class TsFileIOReader {
                                common::PageArena& pa, CachedDeviceNode& out);
 
    private:
+    // Build a collision-free key for device_node_cache_.  get_device_name()
+    // renders a null tag segment as the literal "null", so a device with a
+    // real null tag and one whose tag value is the string "null" produce the
+    // same name and would alias in the cache — the second device would read
+    // the first device's chunks.  Encode each segment length-prefixed and
+    // flag null segments explicitly so the two can never collide.
+    static std::string device_node_cache_key(
+        const std::shared_ptr<IDeviceID>& device_id);
+
     ReadFile* read_file_;
     common::PageArena tsfile_meta_page_arena_;
     TsFileMeta tsfile_meta_;
diff --git a/cpp/test/common/device_id_test.cc b/cpp/test/common/device_id_test.cc
index f3877c278..9d97607ab 100644
--- a/cpp/test/common/device_id_test.cc
+++ b/cpp/test/common/device_id_test.cc
@@ -71,4 +71,32 @@ TEST(DeviceIdTest, TabletDeviceId) {
     ASSERT_EQ("test_device0.null.t2.t3",
               tablet.get_device_id(2)->get_device_name());
 }
+
+// Regression: a device whose first tag is a real null and a device whose first
+// tag is the literal string "null" render to the SAME get_device_name()
+// ("t.null.b"), so anything that keys a per-device map/cache by the device name
+// aliases the two — the second device silently reads the first device's chunks.
+// The device-node cache in TsFileIOReader hit exactly this, conflating the two
+// devices' data on a reused reader.  The reliable discriminator is the segment
+// vector (operator==), which keeps nullptr distinct from the string "null".
+TEST(DeviceIdTest, NullTagVsLiteralNullAreDistinct) {
+    // Real null first tag: segment pointer is nullptr.
+    std::vector<std::string*> null_first_segs{new std::string("t"), nullptr,
+                                              new std::string("b")};
+    StringArrayDeviceID null_first(null_first_segs);
+    for (auto* s : null_first_segs) delete s;
+
+    // Literal string "null" as the first tag value.
+    StringArrayDeviceID literal_null(
+        std::vector<std::string>({"t", "null", "b"}));
+
+    // The names collide — this is the trap the cache used to fall into.
+    ASSERT_EQ(null_first.get_device_name(), literal_null.get_device_name());
+    ASSERT_EQ("t.null.b", null_first.get_device_name());
+
+    // But the devices are genuinely different, and the segment-based equality
+    // used by DeviceIDComparable / the cache key must reflect that.
+    ASSERT_FALSE(null_first == literal_null);
+    ASSERT_TRUE(null_first != literal_null);
+}
 }  // namespace storage

From b35a771494c3628de1dac1dbcaad347adec0dade Mon Sep 17 00:00:00 2001
From: ColinLee <shuolin_l@163.com>
Date: Wed, 1 Jul 2026 23:08:31 +0800
Subject: [PATCH 9/9] build: don't use -march=native by default; gate it behind
 an option

Release builds unconditionally added -march=native on Linux/macOS. Two
problems for artifacts that leave the build host:

1. Portability: a wheel/binary built on a newer CPU can fault with an
   illegal instruction on an older target machine.
2. On virtualized macOS arm64 CI runners, -march=native mis-detects the
   feature set and drops +crc, while snappy's CRC32 feature probe (run
   without -march) still reports crc available and defines
   SNAPPY_HAVE_NEON_CRC32=1.  snappy.cc then calls the always_inline
   __crc32cw intrinsic in a TU compiled without crc support:
     error: always_inline function '__crc32cw' requires target feature
     'crc', but would be inlined into a function compiled without 'crc'
   The default (portable) target for macOS arm64 keeps +crc, so snappy's
   fast path stays available while the binary stays portable.

Default Release flags are now -O3 -flto (LTO stays off on MinGW/Windows
as before). -march=native is opt-in via -DTSFILE_ENABLE_NATIVE_ARCH=ON
for local-only builds that never ship. Applies uniformly to CI, wheels,
and manual local builds.

Verified: non-ASan Release (CI config) builds snappy cleanly and passes
705/705; -DTSFILE_ENABLE_NATIVE_ARCH=ON restores -march=native.
---
 cpp/CMakeLists.txt | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 07b4f6fc5..e6273dd66 100755
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -105,6 +105,15 @@ else ()
 endif ()
 
 message("CMAKE BUILD TYPE " ${CMAKE_BUILD_TYPE})
+# Tune Release builds for the build-host CPU (-march=native). OFF by default:
+# native is non-portable (a binary built on a newer CPU can fault with an
+# illegal instruction on an older one) and must not leak into CI artifacts or
+# shipped wheels. On virtualized CI hosts native can even mis-detect the
+# feature set (e.g. drop +crc), which breaks third_party/snappy's crc32 path.
+# Turn ON only for local-only builds where the binary never leaves the machine.
+option(TSFILE_ENABLE_NATIVE_ARCH
+        "Tune Release builds for the build host CPU via -march=native (local-only, non-portable)"
+        OFF)
 # Keep optimization policy external by default (caller/toolchain/CMake defaults).
 set(TSFILE_OPTIMIZATION_FLAGS ""
         CACHE STRING
@@ -130,14 +139,18 @@ else ()
             set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g")
         elseif (CMAKE_BUILD_TYPE STREQUAL "Release")
             # -flto + MinGW gcc + statically-linked antlr4_static produces
-            # unresolved-reference errors at link time (LTO intermediate objects
-            # can't see the .a's vtable thunks). -march=native is also a poor
-            # default for CI binaries shipped to other machines. Keep both on
-            # Linux/macOS where the optimization actually pays off.
+            # unresolved-reference errors at link time (LTO intermediate
+            # objects can't see the .a's vtable thunks), so LTO is Linux/macOS
+            # only. -march=native is portable poison for shipped/CI binaries
+            # (see TSFILE_ENABLE_NATIVE_ARCH above): the default target for the
+            # deployment platform is the portable baseline, and on macOS arm64
+            # it still includes +crc so snappy's fast path stays available.
             if (MINGW OR WIN32)
                 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-            else ()
+            elseif (TSFILE_ENABLE_NATIVE_ARCH)
                 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto")
+            else ()
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -flto")
             endif ()
         elseif (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
             set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O2 -g")