diff --git a/.gitignore b/.gitignore index db6c0826ca..c060b2d8b2 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,14 @@ resources/*.xml *.o .vscode cpp/pixels-retina/third_party/ + +# AI tools +.codex +.claude/ +.cursor/ +.continue/ +.aider* +.ai/ +.notes/ +CLAUDE.local.md +AGENTS.md.local diff --git a/cpp/pixels-retina/include/RGVisibility.h b/cpp/pixels-retina/include/RGVisibility.h index 144cb4833a..88eda0c775 100644 --- a/cpp/pixels-retina/include/RGVisibility.h +++ b/cpp/pixels-retina/include/RGVisibility.h @@ -31,7 +31,8 @@ class RGVisibility : public pixels::RetinaBase> { const std::vector* initialBitmap = nullptr); ~RGVisibility() override; - void deleteRGRecord(uint32_t rowId, uint64_t timestamp); + void deleteRGRecord(uint32_t rowId, uint64_t timestamp, + ReplayMode replayMode = ReplayMode::NORMAL); uint64_t* getRGVisibilityBitmap(uint64_t timestamp); std::vector collectRGGarbage(uint64_t timestamp); diff --git a/cpp/pixels-retina/include/RGVisibilityJni.h b/cpp/pixels-retina/include/RGVisibilityJni.h index c8bb1fc3a5..79e82e16b6 100644 --- a/cpp/pixels-retina/include/RGVisibilityJni.h +++ b/cpp/pixels-retina/include/RGVisibilityJni.h @@ -26,10 +26,10 @@ JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_destroyNative /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: deleteRecord - * Signature: (IJJ)V + * Signature: (IJJI)V */ JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_deleteRecord - (JNIEnv *, jobject, jint, jlong, jlong); + (JNIEnv *, jobject, jint, jlong, jlong, jint); /* * Class: io_pixelsdb_pixels_retina_RGVisibility diff --git a/cpp/pixels-retina/include/TileVisibility.h b/cpp/pixels-retina/include/TileVisibility.h index ef9bd59143..fae7665dee 100644 --- a/cpp/pixels-retina/include/TileVisibility.h +++ b/cpp/pixels-retina/include/TileVisibility.h @@ -48,6 +48,22 @@ inline uint64_t extractTimestamp(uint64_t raw) { return (raw & 0x0000FFFFFFFFFFFFULL); } +/** + * Controls how DELETE replay 
interacts with the compacted base bitmap. + * + * NORMAL is the live append path: the caller provides a current delete + * timestamp and the record is appended to the chain. VERSIONED is used when + * replay may race with READY readers; historical deletes publish a new + * VersionedData with a folded baseBitmap. EXCLUSIVE is used only while recovery + * blocks readers and GC; historical deletes may update baseBitmap in place, but + * concurrent recovery writers still need tile-level synchronization. + */ +enum class ReplayMode : uint8_t { + NORMAL = 0, + VERSIONED = 1, + EXCLUSIVE = 2 +}; + struct DeleteIndexBlock : public pixels::RetinaBase { static constexpr size_t BLOCK_CAPACITY = 8; uint64_t items[BLOCK_CAPACITY] = {0}; @@ -96,7 +112,7 @@ class TileVisibility : public pixels::RetinaBase> { // timestamp defaults to 0; bitmap defaults to all-zeros. explicit TileVisibility(uint64_t timestamp = 0, const uint64_t* bitmap = nullptr); ~TileVisibility() override; - void deleteTileRecord(uint16_t rowId, uint64_t ts); + void deleteTileRecord(uint16_t rowId, uint64_t ts, ReplayMode replayMode = ReplayMode::NORMAL); void getTileVisibilityBitmap(uint64_t ts, uint64_t* outBitmap) const; void collectTileGarbage(uint64_t ts, uint64_t* gcSnapshotBitmap); void exportChainItemsAfter(uint32_t tileId, uint64_t safeGcTs, @@ -109,6 +125,14 @@ class TileVisibility : public pixels::RetinaBase> { void reclaimRetiredVersions(); + void appendDeleteChain(uint16_t rowId, uint64_t ts); + + // VERSIONED: replay with possible readers; historical deletes use COW fold. + void deleteTileRecordVersioned(uint16_t rowId, uint64_t ts); + + // EXCLUSIVE: recovery replay without readers; historical deletes fold in place. 
+ void deleteTileRecordExclusive(uint16_t rowId, uint64_t ts); + std::atomic*> currentVersion; std::atomic tail; std::atomic tailUsed; diff --git a/cpp/pixels-retina/lib/RGVisibility.cpp b/cpp/pixels-retina/lib/RGVisibility.cpp index d1609535f0..289de9e0d3 100644 --- a/cpp/pixels-retina/lib/RGVisibility.cpp +++ b/cpp/pixels-retina/lib/RGVisibility.cpp @@ -70,9 +70,10 @@ TileVisibility* RGVisibility::getTileVisibility(uint32_t row } template -void RGVisibility::deleteRGRecord(uint32_t rowId, uint64_t timestamp) { +void RGVisibility::deleteRGRecord(uint32_t rowId, uint64_t timestamp, + ReplayMode replayMode) { TileVisibility* tileVisibility = getTileVisibility(rowId); - tileVisibility->deleteTileRecord(rowId % VISIBILITY_RECORD_CAPACITY, timestamp); + tileVisibility->deleteTileRecord(rowId % VISIBILITY_RECORD_CAPACITY, timestamp, replayMode); } template diff --git a/cpp/pixels-retina/lib/RGVisibilityJni.cpp b/cpp/pixels-retina/lib/RGVisibilityJni.cpp index fdcbeaa328..b6293366ca 100644 --- a/cpp/pixels-retina/lib/RGVisibilityJni.cpp +++ b/cpp/pixels-retina/lib/RGVisibilityJni.cpp @@ -23,6 +23,17 @@ #include "RGVisibility.h" #include +namespace { +ReplayMode toReplayMode(jint mode) { + switch (mode) { + case 0: return ReplayMode::NORMAL; + case 1: return ReplayMode::VERSIONED; + case 2: return ReplayMode::EXCLUSIVE; + default: throw std::invalid_argument("unknown ReplayMode"); + } +} +} + /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: createNativeObject @@ -72,13 +83,13 @@ JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_destroyNative /* * Class: io_pixelsdb_pixels_retina_RGVisibility * Method: deleteRecord - * Signature: (JJJ)V + * Signature: (IJJI)V */ JNIEXPORT void JNICALL Java_io_pixelsdb_pixels_retina_RGVisibility_deleteRecord - (JNIEnv* env, jobject, jint rowId, jlong timestamp, jlong handle) { + (JNIEnv* env, jobject, jint rowId, jlong timestamp, jlong handle, jint replayMode) { try { auto* rgVisibility = 
reinterpret_cast(handle); - rgVisibility->deleteRGRecord(rowId, timestamp); + rgVisibility->deleteRGRecord(rowId, timestamp, toReplayMode(replayMode)); } catch (const std::exception& e) { env->ThrowNew(env->FindClass("java/lang/RuntimeException"), e.what()); } diff --git a/cpp/pixels-retina/lib/TileVisibility.cpp b/cpp/pixels-retina/lib/TileVisibility.cpp index f4fcdcb429..49710c71b5 100644 --- a/cpp/pixels-retina/lib/TileVisibility.cpp +++ b/cpp/pixels-retina/lib/TileVisibility.cpp @@ -68,7 +68,71 @@ TileVisibility::~TileVisibility() { } template -void TileVisibility::deleteTileRecord(uint16_t rowId, uint64_t ts) { +void TileVisibility::deleteTileRecord(uint16_t rowId, uint64_t ts, + ReplayMode replayMode) { + switch (replayMode) { + case ReplayMode::NORMAL: + appendDeleteChain(rowId, ts); + return; + case ReplayMode::VERSIONED: + deleteTileRecordVersioned(rowId, ts); + return; + case ReplayMode::EXCLUSIVE: + deleteTileRecordExclusive(rowId, ts); + return; + default: + throw std::invalid_argument("unknown ReplayMode"); + } +} + +template +void TileVisibility::deleteTileRecordVersioned(uint16_t rowId, uint64_t ts) { + // READY backlog replay can race with getTileVisibilityBitmap readers. Fold + // historical deletes by publishing a new VersionedData instead of mutating + // baseBitmap observed by an existing reader. + // Keep ts=0 out of this path because item=0 is the chain-slot sentinel. 
+ while (ts > 0) { + VersionedData* cur = currentVersion.load(std::memory_order_acquire); + if (ts > cur->baseTimestamp) { + break; + } + if ((cur->baseBitmap[rowId / 64] & (1ULL << (rowId % 64))) != 0) { + return; + } + uint64_t newBaseBitmap[NUM_WORDS]; + std::memcpy(newBaseBitmap, cur->baseBitmap, NUM_WORDS * sizeof(uint64_t)); + SET_BITMAP_BIT(newBaseBitmap, rowId); + VersionedData* newVer = + new VersionedData(cur->baseTimestamp, newBaseBitmap, cur->head); + if (currentVersion.compare_exchange_strong(cur, newVer, std::memory_order_acq_rel)) { + pendingRetire.store(cur, std::memory_order_release); + return; + } + delete newVer; + } + + appendDeleteChain(rowId, ts); +} + +template +void TileVisibility::deleteTileRecordExclusive(uint16_t rowId, uint64_t ts) { + // RECOVERING replay blocks readers and GC, so historical deletes can fold + // into baseBitmap in place. Atomic OR prevents lost updates when concurrent + // recovery writers touch the same bitmap word. + VersionedData* cur = currentVersion.load(std::memory_order_acquire); + if (ts > 0 && ts <= cur->baseTimestamp) { + uint64_t mask = 1ULL << (rowId % 64); + __atomic_fetch_or(&cur->baseBitmap[rowId / 64], mask, __ATOMIC_RELAXED); + return; + } + + appendDeleteChain(rowId, ts); +} + +template +void TileVisibility::appendDeleteChain(uint16_t rowId, uint64_t ts) { + // Normal live apply assumes a current timestamp and records the delete in + // the append-only chain, leaving baseBitmap untouched for the hot path. 
uint64_t item = makeDeleteIndex(rowId, ts); while (true) { DeleteIndexBlock *curTail = tail.load(std::memory_order_acquire); diff --git a/cpp/pixels-retina/test/RGVisibilityTest.cpp b/cpp/pixels-retina/test/RGVisibilityTest.cpp index 8d8b135eee..145a9918f3 100644 --- a/cpp/pixels-retina/test/RGVisibilityTest.cpp +++ b/cpp/pixels-retina/test/RGVisibilityTest.cpp @@ -49,6 +49,50 @@ class RGVisibilityTest : public ::testing::Test { RGVisibilityInstance* rgVisibility; }; +static bool rgBitSet(const uint64_t* bitmap, uint32_t rowId) { + return ((bitmap[rowId / 64] >> (rowId % 64)) & 1ULL) != 0; +} + +static void runConcurrentRGDeletes(RGVisibilityInstance* visibility, + ReplayMode mode, + uint64_t ts, + int rowCount = 64, + int threadCount = 8) { + ASSERT_EQ(rowCount % threadCount, 0); + std::atomic start{false}; + std::vector threads; + int rowsPerThread = rowCount / threadCount; + + for (int t = 0; t < threadCount; t++) { + threads.emplace_back([&, t]() { + while (!start.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + for (int i = 0; i < rowsPerThread; i++) { + uint32_t rowId = static_cast(t * rowsPerThread + i); + visibility->deleteRGRecord(rowId, ts, mode); + } + }); + } + + start.store(true, std::memory_order_release); + for (auto& thread : threads) { + thread.join(); + } +} + +static void expectRGRows(RGVisibilityInstance* visibility, + uint64_t queryTs, + int rowCount, + bool expectedSet) { + uint64_t* bitmap = visibility->getRGVisibilityBitmap(queryTs); + for (int row = 0; row < rowCount; row++) { + EXPECT_EQ(expectedSet, rgBitSet(bitmap, static_cast(row))) + << "row=" << row << " queryTs=" << queryTs; + } + delete[] bitmap; +} + TEST_F(RGVisibilityTest, BasicDeleteAndVisibility) { uint64_t timestamp1 = 100; uint64_t timestamp2 = 200; @@ -67,6 +111,34 @@ TEST_F(RGVisibilityTest, BasicDeleteAndVisibility) { delete[] bitmap2; } +TEST_F(RGVisibilityTest, ConcurrentNormalModeAppendsDeleteChain) { + constexpr uint64_t baseTs = 100; + 
RGVisibilityInstance visibility(ROW_COUNT, baseTs, nullptr); + + runConcurrentRGDeletes(&visibility, ReplayMode::NORMAL, baseTs + 1); + + expectRGRows(&visibility, baseTs, 64, false); + expectRGRows(&visibility, baseTs + 1, 64, true); +} + +TEST_F(RGVisibilityTest, ConcurrentVersionedModeFoldsWithCow) { + constexpr uint64_t baseTs = 100; + RGVisibilityInstance visibility(ROW_COUNT, baseTs, nullptr); + + runConcurrentRGDeletes(&visibility, ReplayMode::VERSIONED, baseTs - 1); + + expectRGRows(&visibility, baseTs, 64, true); +} + +TEST_F(RGVisibilityTest, ConcurrentExclusiveModeFoldsWithAtomicOr) { + constexpr uint64_t baseTs = 100; + RGVisibilityInstance visibility(ROW_COUNT, baseTs, nullptr); + + runConcurrentRGDeletes(&visibility, ReplayMode::EXCLUSIVE, baseTs - 1); + + expectRGRows(&visibility, baseTs, 64, true); +} + TEST_F(RGVisibilityTest, MultiThread) { struct DeleteRecord { uint64_t timestamp; diff --git a/cpp/pixels-retina/test/TileVisibilityTest.cpp b/cpp/pixels-retina/test/TileVisibilityTest.cpp index 0a84b806f9..7994f62e4d 100644 --- a/cpp/pixels-retina/test/TileVisibilityTest.cpp +++ b/cpp/pixels-retina/test/TileVisibilityTest.cpp @@ -695,3 +695,162 @@ TEST_F(TileVisibilityTest, ImportDeletionItems_EmptyChainTailClaim) { v->getTileVisibilityBitmap(500, actualBitmap); EXPECT_TRUE(checkBitmap(actualBitmap, expectedBitmap)); } + +// ========================================================================= +// COW fold of `ts <= baseTimestamp` deletes into baseBitmap. +// Three ts relations plus duplicate replay. +// ========================================================================= + +class TileVisibilityCowFoldTest : public ::testing::Test { +protected: + static constexpr uint64_t kBaseTimestamp = 100; + TileVisibility* v; + + void SetUp() override { + // Start with a non-zero baseTimestamp so the fold guard is exercised. 
+ v = new TileVisibility(kBaseTimestamp, nullptr); + } + + void TearDown() override { + delete v; + } + + bool bitSet(const uint64_t* bitmap, uint16_t rowId) { + return ((bitmap[rowId / 64] >> (rowId % 64)) & 1ULL) != 0; + } + + void runConcurrentDeletes(ReplayMode mode, uint64_t ts, int rowCount = 64, int threadCount = 8) { + ASSERT_EQ(rowCount % threadCount, 0); + std::atomic start{false}; + std::vector threads; + int rowsPerThread = rowCount / threadCount; + + for (int t = 0; t < threadCount; t++) { + threads.emplace_back([&, t]() { + while (!start.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + for (int i = 0; i < rowsPerThread; i++) { + uint16_t rowId = static_cast(t * rowsPerThread + i); + v->deleteTileRecord(rowId, ts, mode); + } + }); + } + + start.store(true, std::memory_order_release); + for (auto& thread : threads) { + thread.join(); + } + } + + void expectRows(uint64_t queryTs, int rowCount, bool expectedSet) { + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(queryTs, bitmap); + for (int row = 0; row < rowCount; row++) { + EXPECT_EQ(expectedSet, bitSet(bitmap, static_cast(row))) + << "row=" << row << " queryTs=" << queryTs; + } + } +}; + +TEST_F(TileVisibilityCowFoldTest, FoldsWhenTsLessThanBaseTimestamp) { + // ts < baseTimestamp: row must be folded into baseBitmap and visible at any + // snap_ts >= baseTimestamp. + v->deleteTileRecord(7, kBaseTimestamp - 50, ReplayMode::VERSIONED); + + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp, bitmap); + EXPECT_TRUE(bitSet(bitmap, 7)); + + // Even at a much later snap_ts the row should still be visible-as-deleted. 
+ uint64_t bitmap2[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp + 1000, bitmap2); + EXPECT_TRUE(bitSet(bitmap2, 7)); +} + +TEST_F(TileVisibilityCowFoldTest, FoldsWhenTsEqualsBaseTimestamp) { + v->deleteTileRecord(9, kBaseTimestamp, ReplayMode::VERSIONED); + + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp, bitmap); + EXPECT_TRUE(bitSet(bitmap, 9)); +} + +TEST_F(TileVisibilityCowFoldTest, NormalModeDoesNotFoldHistoricalTimestamp) { + v->deleteTileRecord(10, kBaseTimestamp - 1, ReplayMode::NORMAL); + + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp, bitmap); + EXPECT_FALSE(bitSet(bitmap, 10)); +} + +TEST_F(TileVisibilityCowFoldTest, ExclusiveModeFoldsHistoricalTimestamp) { + v->deleteTileRecord(12, kBaseTimestamp - 1, ReplayMode::EXCLUSIVE); + + uint64_t bitmap[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp, bitmap); + EXPECT_TRUE(bitSet(bitmap, 12)); +} + +TEST_F(TileVisibilityCowFoldTest, ConcurrentNormalModeAppendsDeleteChain) { + runConcurrentDeletes(ReplayMode::NORMAL, kBaseTimestamp + 1); + + expectRows(kBaseTimestamp, 64, false); + expectRows(kBaseTimestamp + 1, 64, true); +} + +TEST_F(TileVisibilityCowFoldTest, ConcurrentVersionedModeFoldsWithCow) { + runConcurrentDeletes(ReplayMode::VERSIONED, kBaseTimestamp - 1); + + expectRows(kBaseTimestamp, 64, true); +} + +TEST_F(TileVisibilityCowFoldTest, ConcurrentExclusiveModeFoldsWithAtomicOr) { + runConcurrentDeletes(ReplayMode::EXCLUSIVE, kBaseTimestamp - 1); + + expectRows(kBaseTimestamp, 64, true); +} + +TEST_F(TileVisibilityCowFoldTest, AppendsToChainWhenTsGreaterThanBaseTimestamp) { + // ts > baseTimestamp: should take the append-to-chain path. The row must be + // invisible at snap_ts < ts and visible at snap_ts >= ts. 
+ v->deleteTileRecord(11, kBaseTimestamp + 50, ReplayMode::VERSIONED); + + uint64_t before[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp + 49, before); + EXPECT_FALSE(bitSet(before, 11)); + + uint64_t after[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp + 50, after); + EXPECT_TRUE(bitSet(after, 11)); +} + +TEST_F(TileVisibilityCowFoldTest, DuplicateFoldOnAlreadyDeletedRowIsIdempotent) { + // A replayed historical DELETE for a row already folded into baseBitmap should + // remain a no-op semantically. This guards the fast path that returns before + // cloning another VersionedData when the base bit is already set. + v->deleteTileRecord(13, kBaseTimestamp - 10, ReplayMode::VERSIONED); + for (int i = 0; i < 32; i++) { + v->deleteTileRecord(13, kBaseTimestamp - 20, ReplayMode::VERSIONED); + } + + uint64_t atBase[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp, atBase); + EXPECT_TRUE(bitSet(atBase, 13)); + EXPECT_FALSE(bitSet(atBase, 14)); + + // The duplicate fold must not corrupt the append-to-chain path or later GC. 
+ v->deleteTileRecord(14, kBaseTimestamp + 5, ReplayMode::VERSIONED); + uint64_t beforeAppendTs[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp + 4, beforeAppendTs); + EXPECT_TRUE(bitSet(beforeAppendTs, 13)); + EXPECT_FALSE(bitSet(beforeAppendTs, 14)); + + uint64_t gcBitmap[BITMAP_SIZE] = {0}; + v->collectTileGarbage(kBaseTimestamp + 5, gcBitmap); + + uint64_t afterGc[BITMAP_SIZE] = {0}; + v->getTileVisibilityBitmap(kBaseTimestamp + 5, afterGc); + EXPECT_TRUE(bitSet(afterGc, 13)); + EXPECT_TRUE(bitSet(afterGc, 14)); +} diff --git a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/CompactExecutor.java b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/CompactExecutor.java index ec8c0501c0..b2a6d20281 100644 --- a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/CompactExecutor.java +++ b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/CompactExecutor.java @@ -21,6 +21,7 @@ import com.google.common.base.Joiner; import io.pixelsdb.pixels.cli.Main; +import io.pixelsdb.pixels.common.exception.MetadataException; import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.metadata.MetadataService; import io.pixelsdb.pixels.common.metadata.domain.Compact; @@ -261,7 +262,10 @@ public void execute(Namespace ns, String command) throws Exception // Issue #192: wait for the compaction to complete. 
compactExecutor.shutdown(); while (!compactExecutor.awaitTermination(100, TimeUnit.SECONDS)); - metadataService.addFiles(compactFiles); + if (!metadataService.addFiles(compactFiles)) + { + throw new MetadataException("failed to add compact files to metadata"); + } if (retinaService.isEnabled()) { diff --git a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/ImportExecutor.java b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/ImportExecutor.java index 140ded28c6..c2c7b8c3b7 100644 --- a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/ImportExecutor.java +++ b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/ImportExecutor.java @@ -20,6 +20,7 @@ package io.pixelsdb.pixels.cli.executor; import com.google.common.collect.ImmutableList; +import io.pixelsdb.pixels.common.exception.MetadataException; import io.pixelsdb.pixels.common.metadata.MetadataService; import io.pixelsdb.pixels.common.metadata.domain.File; import io.pixelsdb.pixels.common.metadata.domain.Layout; @@ -67,7 +68,10 @@ public void execute(Namespace ns, String command) throws Exception try { List importFiles = getImportFiles(ordered, writableLayout); - metadataService.addFiles(importFiles); + if (!metadataService.addFiles(importFiles)) + { + throw new MetadataException("failed to import pixels files into metadata"); + } System.out.println(command + " is successful"); } catch (Exception e) diff --git a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/LoadExecutor.java b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/LoadExecutor.java index 765f031a39..fde71d3da1 100644 --- a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/LoadExecutor.java +++ b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/executor/LoadExecutor.java @@ -93,7 +93,10 @@ public void execute(Namespace ns, String command) throws Exception { File file = loadedInfo.loadedFile; Path path = loadedInfo.loadedPath; - metadataService.updateFile(file); + if 
(!metadataService.updateFile(file)) + { + throw new MetadataException("failed to publish loaded file " + file.getName()); + } try { diff --git a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/load/AbstractPixelsConsumer.java b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/load/AbstractPixelsConsumer.java index cb1d3c32f5..373ca3b83c 100644 --- a/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/load/AbstractPixelsConsumer.java +++ b/pixels-cli/src/main/java/io/pixelsdb/pixels/cli/load/AbstractPixelsConsumer.java @@ -163,11 +163,14 @@ private void cleanupTemporaryFiles() { for (File tmpFile : tmpFiles) { - if (tmpFile.getType() == File.Type.TEMPORARY) + if (tmpFile.getType() == File.Type.TEMPORARY_INGEST) { try { - metadataService.deleteFiles(Collections.singletonList((tmpFile.getId()))); + if (!metadataService.deleteFiles(Collections.singletonList((tmpFile.getId())))) + { + throw new MetadataException("failed to delete temporary load file " + tmpFile.getId()); + } } catch (MetadataException e) { e.printStackTrace(); @@ -207,11 +210,14 @@ protected File openTmpFile(String fileName, Path filePath) throws MetadataExcept { File file = new File(); file.setName(fileName); - file.setType(File.Type.TEMPORARY); + file.setType(File.Type.TEMPORARY_INGEST); file.setNumRowGroup(1); file.setPathId(filePath.getId()); String tmpFilePath = filePath.getUri() + "/" + fileName; - this.metadataService.addFiles(Collections.singletonList(file)); + if (!this.metadataService.addFiles(Collections.singletonList(file))) + { + throw new MetadataException("failed to add temporary load file " + tmpFilePath); + } file.setId(metadataService.getFileId(tmpFilePath)); return file; } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java index 2bcd676994..8334b2a3f4 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java +++ 
b/pixels-common/src/main/java/io/pixelsdb/pixels/common/error/ErrorCode.java @@ -163,4 +163,11 @@ public class ErrorCode public static final int NODE_RETINA_INFO_FAIL = ERROR_NODE_SERVER + 1; public static final int NODE_NO_AVAILABLE = ERROR_NODE_SERVER + 2; public static final int NODE_INVALID_BUCKET = ERROR_NODE_SERVER + 3; + + // error code for retina lifecycle/recovery + private static final int ERROR_RETINA_SERVER = ERROR_BASE + 800; + public static final int RETINA_NOT_READY = ERROR_RETINA_SERVER + 1; + public static final int RETINA_MARK_READY_FAILED = ERROR_RETINA_SERVER + 2; + public static final int RETINA_UPDATE_FAILED = ERROR_RETINA_SERVER + 3; + public static final int RETINA_VISIBILITY_FAILED = ERROR_RETINA_SERVER + 4; } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/MainIndexBuffer.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/MainIndexBuffer.java index 5ee71ba582..e8efb46fc5 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/MainIndexBuffer.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/MainIndexBuffer.java @@ -53,6 +53,40 @@ public class MainIndexBuffer implements Closeable private final MainIndexCache indexCache; private boolean populateCache = false; + public static final class FlushSnapshot + { + private final long fileId; + private final int entryCount; + private final List rowIdRanges; + + private FlushSnapshot(long fileId, int entryCount, List rowIdRanges) + { + this.fileId = fileId; + this.entryCount = entryCount; + this.rowIdRanges = Collections.unmodifiableList(new ArrayList<>(rowIdRanges)); + } + + public long getFileId() + { + return fileId; + } + + public int getEntryCount() + { + return entryCount; + } + + public List getRowIdRanges() + { + return rowIdRanges; + } + + public boolean isEmpty() + { + return entryCount == 0; + } + } + /** * Create a main index buffer and bind the main index cache to it. 
* Entries put into this buffer will also be put into the cache. @@ -143,20 +177,19 @@ public IndexProto.RowLocation lookup(long rowId) throws MainIndexException } /** - * Flush the (row id -> row location) mappings of the given file id into ranges and remove them from the buffer. - * This method does not evict the main index cache bind to this buffer as the cached entries are not out of date. - * However, this method may disable synchronous cache population and clear the cache if remaining file ids in the - * buffer is below or equals to the {@link #CACHE_POP_ENABLE_THRESHOLD}. + * Build a stable snapshot of the (row id -> row location) mappings of the given file id. + * This method must not mutate the buffer or cache; callers should only discard the buffered + * entries after the snapshot has been durably committed. * @param fileId the given file id to flush - * @return the flushed row id ranges to be persisited into the storage + * @return the row id range snapshot to be persisted into the storage * @throws MainIndexException */ - public List flush(long fileId) throws MainIndexException + public FlushSnapshot snapshotForFlush(long fileId) throws MainIndexException { Map fileBuffer = this.indexBuffer.get(fileId); if (fileBuffer == null) { - return null; + return new FlushSnapshot(fileId, 0, Collections.emptyList()); } ImmutableList.Builder ranges = ImmutableList.builder(); RowIdRange.Builder currRangeBuilder = new RowIdRange.Builder(); @@ -210,16 +243,34 @@ public List flush(long fileId) throws MainIndexException // release the flushed file index buffer if(fileBuffer.size() != rowIds.length) { - throw new MainIndexException("FileBuffer Changed while flush"); + throw new MainIndexException("FileBuffer changed while building flush snapshot"); + } + return new FlushSnapshot(fileId, rowIds.length, ranges.build()); + } + + /** + * Discard a flush snapshot after the backing store has durably committed it. 
+ * @param snapshot the committed snapshot + * @throws MainIndexException if the buffer no longer matches the committed snapshot + */ + public void discardFlushed(FlushSnapshot snapshot) throws MainIndexException + { + if (snapshot.isEmpty()) + { + return; + } + Map fileBuffer = this.indexBuffer.get(snapshot.getFileId()); + if (fileBuffer == null || fileBuffer.size() != snapshot.getEntryCount()) + { + throw new MainIndexException("FileBuffer changed before committed flush discard"); } fileBuffer.clear(); - this.indexBuffer.remove(fileId); + this.indexBuffer.remove(snapshot.getFileId()); if (this.indexBuffer.size() <= CACHE_POP_ENABLE_THRESHOLD) { this.populateCache = false; this.indexCache.evictAllEntries(); } - return ranges.build(); } public List cachedFileIds() diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/ResolvedPrimary.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/ResolvedPrimary.java new file mode 100644 index 0000000000..4587a6fb63 --- /dev/null +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/ResolvedPrimary.java @@ -0,0 +1,52 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.common.index; + +import io.pixelsdb.pixels.index.IndexProto; + +import java.util.Objects; + +/** + * Result of a successful primary index resolution, returned wrapped in + * {@link java.util.Optional}: present = key is live; empty = key missing or + * maps to an orphan / non-baseline-visible location; backend failures surface + * as {@link io.pixelsdb.pixels.common.exception.IndexException}. + */ +public final class ResolvedPrimary +{ + private final long rowId; + private final IndexProto.RowLocation rowLocation; + + public ResolvedPrimary(long rowId, IndexProto.RowLocation rowLocation) + { + this.rowId = rowId; + this.rowLocation = Objects.requireNonNull(rowLocation, "rowLocation"); + } + + public long getRowId() + { + return rowId; + } + + public IndexProto.RowLocation getRowLocation() + { + return rowLocation; + } +} diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/RollbackEntry.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/RollbackEntry.java new file mode 100644 index 0000000000..20780aa2a0 --- /dev/null +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/RollbackEntry.java @@ -0,0 +1,59 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.common.index; + +import io.pixelsdb.pixels.index.IndexProto; + +import java.util.Objects; + +/** + * Journal record for restoring one primary index pointer from newRowId + * back to oldRowId. restorePrimaryIndexEntries writes back oldRowId only when + * the current pointer still equals newRowId, skipping entries that have + * been tombstoned or moved on to a third rowId. + */ +public final class RollbackEntry +{ + private final IndexProto.IndexKey indexKey; + private final long oldRowId; + private final long newRowId; + + public RollbackEntry(IndexProto.IndexKey indexKey, long oldRowId, long newRowId) + { + this.indexKey = Objects.requireNonNull(indexKey, "indexKey"); + this.oldRowId = oldRowId; + this.newRowId = newRowId; + } + + public IndexProto.IndexKey getIndexKey() + { + return indexKey; + } + + public long getOldRowId() + { + return oldRowId; + } + + public long getNewRowId() + { + return newRowId; + } +} diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/IndexService.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/IndexService.java index 627f340207..faff4e2a72 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/IndexService.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/IndexService.java @@ -21,8 +21,12 @@ import io.pixelsdb.pixels.common.exception.IndexException; import io.pixelsdb.pixels.common.index.IndexOption; +import io.pixelsdb.pixels.common.index.ResolvedPrimary; +import io.pixelsdb.pixels.common.index.RollbackEntry; import io.pixelsdb.pixels.index.IndexProto; + import java.util.List; +import java.util.Optional; public interface IndexService { @@ -40,7 +44,7 @@ public interface IndexService /** * Lookup a unique index. 
* @param key the index key - * @return the row location or null if the index entry is not found + * @return the row location, or null if the key is missing or maps to an orphan */ IndexProto.RowLocation lookupUniqueIndex(IndexProto.IndexKey key, IndexOption indexOption) throws IndexException; @@ -87,6 +91,7 @@ boolean putSecondaryIndexEntries(long tableId, long indexId, /** * Delete an entry from the primary index. The deleted index entry is marked as deleted using a tombstone. + * Crash-unsafe; prefer {@link #resolvePrimary} + {@link #deletePrimaryIndexEntriesOnly}. * @param key the index key * @return the row location of the deleted index entry * @throws IndexException if no existing entry to delete @@ -103,6 +108,7 @@ boolean putSecondaryIndexEntries(long tableId, long indexId, /** * Delete entries from the primary index. Each deleted index entry is marked as deleted using a tombstone. + * Crash-unsafe; prefer {@link #resolvePrimary} + {@link #deletePrimaryIndexEntriesOnly}. * @param tableId the table id of the index * @param indexId the index id of the index * @param keys the keys of the entries to delete @@ -126,6 +132,7 @@ List deleteSecondaryIndexEntries(long tableId, long indexId, /** * Update the entry of a primary index. + * Crash-unsafe; prefer DELETE + INSERT. * @param indexEntry the index entry to update * @return the previous row location of the index entry * @throws IndexException if no existing entry to update @@ -142,6 +149,7 @@ List deleteSecondaryIndexEntries(long tableId, long indexId, /** * Update the entries of a primary index. + * Crash-unsafe; prefer DELETE + INSERT. 
* @param tableId the table id of the primary index * @param indexId the index id of the primary index * @param indexEntries the index entries to update @@ -215,5 +223,129 @@ boolean flushIndexEntriesOfFile(long tableId, long indexId, * @return true on success */ boolean removeIndex(long tableId, long indexId, boolean isPrimary, IndexOption option) throws IndexException; + + // ================================================================================== + // Staged primary-index APIs. Default implementations throw UnsupportedOperationException; + // LocalIndexService provides the in-process implementation. + // ================================================================================== + + /** + * Resolve a batch of primary index keys to {@link ResolvedPrimary} (rowId + RowLocation), + * positionally aligned with keys. Returns Optional.empty() for keys + * that are missing, tombstoned, orphan in MainIndex, or filtered out by the + * baseline visible file set; throws on backend error. + * + * @param tableId the table id of the primary index + * @param indexId the index id of the primary index + * @param keys the primary index keys to resolve + * @param indexOption optional index option + * @return positional list of resolved primaries + * @throws IndexException on backend error + */ + default List> resolvePrimary(long tableId, long indexId, + List keys, IndexOption indexOption) throws IndexException + { + throw new UnsupportedOperationException( + "resolvePrimary is not supported by this IndexService scheme"); + } + + /** + * Write rowId -> RowLocation entries into the main index. 
+ * + * @param tableId the table id of the main index + * @param entries the entries to persist + * @throws IndexException on backend error + */ + default void putMainIndexEntriesOnly(long tableId, + List entries) throws IndexException + { + throw new UnsupportedOperationException( + "putMainIndexEntriesOnly is not supported by this IndexService scheme"); + } + + /** + * Write IndexKey -> rowId entries into the primary single point index. + * + * @param tableId the table id of the primary index + * @param indexId the index id of the primary index + * @param entries the entries to persist + * @param indexOption optional index option + * @throws IndexException on backend error + */ + default void putPrimaryIndexEntriesOnly(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + throw new UnsupportedOperationException( + "putPrimaryIndexEntriesOnly is not supported by this IndexService scheme"); + } + + /** + * Delete primary index entries for keys already resolved by {@link #resolvePrimary}. + * Repeating on an already-deleted key is a no-op. + * + * @param tableId the table id of the primary index + * @param indexId the index id of the primary index + * @param resolvedKeys the keys to delete + * @param indexOption optional index option + * @throws IndexException on backend error + */ + default void deletePrimaryIndexEntriesOnly(long tableId, long indexId, + List resolvedKeys, IndexOption indexOption) throws IndexException + { + throw new UnsupportedOperationException( + "deletePrimaryIndexEntriesOnly is not supported by this IndexService scheme"); + } + + /** + * Update primary index entries to the new IndexKey -> rowId mapping; + * does not look up the previous rowId. 
+ * + * @param tableId the table id of the primary index + * @param indexId the index id of the primary index + * @param entries the new IndexKey -> rowId mappings + * @param indexOption optional index option + * @throws IndexException on backend error + */ + default void updatePrimaryIndexEntriesOnly(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + throw new UnsupportedOperationException( + "updatePrimaryIndexEntriesOnly is not supported by this IndexService scheme"); + } + + /** + * Restore primary index entries to oldRowId where the current pointer + * still equals newRowId; skip otherwise. Intended for single-threaded + * rollback windows and does not require atomic conditional update from the backend. + * + * @param tableId the table id of the primary index + * @param indexId the index id of the primary index + * @param entries rollback entries describing each oldRowId -> newRowId transition + * @param indexOption optional index option + * @throws IndexException on backend error + */ + default void restorePrimaryIndexEntries(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + throw new UnsupportedOperationException( + "restorePrimaryIndexEntries is not supported by this IndexService scheme"); + } + + /** + * Delete rowId -> RowLocation mappings for a contiguous main-index range. + * The range is half-open: [rowIdStart, rowIdStart + rowCount). 
+ * + * @param tableId the table id of the main index + * @param fileId the file id owning the rowId range + * @param rowIdStart the first row id to delete + * @param rowCount the number of row ids to delete + * @throws IndexException on backend error + */ + default void deleteMainIndexRange(long tableId, long fileId, long rowIdStart, int rowCount) + throws IndexException + { + throw new UnsupportedOperationException( + "deleteMainIndexRange is not supported by this IndexService scheme"); + } } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/LocalIndexService.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/LocalIndexService.java index 7577036278..581c374598 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/LocalIndexService.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/index/service/LocalIndexService.java @@ -28,12 +28,16 @@ import io.pixelsdb.pixels.common.utils.ConfigFactory; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Optional; +import java.util.Set; public class LocalIndexService implements IndexService { private static final LocalIndexService defaultInstance = new LocalIndexService(); private static boolean upsertMode; + public static LocalIndexService Instance() { return defaultInstance; @@ -60,34 +64,10 @@ public IndexProto.RowIdBatch allocateRowIdBatch(long tableId, int numRowIds) thr @Override public IndexProto.RowLocation lookupUniqueIndex(IndexProto.IndexKey key, IndexOption indexOption) throws IndexException { - try - { - long tableId = key.getTableId(); - long indexId = key.getIndexId(); - MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); - SinglePointIndex singlePointIndex = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); - long rowId = singlePointIndex.getUniqueRowId(key); - if (rowId >= 0) - { - IndexProto.RowLocation 
rowLocation = mainIndex.getLocation(rowId); - if (rowLocation != null) - { - return rowLocation; - } - else - { - throw new IndexException("Failed to get row location for rowId=" + rowId); - } - } - else - { - return null; - } - } - catch (SinglePointIndexException | MainIndexException e) - { - throw new IndexException("Failed to lookup unique index for key=" + key, e); - } + // Delegates to resolvePrimary; only backend errors throw, everything else returns null. + List> resolved = resolvePrimary( + key.getTableId(), key.getIndexId(), Collections.singletonList(key), indexOption); + return resolved.get(0).map(ResolvedPrimary::getRowLocation).orElse(null); } @Override @@ -134,71 +114,23 @@ public List lookupNonUniqueIndex(IndexProto.IndexKey key @Override public boolean putPrimaryIndexEntry(IndexProto.PrimaryIndexEntry entry, IndexOption indexOption) throws IndexException { - try - { - IndexProto.IndexKey key = entry.getIndexKey(); - long tableId = key.getTableId(); - long indexId = key.getIndexId(); - MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); - SinglePointIndex singlePointIndex = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); - // Insert into single point index - boolean spSuccess = singlePointIndex.putEntry(entry.getIndexKey(), entry.getRowId()); - if (!spSuccess) - { - throw new IndexException("Failed to put entry into single point index for key=" + key); - } - // Insert into main index - boolean mainSuccess = mainIndex.putEntry(entry.getRowId(), entry.getRowLocation()); - if (!mainSuccess) - { - throw new IndexException("Failed to put entry into main index for rowId=" + entry.getRowId()); - } - return true; - } - catch (SinglePointIndexException e) - { - throw new IndexException("Failed to put entry into single point index for key=" + entry.getIndexKey(), e); - } - catch (MainIndexException e) - { - throw new IndexException("Failed to put entry into main index for rowId=" + entry.getRowId(), 
e); - } + // Delegates to putPrimaryIndexEntries. + IndexProto.IndexKey key = entry.getIndexKey(); + return putPrimaryIndexEntries(key.getTableId(), key.getIndexId(), + Collections.singletonList(entry), indexOption); } @Override public boolean putPrimaryIndexEntries(long tableId, long indexId, List entries, IndexOption indexOption) throws IndexException { - try + if (entries == null || entries.isEmpty()) { - SinglePointIndex singlePointIndex = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); - // Batch insert into single point index - boolean success = singlePointIndex.putPrimaryEntries(entries); - if (!success) - { - throw new IndexException("Failed to put primary entries into single point index, tableId=" - + tableId + ", indexId=" + indexId); - } - MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); - for (Boolean mainSuccess : mainIndex.putEntries(entries)) - { - if(!mainSuccess) - { - throw new MainIndexException("Failed to put entry into main index, tableId: " + tableId); - } - } return true; } - catch (SinglePointIndexException e) - { - throw new IndexException("Failed to put primary entries into single point index, tableId=" - + tableId + ", indexId=" + indexId, e); - } - catch (MainIndexException e) - { - // Retained for consistency with original code, though normally not expected here - throw new IndexException("Failed to put primary entries into main index, tableId=" - + tableId + ", indexId=" + indexId, e); - } + // Crash-safe order: MainIndex first (rowId -> RowLocation), then primary (IndexKey -> rowId). 
+ putMainIndexEntriesOnly(tableId, entries); + putPrimaryIndexEntriesOnly(tableId, indexId, entries, indexOption); + return true; } @Override @@ -633,4 +565,206 @@ public boolean removeIndex(long tableId, long indexId, boolean isPrimary, IndexO throw new IndexException("Failed to remove index for tableId=" + tableId + ", indexId=" + indexId, e); } } + + // ================================================================================== + // Staged primary-index APIs. Contracts live on the matching IndexService methods. + // ================================================================================== + + @Override + public List> resolvePrimary(long tableId, long indexId, + List keys, IndexOption indexOption) throws IndexException + { + if (keys == null || keys.isEmpty()) + { + return Collections.emptyList(); + } + try + { + SinglePointIndex sp = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); + MainIndex mi = MainIndexFactory.Instance().getMainIndex(tableId); + List> result = new ArrayList<>(keys.size()); + for (IndexProto.IndexKey key : keys) + { + long rowId = sp.getUniqueRowId(key); + if (rowId < 0) + { + // missing or tombstoned in primary + result.add(Optional.empty()); + continue; + } + IndexProto.RowLocation location = mi.getLocation(rowId); + if (location == null) + { + // MainIndex orphan rowId + result.add(Optional.empty()); + continue; + } + result.add(Optional.of(new ResolvedPrimary(rowId, location))); + } + return result; + } + catch (SinglePointIndexException | MainIndexException e) + { + throw new IndexException("Failed to resolve primary for tableId=" + tableId + + ", indexId=" + indexId, e); + } + } + + @Override + public void putMainIndexEntriesOnly(long tableId, List entries) throws IndexException + { + if (entries == null || entries.isEmpty()) + { + return; + } + try + { + MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + List results = mainIndex.putEntries(entries); + for 
(Boolean ok : results) + { + if (ok == null || !ok) + { + throw new IndexException("Failed to put main index entry, tableId=" + tableId); + } + } + } + catch (MainIndexException e) + { + throw new IndexException("Failed to put main index entries for tableId=" + tableId, e); + } + } + + @Override + public void putPrimaryIndexEntriesOnly(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + if (entries == null || entries.isEmpty()) + { + return; + } + try + { + SinglePointIndex sp = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); + if (!sp.putPrimaryEntries(entries)) + { + throw new IndexException("Failed to put primary entries into single point index for tableId=" + + tableId + ", indexId=" + indexId); + } + } + catch (SinglePointIndexException e) + { + throw new IndexException("Failed to put primary entries into single point index for tableId=" + + tableId + ", indexId=" + indexId, e); + } + } + + @Override + public void deletePrimaryIndexEntriesOnly(long tableId, long indexId, + List resolvedKeys, IndexOption indexOption) throws IndexException + { + if (resolvedKeys == null || resolvedKeys.isEmpty()) + { + return; + } + try + { + SinglePointIndex sp = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); + // TODO: avoid the repeated primary lookup by adding a tombstone-only index API. 
+ sp.deleteEntries(resolvedKeys); + } + catch (SinglePointIndexException e) + { + throw new IndexException("Failed to delete primary entries for tableId=" + + tableId + ", indexId=" + indexId, e); + } + } + + @Override + public void updatePrimaryIndexEntriesOnly(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + if (entries == null || entries.isEmpty()) + { + return; + } + try + { + SinglePointIndex sp = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); + // TODO: avoid the repeated primary lookup by adding an update API that accepts resolved rowIds. + sp.updatePrimaryEntries(entries); + } + catch (SinglePointIndexException e) + { + throw new IndexException("Failed to update primary entries for tableId=" + + tableId + ", indexId=" + indexId, e); + } + } + + @Override + public void restorePrimaryIndexEntries(long tableId, long indexId, + List entries, IndexOption indexOption) throws IndexException + { + if (entries == null || entries.isEmpty()) + { + return; + } + // RECOVERING is single-threaded for these entries; read-then-write needs no CAS. + try + { + SinglePointIndex sp = SinglePointIndexFactory.Instance().getSinglePointIndex(tableId, indexId, indexOption); + List toRestore = new ArrayList<>(); + for (RollbackEntry entry : entries) + { + long current = sp.getUniqueRowId(entry.getIndexKey()); + if (current == entry.getNewRowId()) + { + toRestore.add(IndexProto.PrimaryIndexEntry.newBuilder() + .setIndexKey(entry.getIndexKey()) + .setRowId(entry.getOldRowId()) + .build()); + } + // else: primary already tombstoned, reverted, or moved on; skip. 
+ } + if (!toRestore.isEmpty()) + { + sp.updatePrimaryEntries(toRestore); + } + } + catch (SinglePointIndexException e) + { + throw new IndexException("Failed to restore primary entries for tableId=" + + tableId + ", indexId=" + indexId, e); + } + } + + @Override + public void deleteMainIndexRange(long tableId, long fileId, long rowIdStart, int rowCount) + throws IndexException + { + if (rowCount <= 0) + { + return; + } + try + { + MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + if (mainIndex.hasCache()) + { + mainIndex.flushCache(fileId); + } + RowIdRange rowIdRange = new RowIdRange(rowIdStart, rowIdStart + rowCount, + fileId, 0, 0, rowCount); + if (!mainIndex.deleteRowIdRange(rowIdRange)) + { + throw new IndexException("Failed to delete main index range for tableId=" + + tableId + ", fileId=" + fileId); + } + } + catch (MainIndexException e) + { + throw new IndexException("Failed to delete main index range for tableId=" + + tableId + ", fileId=" + fileId, e); + } + } } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java index 8835f63ac7..6486e7c0c4 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/MetadataService.java @@ -28,6 +28,7 @@ import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.server.HostAddress; import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; import io.pixelsdb.pixels.common.utils.ShutdownHookManager; import io.pixelsdb.pixels.daemon.MetadataProto; import io.pixelsdb.pixels.daemon.MetadataServiceGrpc; @@ -1361,7 +1362,7 @@ public boolean addFiles(Collection files) throws MetadataException { throw new MetadataException("failed to add file", e); } - return false; + return true; } /** @@ 
-1420,7 +1421,7 @@ public File.Type getFileType(String filePathUri) throws MetadataException { throw new MetadataException("response token does not match."); } - return File.Type.valueOf(response.getFileType().getNumber()); + return File.Type.valueOf(response.getFileTypeValue()); } catch (Exception e) { @@ -1428,14 +1429,57 @@ public File.Type getFileType(String filePathUri) throws MetadataException } } - public List getFiles(long pathId) throws MetadataException + /** + * Return query-visible {@link File.Type#REGULAR} files under the path. + */ + public List getRegularFiles(long pathId) throws MetadataException + { + return getFilesByType(pathId, EnumSet.of(File.Type.REGULAR)); + } + + /** + * Return files of the requested types, scoped to a single path. + */ + public List getFilesByType(long pathId, Set types) throws MetadataException + { + return invokeGetFilesByType(pathId, types, "get files by type"); + } + + /** + * Catalog-wide cross-path enumeration of the requested types. + */ + public List getFilesByType(Set types) throws MetadataException + { + return invokeGetFilesByType(null, types, "get files by type (cross-path)"); + } + + private List invokeGetFilesByType(Long pathId, Set types, String errorContext) + throws MetadataException { + if (types == null || types.isEmpty()) + { + throw new IllegalArgumentException( + errorContext + ": 'types' must be non-null and non-empty"); + } String token = UUID.randomUUID().toString(); - MetadataProto.GetFilesRequest request = MetadataProto.GetFilesRequest.newBuilder() - .setHeader(MetadataProto.RequestHeader.newBuilder().setToken(token)).setPathId(pathId).build(); + MetadataProto.GetFilesByTypeRequest.Builder requestBuilder = + MetadataProto.GetFilesByTypeRequest.newBuilder() + .setHeader(MetadataProto.RequestHeader.newBuilder().setToken(token)); + if (pathId != null) + { + requestBuilder.setPathId(pathId); + } + for (File.Type type : types) + { + if (type != null) + { + 
requestBuilder.addFileTypesValue(type.getNumber()); + } + } + try { - MetadataProto.GetFilesResponse response = this.stub.getFiles(request); + MetadataProto.GetFilesByTypeResponse response = this.stub.getFilesByType(requestBuilder.build()); if (response.getHeader().getErrorCode() != 0) { throw new MetadataException("error code=" + response.getHeader().getErrorCode() @@ -1447,10 +1491,104 @@ public List getFiles(long pathId) throws MetadataException } return File.convertFiles(response.getFilesList()); } + catch (MetadataException e) + { + throw e; + } catch (Exception e) { - throw new MetadataException("failed to get files", e); + throw new MetadataException("failed to " + errorContext, e); + } + } + + /** + * Return temporary files (TEMPORARY_INGEST + TEMPORARY_GC) whose filename + * create time plus {@code ttlMs} is not later than now. + * + *

The create time is decoded from the {@code yyyyMMddHHmmss} timestamp in + * the file name. Files with unparsable names are logged and skipped. + * + *

For background sweepers only; not for query-visible callers. + * + * @param ttlMs temporary-file TTL in milliseconds. Must be {@code >= 0}. + */ + public List listTemporaryFilesDue(long ttlMs) throws MetadataException + { + if (ttlMs < 0) + { + throw new IllegalArgumentException("listTemporaryFilesDue: ttlMs must be >= 0, got " + ttlMs); + } + long now = System.currentTimeMillis(); + List all = getFilesByType( + EnumSet.of(File.Type.TEMPORARY_INGEST, File.Type.TEMPORARY_GC)); + List due = new ArrayList<>(all.size()); + int skippedParseFailure = 0; + for (File f : all) + { + OptionalLong createTime = PixelsFileNameUtils.extractCreateTimeMillis(f.getName()); + if (!createTime.isPresent()) + { + skippedParseFailure++; + logger.warn("listTemporaryFilesDue: cannot decode createTime from file name '{}' " + + "(id={}, pathId={}, type={}); skipping. event=sweep.parse_failure", + f.getName(), f.getId(), f.getPathId(), f.getType()); + continue; + } + if (createTime.getAsLong() + ttlMs <= now) + { + due.add(f); + } + } + if (skippedParseFailure > 0) + { + logger.warn("listTemporaryFilesDue: skipped {} temporary file(s) due to filename parse failure; " + + "investigate writer-side filename generation. event=sweep.parse_failure.summary", + skippedParseFailure); + } + // Oldest-first ordering for reproducible sweep batches. The createTime is already + // parsed once above, but the file list is small (sweep batch), so re-parsing here + // is acceptable and keeps the sort key self-contained. + due.sort(Comparator + .comparingLong((File f) -> PixelsFileNameUtils.extractCreateTimeMillis(f.getName()) + .orElse(Long.MAX_VALUE)) + .thenComparingLong(File::getId)); + return due; + } + + /** + * Return RETIRED files whose {@code cleanupAt} deadline has arrived. 
+ */ + public List listRetiredFilesDue() throws MetadataException + { + long now = System.currentTimeMillis(); + List all = getFilesByType(EnumSet.of(File.Type.RETIRED)); + List due = new ArrayList<>(all.size()); + int skippedInvariantViolation = 0; + for (File f : all) + { + Long cleanupAt = f.getCleanupAt(); + if (cleanupAt == null) + { + skippedInvariantViolation++; + logger.warn("listRetiredFilesDue: RETIRED file '{}' (id={}, pathId={}) carries no cleanupAt; " + + "skipping. event=sweep.invariant_violation", + f.getName(), f.getId(), f.getPathId()); + continue; + } + if (cleanupAt <= now) + { + due.add(f); + } + } + if (skippedInvariantViolation > 0) + { + logger.warn("listRetiredFilesDue: skipped {} RETIRED file(s) missing cleanupAt; " + + "investigate DAO write path. event=sweep.invariant_violation.summary", + skippedInvariantViolation); } + due.sort(Comparator.comparingLong((File f) -> f.getCleanupAt()) + .thenComparingLong(File::getId)); + return due; } public boolean updateFile(File file) throws MetadataException @@ -1476,7 +1614,7 @@ public boolean updateFile(File file) throws MetadataException { throw new MetadataException("failed to update file", e); } - return false; + return true; } public boolean deleteFiles(List fileIds) throws MetadataException @@ -1502,7 +1640,7 @@ public boolean deleteFiles(List fileIds) throws MetadataException { throw new MetadataException("failed to delete files", e); } - return false; + return true; } /** @@ -1537,17 +1675,18 @@ public File getFileById(long fileId) throws MetadataException } /** - * Atomically promote a TEMPORARY file to REGULAR and delete the old files. - * @param newFileId the id of the new TEMPORARY file to promote - * @param oldFileIds the ids of old files to delete + * Atomically promote a temporary GC file to REGULAR and retire the old files. 
+ * @param newFileId the id of the new temporary GC file to promote + * @param oldFileIds the ids of old files to retire + * @param cleanupAt the cleanup deadline to write on retired old files * @throws MetadataException if the request fails */ - public void atomicSwapFiles(long newFileId, List oldFileIds) throws MetadataException + public void atomicSwapFiles(long newFileId, List oldFileIds, long cleanupAt) throws MetadataException { String token = UUID.randomUUID().toString(); MetadataProto.AtomicSwapFilesRequest request = MetadataProto.AtomicSwapFilesRequest.newBuilder() .setHeader(MetadataProto.RequestHeader.newBuilder().setToken(token)) - .setNewFileId(newFileId).addAllOldFileIds(oldFileIds).build(); + .setNewFileId(newFileId).addAllOldFileIds(oldFileIds).setCleanupAt(cleanupAt).build(); try { MetadataProto.AtomicSwapFilesResponse response = this.stub.atomicSwapFiles(request); diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/domain/File.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/domain/File.java index 7dd46ecdc3..a567b82939 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/domain/File.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/metadata/domain/File.java @@ -33,22 +33,37 @@ */ public class File extends Base { - /** - * Files such as loaded and compacted are marked as REGULAR, while file - * created by pixelsWriterImpl during build are marked as TEMPORARY. 
- */ public enum Type { - TEMPORARY, REGULAR; + TEMPORARY_INGEST(0), + REGULAR(1), + TEMPORARY_GC(2), + RETIRED(3); + + private final int number; + + Type(int number) + { + this.number = number; + } + + public int getNumber() + { + return number; + } public static Type valueOf(int number) { switch (number) { case 0: - return TEMPORARY; + return TEMPORARY_INGEST; case 1: return REGULAR; + case 2: + return TEMPORARY_GC; + case 3: + return RETIRED; default: throw new InvalidArgumentException("invalid number for File.Type"); } @@ -61,6 +76,7 @@ public static Type valueOf(int number) private long minRowId; private long maxRowId; private long pathId; + private Long cleanupAt; public File() { @@ -70,11 +86,12 @@ public File(MetadataProto.File file) { this.setId(file.getId()); this.name = file.getName(); - this.type = Type.valueOf(file.getType().getNumber()); + this.type = Type.valueOf(file.getTypeValue()); this.numRowGroup = file.getNumRowGroup(); this.minRowId = file.getMinRowId(); this.maxRowId = file.getMaxRowId(); this.pathId = file.getPathId(); + this.cleanupAt = file.hasCleanupAt() ? 
file.getCleanupAt() : null; } public String getName() @@ -137,6 +154,16 @@ public void setPathId(long pathId) this.pathId = pathId; } + public Long getCleanupAt() + { + return cleanupAt; + } + + public void setCleanupAt(Long cleanupAt) + { + this.cleanupAt = cleanupAt; + } + public static List convertFiles(List protoFiles) { requireNonNull(protoFiles, "protoFiles is null"); @@ -182,8 +209,14 @@ public static String getFilePath(Path path, File file) @Override public MetadataProto.File toProto() { - return MetadataProto.File.newBuilder().setId(this.getId()).setName(this.name) - .setTypeValue(this.type.ordinal()).setNumRowGroup(this.numRowGroup) - .setMinRowId(this.minRowId).setMaxRowId(this.maxRowId).setPathId(this.pathId).build(); + MetadataProto.File.Builder builder = MetadataProto.File.newBuilder() + .setId(this.getId()).setName(this.name) + .setTypeValue(this.type.getNumber()).setNumRowGroup(this.numRowGroup) + .setMinRowId(this.minRowId).setMaxRowId(this.maxRowId).setPathId(this.pathId); + if (this.cleanupAt != null) + { + builder.setCleanupAt(this.cleanupAt); + } + return builder.build(); } } diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/Storage.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/Storage.java index bdf151d9fa..620ff24434 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/Storage.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/physical/Storage.java @@ -262,6 +262,17 @@ default DataOutputStream create(String path, boolean overwrite, return create(path, overwrite, bufferSize, replication); } + /** + * Opens an existing file for appending. + * Backends that do not support append (e.g. S3, GCS) throw {@link UnsupportedOperationException}. 
+ * @param path the path of the existing file + * @param bufferSize write-buffer size in bytes + * @return an output stream positioned at the end of the file + * @throws IOException if the file does not exist or the backend reports an error + * @throws UnsupportedOperationException if the storage backend does not support append + */ + DataOutputStream append(String path, int bufferSize) throws IOException; + /** * For local fs, path is considered as local. * @param path the path to delete diff --git a/pixels-common/src/main/java/io/pixelsdb/pixels/common/retina/RetinaService.java b/pixels-common/src/main/java/io/pixelsdb/pixels/common/retina/RetinaService.java index d27f912c01..deb0bef8ff 100644 --- a/pixels-common/src/main/java/io/pixelsdb/pixels/common/retina/RetinaService.java +++ b/pixels-common/src/main/java/io/pixelsdb/pixels/common/retina/RetinaService.java @@ -147,6 +147,20 @@ private synchronized void shutdown() throws InterruptedException } } + private static void checkHeader(String operation, String expectedToken, + RetinaProto.ResponseHeader header) throws RetinaException + { + if (header.getErrorCode() != 0) + { + throw new RetinaException("Failed to " + operation + ": " + + header.getErrorCode() + " " + header.getErrorMsg()); + } + if (!header.getToken().equals(expectedToken)) + { + throw new RetinaException("Response token does not match"); + } + } + public boolean updateRecord(String schemaName, int virtualNodeId, List tableUpdateData) throws RetinaException { String token = UUID.randomUUID().toString(); @@ -157,15 +171,7 @@ public boolean updateRecord(String schemaName, int virtualNodeId, List> resolved = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Arrays.asList(e0.getIndexKey(), e1.getIndexKey()), opt); + assertEquals(2, resolved.size()); + assertTrue(resolved.get(0).isPresent()); + assertEquals(row0, resolved.get(0).get().getRowId()); + assertEquals(100L, resolved.get(0).get().getRowLocation().getFileId()); + 
assertTrue(resolved.get(1).isPresent()); + assertEquals(row1, resolved.get(1).get().getRowId()); + } + + @Test + @Order(11) + void testStagedResolvePrimaryReturnsEmptyForUnknownKey() throws Exception + { + IndexOption opt = IndexOption.builder().vNodeId(0).build(); + IndexProto.IndexKey unknown = IndexProto.IndexKey.newBuilder() + .setTableId(STAGED_TABLE_ID) + .setIndexId(STAGED_PRIMARY_INDEX_ID) + .setKey(ByteString.copyFromUtf8("staged-not-there")) + .setTimestamp(1000L) + .build(); + + List> resolved = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(unknown), opt); + assertEquals(1, resolved.size()); + assertFalse(resolved.get(0).isPresent()); + } + + @Test + @Order(13) + void testStagedTombstonePrimaryResolvedIsIdempotent() throws Exception + { + IndexOption opt = IndexOption.builder().vNodeId(0).build(); + IndexProto.IndexKey k0 = stagedEntry("staged-k0", 0L, 100L, 0, 0).getIndexKey(); + + // First tombstone removes the live primary entry. + indexService.deletePrimaryIndexEntriesOnly(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(k0), opt); + + List> resolved = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(k0), opt); + assertFalse(resolved.get(0).isPresent()); + + // Repeated tombstone of an already-tombstoned key must be a no-op (idempotency invariant). 
+ assertDoesNotThrow(() -> indexService.deletePrimaryIndexEntriesOnly( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(k0), opt)); + } + + @Test + @Order(14) + void testStagedUpdateResolvedThenRestorePrimaryEntries() throws Exception + { + IndexOption opt = IndexOption.builder().vNodeId(0).build(); + IndexProto.IndexKey k1 = stagedEntry("staged-k1", 0L, 100L, 0, 1).getIndexKey(); + long oldRowId = indexService.resolvePrimary(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(k1), opt).get(0).get().getRowId(); + + long newRowId = oldRowId + 100; + IndexProto.PrimaryIndexEntry newEntry = stagedEntry("staged-k1", newRowId, 101L, 0, 0); + indexService.putMainIndexEntriesOnly(STAGED_TABLE_ID, Collections.singletonList(newEntry)); + indexService.updatePrimaryIndexEntriesOnly(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(newEntry), opt); + + Optional updated = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(k1), opt).get(0); + assertTrue(updated.isPresent()); + assertEquals(newRowId, updated.get().getRowId()); + + indexService.restorePrimaryIndexEntries(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(new RollbackEntry(k1, oldRowId, newRowId)), opt); + + Optional restored = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(k1), opt).get(0); + assertTrue(restored.isPresent()); + assertEquals(oldRowId, restored.get().getRowId()); + } + + @Test + @Order(15) + void testStagedRestorePrimaryEntriesSkipsNonMatchingCurrent() throws Exception + { + IndexOption opt = IndexOption.builder().vNodeId(0).build(); + IndexProto.IndexKey k1 = stagedEntry("staged-k1", 0L, 100L, 0, 1).getIndexKey(); + long currentRowId = indexService.resolvePrimary(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(k1), opt).get(0).get().getRowId(); + + // Rollback entry says: switch from 
newRowId=currentRowId+5 back to oldRowId=currentRowId-7. + // Since the actual current pointer is `currentRowId` (not newRowId=currentRowId+5), the + // restore must be a no-op. + RollbackEntry entry = new RollbackEntry(k1, currentRowId - 7, currentRowId + 5); + indexService.restorePrimaryIndexEntries(STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, + Collections.singletonList(entry), opt); + + // Verify primary still points at the original rowId, not the spurious oldRowId. + Optional after = indexService.resolvePrimary( + STAGED_TABLE_ID, STAGED_PRIMARY_INDEX_ID, Collections.singletonList(k1), opt).get(0); + assertTrue(after.isPresent()); + assertEquals(currentRowId, after.get().getRowId()); + } } diff --git a/pixels-common/src/test/java/io/pixelsdb/pixels/common/metadata/domain/TestFileDomain.java b/pixels-common/src/test/java/io/pixelsdb/pixels/common/metadata/domain/TestFileDomain.java new file mode 100644 index 0000000000..3907948f18 --- /dev/null +++ b/pixels-common/src/test/java/io/pixelsdb/pixels/common/metadata/domain/TestFileDomain.java @@ -0,0 +1,336 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.common.metadata.domain; + +import io.pixelsdb.pixels.common.exception.InvalidArgumentException; +import io.pixelsdb.pixels.daemon.MetadataProto; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +/** + * Unit tests for {@link File} that exercise the c01.1 contract: + *

<ul> + *
<li>{@link File.Type} now carries an explicit numeric tag (no longer relies on {@code ordinal()}).</li> + *
<li>The four enum constants — {@code TEMPORARY_INGEST(0)}, {@code REGULAR(1)}, + * {@code TEMPORARY_GC(2)}, {@code RETIRED(3)} — must round-trip cleanly through both + * {@link MetadataProto.File} and the domain object.</li> + *
<li>{@link File#getCleanupAt()} is an optional field: it must be preserved across + * {@link File#toProto()} / {@code new File(MetadataProto.File)} when present and absent.</li> + *
</ul>
+ * + * @author tdd-guide + * @create 2026-05-13 + */ +public class TestFileDomain +{ + // ------------------------------------------------------------------------- + // File.Type — numeric tags + // ------------------------------------------------------------------------- + + /** + * The domain {@link File.Type#getNumber()} must agree with the proto-generated + * {@link MetadataProto.File.Type#getNumber()} for every constant we publish. + * This guards against the previous implementation that relied on + * {@code ordinal()} and would silently re-number constants when the enum order changed. + */ + @Test + public void typeNumber_isConsistentWithProtoEnum() + { + assertEquals(MetadataProto.File.Type.TEMPORARY_INGEST.getNumber(), + File.Type.TEMPORARY_INGEST.getNumber()); + assertEquals(MetadataProto.File.Type.REGULAR.getNumber(), + File.Type.REGULAR.getNumber()); + assertEquals(MetadataProto.File.Type.TEMPORARY_GC.getNumber(), + File.Type.TEMPORARY_GC.getNumber()); + assertEquals(MetadataProto.File.Type.RETIRED.getNumber(), + File.Type.RETIRED.getNumber()); + } + + // ------------------------------------------------------------------------- + // File.Type.valueOf(int) — happy path + boundaries + // ------------------------------------------------------------------------- + + @Test + public void typeValueOf_resolvesAllKnownNumbers() + { + assertSame(File.Type.TEMPORARY_INGEST, File.Type.valueOf(0)); + assertSame(File.Type.REGULAR, File.Type.valueOf(1)); + assertSame(File.Type.TEMPORARY_GC, File.Type.valueOf(2)); + assertSame(File.Type.RETIRED, File.Type.valueOf(3)); + } + + @Test + public void typeValueOf_rejectsInvalidNumbers() + { + // Test various boundary cases for invalid type numbers + int[] invalidNumbers = {-1, 4, Integer.MAX_VALUE, Integer.MIN_VALUE}; + + for (int invalidNumber : invalidNumbers) + { + try + { + File.Type.valueOf(invalidNumber); + fail("expected InvalidArgumentException for number: " + invalidNumber); + } + catch (InvalidArgumentException 
expected) + { + assertNotNull("Exception message should not be null for number: " + invalidNumber, + expected.getMessage()); + } + } + } + + /** + * Round-trip: every constant survives {@code num -> valueOf -> getNumber}. + */ + @Test + public void typeValueOf_roundTripForAllConstants() + { + for (File.Type t : File.Type.values()) + { + assertSame("round-trip failed for " + t, + t, File.Type.valueOf(t.getNumber())); + } + } + + // ------------------------------------------------------------------------- + // cleanupAt — getter / setter + // ------------------------------------------------------------------------- + + @Test + public void cleanupAt_defaultsToNullOnNoArgConstructor() + { + File f = new File(); + assertNull("a freshly constructed File must have a null cleanupAt", f.getCleanupAt()); + } + + @Test + public void cleanupAt_setterAcceptsValueAndNull() + { + File f = new File(); + f.setCleanupAt(123_456_789L); + assertEquals(Long.valueOf(123_456_789L), f.getCleanupAt()); + + // explicit clear must be supported (used after promote-to-REGULAR) + f.setCleanupAt(null); + assertNull(f.getCleanupAt()); + } + + // ------------------------------------------------------------------------- + // toProto / fromProto round-trip + // ------------------------------------------------------------------------- + + /** + * When {@code cleanupAt == null}, {@link File#toProto()} must NOT set the optional + * field on the wire. Otherwise downstream consumers calling {@code hasCleanupAt()} + * would see a spurious zero deadline. + */ + @Test + public void toProto_omitsCleanupAt_whenDomainValueIsNull() + { + File f = makeFile(1L, "n.pxl", File.Type.TEMPORARY_INGEST, 1, 0L, 0L, 1L, null); + + MetadataProto.File proto = f.toProto(); + + assertFalse("cleanupAt must be absent on the wire when domain value is null", + proto.hasCleanupAt()); + } + + /** + * cleanupAt = 0L is a legitimate value (epoch start); it must NOT be confused with "absent". 
+ * Without this guard, a naïve {@code if (cleanupAt != 0)} check would silently drop the field. + */ + @Test + public void toProto_includesCleanupAt_whenValueIsZero() + { + File f = makeFile(1L, "z.pxl", File.Type.RETIRED, 1, 0L, 0L, 1L, 0L); + + MetadataProto.File proto = f.toProto(); + + assertTrue("cleanupAt = 0L must be carried on the wire (zero != absent)", + proto.hasCleanupAt()); + assertEquals(0L, proto.getCleanupAt()); + } + + @Test + public void fromProto_preservesCleanupAt_whenSet() + { + long deadline = 1_700_000_123_456L; + MetadataProto.File proto = MetadataProto.File.newBuilder() + .setId(42L) + .setName("retired.pxl") + .setTypeValue(File.Type.RETIRED.getNumber()) + .setNumRowGroup(2) + .setMinRowId(0L) + .setMaxRowId(127L) + .setPathId(9L) + .setCleanupAt(deadline) + .build(); + + File f = new File(proto); + + assertEquals(42L, f.getId()); + assertEquals("retired.pxl", f.getName()); + assertSame(File.Type.RETIRED, f.getType()); + assertEquals(2, f.getNumRowGroup()); + assertEquals(0L, f.getMinRowId()); + assertEquals(127L, f.getMaxRowId()); + assertEquals(9L, f.getPathId()); + assertNotNull("cleanupAt must be retained from the proto", f.getCleanupAt()); + assertEquals(Long.valueOf(deadline), f.getCleanupAt()); + } + + /** + * If the proto omits the optional cleanupAt, the domain object MUST observe {@code null} + * (not 0L). This is the reciprocal of {@link #toProto_omitsCleanupAt_whenDomainValueIsNull()}. 
+ */ + @Test + public void fromProto_returnsNullCleanupAt_whenAbsent() + { + MetadataProto.File proto = MetadataProto.File.newBuilder() + .setId(1L) + .setName("tmp.pxl") + .setTypeValue(File.Type.TEMPORARY_GC.getNumber()) + .setNumRowGroup(1) + .setMinRowId(0L) + .setMaxRowId(0L) + .setPathId(1L) + .build(); + + File f = new File(proto); + + assertNull("absent cleanupAt on the wire must materialise as null in the domain", + f.getCleanupAt()); + } + + /** + * End-to-end round-trip — domain → proto → domain — must be lossless for every {@link File.Type}. + */ + @Test + public void roundTrip_domainProtoDomain_isLossless_forEveryType() + { + for (File.Type t : File.Type.values()) + { + // The domain object preserves cleanupAt exactly as provided; lifecycle-specific + // invariants are enforced by callers that create or update catalog rows. + Long cleanup = (t == File.Type.REGULAR) ? null : 1_700_000_000_999L; + File original = makeFile(7L, "x_" + t + ".pxl", t, 1, 0L, 63L, 3L, cleanup); + + File restored = new File(original.toProto()); + + assertEquals("id mismatch for " + t, original.getId(), restored.getId()); + assertEquals("name mismatch for " + t, original.getName(), restored.getName()); + assertSame("type mismatch for " + t, original.getType(), restored.getType()); + assertEquals("numRowGroup mismatch for " + t, + original.getNumRowGroup(), restored.getNumRowGroup()); + assertEquals("minRowId mismatch for " + t, + original.getMinRowId(), restored.getMinRowId()); + assertEquals("maxRowId mismatch for " + t, + original.getMaxRowId(), restored.getMaxRowId()); + assertEquals("pathId mismatch for " + t, + original.getPathId(), restored.getPathId()); + assertEquals("cleanupAt mismatch for " + t, + original.getCleanupAt(), restored.getCleanupAt()); + } + } + + // ------------------------------------------------------------------------- + // convertFiles / revertFiles + // ------------------------------------------------------------------------- + + @Test + public void 
convertFiles_handlesEmptyList() + { + List result = File.convertFiles(Collections.emptyList()); + assertNotNull(result); + assertTrue(result.isEmpty()); + } + + @Test(expected = NullPointerException.class) + public void convertFiles_rejectsNullInput() + { + File.convertFiles(null); + } + + @Test + public void convertFiles_thenRevertFiles_isLossless() + { + MetadataProto.File p1 = MetadataProto.File.newBuilder() + .setId(10L).setName("a.pxl") + .setTypeValue(File.Type.REGULAR.getNumber()) + .setNumRowGroup(1).setMinRowId(0L).setMaxRowId(63L).setPathId(1L) + .build(); + MetadataProto.File p2 = MetadataProto.File.newBuilder() + .setId(11L).setName("b.pxl") + .setTypeValue(File.Type.RETIRED.getNumber()) + .setNumRowGroup(2).setMinRowId(64L).setMaxRowId(127L).setPathId(1L) + .setCleanupAt(1_700_000_000_000L) + .build(); + + List domain = File.convertFiles(Arrays.asList(p1, p2)); + assertEquals(2, domain.size()); + assertSame(File.Type.REGULAR, domain.get(0).getType()); + assertNull(domain.get(0).getCleanupAt()); + assertSame(File.Type.RETIRED, domain.get(1).getType()); + assertEquals(Long.valueOf(1_700_000_000_000L), domain.get(1).getCleanupAt()); + + List back = File.revertFiles(domain); + assertEquals(2, back.size()); + assertEquals(p1, back.get(0)); + assertEquals(p2, back.get(1)); + } + + @Test(expected = NullPointerException.class) + public void revertFiles_rejectsNullInput() + { + File.revertFiles(null); + } + + // ------------------------------------------------------------------------- + // helpers + // ------------------------------------------------------------------------- + + private static File makeFile(long id, String name, File.Type type, + int numRowGroup, long minRowId, long maxRowId, + long pathId, Long cleanupAt) + { + File f = new File(); + f.setId(id); + f.setName(name); + f.setType(type); + f.setNumRowGroup(numRowGroup); + f.setMinRowId(minRowId); + f.setMaxRowId(maxRowId); + f.setPathId(pathId); + f.setCleanupAt(cleanupAt); + return f; + } +} diff 
--git a/pixels-common/src/test/java/io/pixelsdb/pixels/common/utils/TestPixelsFileNameUtils.java b/pixels-common/src/test/java/io/pixelsdb/pixels/common/utils/TestPixelsFileNameUtils.java new file mode 100644 index 0000000000..965b48f5d6 --- /dev/null +++ b/pixels-common/src/test/java/io/pixelsdb/pixels/common/utils/TestPixelsFileNameUtils.java @@ -0,0 +1,174 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.common.utils; + +import org.junit.Test; + +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.util.OptionalLong; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * Tests timestamp extraction from {@code .pxl} file names. 
+ */ +public class TestPixelsFileNameUtils +{ + private static final String PXL_FILE_TIMESTAMP_ZONE_KEY = "pxl.file.timestamp.zone"; + private static final String DEFAULT_PXL_FILE_TIMESTAMP_ZONE = "UTC"; + + @Test + public void extractCreateTimeMillis_decodesEmbeddedTimestampUsingConfiguredDefaultZone() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + String name = "host_20260514071200_0_3_ordered.pxl"; + long expected = LocalDateTime.of(2026, 5, 14, 7, 12, 0) + .toInstant(ZoneOffset.UTC).toEpochMilli(); + + OptionalLong actual = PixelsFileNameUtils.extractCreateTimeMillis(name); + assertTrue("well-formed file name must decode", actual.isPresent()); + assertEquals(expected, actual.getAsLong()); + } + + @Test + public void extractCreateTimeMillis_honorsConfiguredTimestampZone() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, "Asia/Shanghai"); + try + { + String name = "host_20260514071200_0_3_ordered.pxl"; + long expected = LocalDateTime.of(2026, 5, 14, 7, 12, 0) + .atZone(ZoneId.of("Asia/Shanghai")).toInstant().toEpochMilli(); + + OptionalLong actual = PixelsFileNameUtils.extractCreateTimeMillis(name); + assertTrue("well-formed file name must decode", actual.isPresent()); + assertEquals(expected, actual.getAsLong()); + } + finally + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + } + } + + @Test + public void extractCreateTimeMillis_roundTripsThroughDateUtilGetCurTimeWithConfiguredZone() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, "Asia/Shanghai"); + try + { + long before = System.currentTimeMillis(); + String name = "host_" + DateUtil.getCurTime() + "_3_ordered.pxl"; + long after = System.currentTimeMillis(); + + OptionalLong decoded = PixelsFileNameUtils.extractCreateTimeMillis(name); + assertTrue("DateUtil-generated filename must decode", decoded.isPresent()); + + long beforeSec = (before / 
1000L) * 1000L; + long afterSec = ((after / 1000L) + 1L) * 1000L; + assertTrue("decoded createTime " + decoded.getAsLong() + + " out of [" + beforeSec + ", " + afterSec + "]", + decoded.getAsLong() >= beforeSec && decoded.getAsLong() <= afterSec); + } + finally + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + } + } + + @Test + public void extractCreateTimeMillis_handlesAbsolutePathPrefix() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + String path = "/data/p/host_20200101000000_42_-1_single.pxl"; + long expected = LocalDateTime.of(2020, 1, 1, 0, 0, 0) + .toInstant(ZoneOffset.UTC).toEpochMilli(); + + OptionalLong actual = PixelsFileNameUtils.extractCreateTimeMillis(path); + assertTrue(actual.isPresent()); + assertEquals(expected, actual.getAsLong()); + } + + @Test + public void extractCreateTimeMillis_handlesHostnameWithUnderscores() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + // Host names may contain underscores before the timestamp. + String name = "retina_node_3_20260514071200_7_2_compact.pxl"; + long expected = LocalDateTime.of(2026, 5, 14, 7, 12, 0) + .toInstant(ZoneOffset.UTC).toEpochMilli(); + + OptionalLong actual = PixelsFileNameUtils.extractCreateTimeMillis(name); + assertTrue(actual.isPresent()); + assertEquals(expected, actual.getAsLong()); + } + + @Test + public void extractCreateTimeMillis_returnsEmptyOnUnrecognisedFormat() + { + assertFalse(PixelsFileNameUtils.extractCreateTimeMillis(null).isPresent()); + assertFalse(PixelsFileNameUtils.extractCreateTimeMillis("").isPresent()); + assertFalse(PixelsFileNameUtils.extractCreateTimeMillis("random.txt").isPresent()); + // Unknown file type label. + assertFalse(PixelsFileNameUtils.extractCreateTimeMillis( + "host_20260514071200_0_3_unknown.pxl").isPresent()); + // Timestamp must be exactly 14 digits. 
+ assertFalse(PixelsFileNameUtils.extractCreateTimeMillis( + "host_2026051407120_0_3_ordered.pxl").isPresent()); + } + + @Test + public void extractCreateTimeMillis_returnsEmptyOnStructurallyInvalidTimestamp() + { + // Structurally valid name with an invalid timestamp. + OptionalLong actual = PixelsFileNameUtils.extractCreateTimeMillis( + "host_20261314071200_0_3_ordered.pxl"); + assertFalse(actual.isPresent()); + } + + @Test + public void extractCreateTimeMillis_roundTripsThroughDateUtilGetCurTime() + { + ConfigFactory.Instance().addProperty(PXL_FILE_TIMESTAMP_ZONE_KEY, + DEFAULT_PXL_FILE_TIMESTAMP_ZONE); + // DateUtil.getCurTime() should produce a decodable filename timestamp. + long before = System.currentTimeMillis(); + String name = "host_" + DateUtil.getCurTime() + "_3_ordered.pxl"; + long after = System.currentTimeMillis(); + + OptionalLong decoded = PixelsFileNameUtils.extractCreateTimeMillis(name); + assertTrue("DateUtil-generated filename must decode", decoded.isPresent()); + + // Decoded timestamp has second-level precision. + long beforeSec = (before / 1000L) * 1000L; + long afterSec = ((after / 1000L) + 1L) * 1000L; + assertTrue("decoded createTime " + decoded.getAsLong() + + " out of [" + beforeSec + ", " + afterSec + "]", + decoded.getAsLong() >= beforeSec && decoded.getAsLong() <= afterSec); + } +} diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriter.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriter.java index c8c394587b..bf21f73dc9 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriter.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriter.java @@ -63,4 +63,22 @@ public interface PixelsWriter extends Closeable int getNumWriteRequests(); long getCompletedBytes(); + + /** + * Release writer resources without writing the file tail. Caller is + * responsible for deleting any partial bytes the underlying physical + * writer may have flushed before abort. + * + *

 <p>Aborting after one or more row batches have been added is not + * supported and results in undefined file contents; aborting an + * already-closed writer is a no-op. + * + *

The default implementation falls back to {@link #close()} for + * writers that do not distinguish abort from normal close (e.g. test + * fakes or stream writers that never produce a file tail). + */ + default void abort() throws IOException + { + close(); + } } diff --git a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterImpl.java b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterImpl.java index 9b86e55906..02ae3a8547 100644 --- a/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterImpl.java +++ b/pixels-core/src/main/java/io/pixelsdb/pixels/core/PixelsWriterImpl.java @@ -588,6 +588,68 @@ public void close() } } + /** + * Abort the writer: release underlying resources without writing the + * file tail. Caller must ensure no row batches have been added; calling + * abort after data has been written results in undefined file contents + * and the caller should also delete any partial bytes the physical + * writer may have flushed. + * + *

Errors closing component writers are logged and the first failure + * is rethrown after all components have been attempted, so resources are + * released as eagerly as possible. + */ + @Override + public void abort() throws IOException + { + IOException firstFailure = null; + try + { + physicalWriter.close(); + } + catch (IOException e) + { + firstFailure = e; + LOGGER.warn("PixelsWriterImpl.abort: physicalWriter close failed", e); + } + for (ColumnWriter cw : columnWriters) + { + try + { + cw.close(); + } + catch (IOException e) + { + if (firstFailure == null) + { + firstFailure = e; + } + LOGGER.warn("PixelsWriterImpl.abort: columnWriter close failed", e); + } + } + if (hasHiddenColumn) + { + try + { + hiddenColumnWriter.close(); + } + catch (IOException e) + { + if (firstFailure == null) + { + firstFailure = e; + } + LOGGER.warn("PixelsWriterImpl.abort: hiddenColumnWriter close failed", e); + } + } + columnWriterService.shutdown(); + columnWriterService.shutdownNow(); + if (firstFailure != null) + { + throw firstFailure; + } + } + private void writeRowGroup() throws IOException { int rowGroupDataLength = 0; diff --git a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java index e4ca0e3040..874b23d8db 100644 --- a/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java +++ b/pixels-core/src/test/java/io/pixelsdb/pixels/core/reader/TestVisibilityCheckpointCache.java @@ -46,7 +46,7 @@ public class TestVisibilityCheckpointCache @Before public void setUp() throws IOException { - testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.offload.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) @@ -86,7 +86,7 @@ private void 
createDummyCheckpoint(String path, int numFiles, int rgsPerFile, lo public void testCacheLoading() throws Exception { long timestamp = 1000L; - String checkpointPath = resolve(testCheckpointDir, "vis_gc_tencent_100.bin"); + String checkpointPath = resolve(testCheckpointDir, "vis_offload_tencent_100.bin"); long[] dummyBitmap = new long[]{0x1L, 0x2L}; createDummyCheckpoint(checkpointPath, 1, 1, dummyBitmap); diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/cache/CacheCoordinator.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/cache/CacheCoordinator.java index e355e9021e..4b311dd741 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/cache/CacheCoordinator.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/cache/CacheCoordinator.java @@ -297,7 +297,7 @@ private List select(Layout layout) throws MetadataException // Issue #723: files are managed in metadata, do not get file paths from storage. for (Path compactPath : compactPaths) { - this.metadataService.getFiles(compactPath.getId()).forEach( + this.metadataService.getRegularFiles(compactPath.getId()).forEach( file -> filePaths.add(File.getFilePath(compactPath, file))); } return filePaths; diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/heartbeat/HeartbeatWorker.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/heartbeat/HeartbeatWorker.java index 78201b6260..ac358c05f1 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/heartbeat/HeartbeatWorker.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/heartbeat/HeartbeatWorker.java @@ -42,11 +42,12 @@ public class HeartbeatWorker implements Server { private static final Logger logger = LogManager.getLogger(HeartbeatWorker.class); - private static final AtomicInteger currentStatus = new AtomicInteger(NodeStatus.READY.StatusCode); + private static final AtomicInteger currentStatus = new AtomicInteger(NodeStatus.INIT.StatusCode); private final HeartbeatConfig 
heartbeatConfig = new HeartbeatConfig(); private final ScheduledExecutorService scheduledExecutor = Executors.newSingleThreadScheduledExecutor(); private final NodeProto.NodeRole role; private String hostName; + private String workerKey; private WorkerRegister workerRegister; private boolean initializeSuccess = false; private CountDownLatch runningLatch; @@ -59,6 +60,15 @@ public HeartbeatWorker(NodeProto.NodeRole role) initialize(); } + public static void setCurrentStatus(NodeStatus status) + { + if (status == null) + { + throw new IllegalArgumentException("status is null"); + } + currentStatus.set(status.StatusCode); + } + /** * Initialize heartbeat worker: *

@@ -92,13 +102,16 @@ private void initialize() default: throw new IllegalStateException("Unknown heartbeat role: " + role); } + this.workerKey = key; + currentStatus.set(role == NodeProto.NodeRole.RETINA + ? NodeStatus.INIT.StatusCode + : NodeStatus.READY.StatusCode); EtcdUtil.Instance().putKeyValueWithLeaseId(key, String.valueOf(currentStatus.get()), leaseId); // start a scheduled thread to update node status periodically this.workerRegister = new WorkerRegister(key, leaseClient, leaseId); scheduledExecutor.scheduleAtFixedRate(workerRegister, 0, heartbeatConfig.getNodeHeartbeatPeriod(), TimeUnit.SECONDS); initializeSuccess = true; - currentStatus.set(NodeStatus.READY.StatusCode); logger.info("Heartbeat worker on {} is initialized", hostName); } catch (Exception e) { @@ -126,10 +139,16 @@ public void shutdown() switch (role) { case WORKER: - EtcdUtil.Instance().deleteByPrefix(Constants.HEARTBEAT_WORKER_LITERAL); + if (workerKey != null) + { + EtcdUtil.Instance().delete(workerKey); + } break; case RETINA: - EtcdUtil.Instance().deleteByPrefix(Constants.HEARTBEAT_RETINA_LITERAL); + if (workerKey != null) + { + EtcdUtil.Instance().delete(workerKey); + } break; default: throw new IllegalStateException("Unknown heartbeat role: " + role); diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java index 5b65dd637e..1e9957e73e 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/MetadataServiceImpl.java @@ -1351,14 +1351,18 @@ public void addFiles(MetadataProto.AddFilesRequest request, } @Override - public void getFiles(MetadataProto.GetFilesRequest request, - StreamObserver responseObserver) + public void getFilesByType(MetadataProto.GetFilesByTypeRequest request, + StreamObserver responseObserver) { + // pathId is optional; 
absent means scanning across paths. MetadataProto.ResponseHeader.Builder headerBuilder = MetadataProto.ResponseHeader.newBuilder() .setToken(request.getHeader().getToken()); - MetadataProto.GetFilesResponse.Builder responseBuilder = MetadataProto.GetFilesResponse.newBuilder(); - List files = this.fileDao.getAllByPathId(request.getPathId()); + MetadataProto.GetFilesByTypeResponse.Builder responseBuilder = + MetadataProto.GetFilesByTypeResponse.newBuilder(); + Long pathId = request.hasPathId() ? request.getPathId() : null; + List files = + this.fileDao.getFilesByType(pathId, request.getFileTypesList()); if (files != null) { headerBuilder.setErrorCode(SUCCESS).setErrorMsg(""); @@ -1366,7 +1370,7 @@ public void getFiles(MetadataProto.GetFilesRequest request, } else { - headerBuilder.setErrorCode(METADATA_GET_FILES_FAILED).setErrorMsg("get files by path id failed"); + headerBuilder.setErrorCode(METADATA_GET_FILES_FAILED).setErrorMsg("get files by type failed"); responseBuilder.setHeader(headerBuilder); } @@ -1522,7 +1526,9 @@ public void atomicSwapFiles(MetadataProto.AtomicSwapFilesRequest request, MetadataProto.ResponseHeader.Builder headerBuilder = MetadataProto.ResponseHeader.newBuilder() .setToken(request.getHeader().getToken()); - if (this.fileDao.atomicSwapFiles(request.getNewFileId(), request.getOldFileIdsList())) + if (request.hasCleanupAt() && + this.fileDao.atomicSwapFiles(request.getNewFileId(), request.getOldFileIdsList(), + request.getCleanupAt())) { headerBuilder.setErrorCode(SUCCESS).setErrorMsg(""); } diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java index 73b921008b..a3d9920355 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/FileDao.java @@ -38,7 +38,13 @@ public List getAll() throw new 
UnsupportedOperationException("getAll is not supported."); } - public abstract List getAllByPathId(long pathId); + /** + * Return files of the requested types. + * + * @param pathId path scope, or {@code null} for all paths + * @param types file types to include; null or empty returns no files + */ + public abstract List getFilesByType(Long pathId, List types); public abstract MetadataProto.File getByPathIdAndFileName(long pathId, String fileName); @@ -75,10 +81,11 @@ public boolean save (MetadataProto.File file) abstract public boolean deleteByIds (List ids); /** - * Atomically promote a TEMPORARY file to REGULAR and delete the old files in a single transaction. - * @param newFileId the id of the new TEMPORARY file to promote - * @param oldFileIds the ids of old files to delete + * Atomically promote a temporary GC file to REGULAR and retire old files in a single transaction. + * @param newFileId the id of the new temporary GC file to promote + * @param oldFileIds the ids of old regular files to retire + * @param cleanupAt the cleanup deadline to write on retired old files * @return true if the transaction committed successfully */ - abstract public boolean atomicSwapFiles(long newFileId, List oldFileIds); + abstract public boolean atomicSwapFiles(long newFileId, List oldFileIds, long cleanupAt); } diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java index 1af30d564b..f205de88e9 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/metadata/dao/impl/RdbFileDao.java @@ -27,8 +27,9 @@ import java.sql.*; import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashSet; import java.util.List; -import java.util.stream.Collectors; /** * @author hank @@ -42,22 +43,64 @@ public RdbFileDao() { } private 
static final MetaDBUtil db = MetaDBUtil.Instance(); + private static MetadataProto.File buildFile(ResultSet rs) throws SQLException + { + MetadataProto.File.Builder builder = MetadataProto.File.newBuilder() + .setId(rs.getLong("FILE_ID")) + .setName(rs.getString("FILE_NAME")) + .setTypeValue(rs.getInt("FILE_TYPE")) + .setNumRowGroup(rs.getInt("FILE_NUM_RG")) + .setMinRowId(rs.getLong("FILE_MIN_ROW_ID")) + .setMaxRowId(rs.getLong("FILE_MAX_ROW_ID")) + .setPathId(rs.getLong("PATHS_PATH_ID")); + long cleanupAt = rs.getLong("FILE_CLEANUP_AT"); + if (!rs.wasNull()) + { + builder.setCleanupAt(cleanupAt); + } + return builder.build(); + } + + /** + * Bind {@code FILE_CLEANUP_AT} for a file row. + * + *

{@code RETIRED} files must carry a cleanup deadline; other types must not. + */ + private static void setCleanupAt(PreparedStatement pst, int index, MetadataProto.File file) throws SQLException + { + if (file.getTypeValue() == MetadataProto.File.Type.RETIRED.getNumber()) + { + if (!file.hasCleanupAt()) + { + throw new SQLException("FILES row invariant violated: RETIRED file '" + + file.getName() + "' (id=" + file.getId() + + ") must carry a non-null FILE_CLEANUP_AT"); + } + pst.setLong(index, file.getCleanupAt()); + } + else + { + if (file.hasCleanupAt()) + { + throw new SQLException("FILES row invariant violated: non-RETIRED file '" + + file.getName() + "' (id=" + file.getId() + + ", type=" + file.getType() + + ") must NOT carry FILE_CLEANUP_AT (got " + file.getCleanupAt() + ")"); + } + pst.setNull(index, Types.BIGINT); + } + } + @Override public MetadataProto.File getById(long id) { Connection conn = db.getConnection(); - try (Statement st = conn.createStatement()) + try (Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery("SELECT * FROM FILES WHERE FILE_ID=" + id)) { - ResultSet rs = st.executeQuery("SELECT * FROM FILES WHERE FILE_ID=" + id); if (rs.next()) { - return MetadataProto.File.newBuilder().setId(id) - .setName(rs.getString("FILE_NAME")) - .setTypeValue(rs.getInt("FILE_TYPE")) - .setNumRowGroup(rs.getInt("FILE_NUM_RG")) - .setMinRowId(rs.getLong("FILE_MIN_ROW_ID")) - .setMaxRowId(rs.getLong("FILE_MAX_ROW_ID")) - .setPathId(rs.getLong("PATHS_PATH_ID")).build(); + return buildFile(rs); } } catch (SQLException e) { @@ -68,30 +111,59 @@ public MetadataProto.File getById(long id) } @Override - public List getAllByPathId(long pathId) + public List getFilesByType(Long pathId, List types) { + if (types == null || types.isEmpty()) + { + return Collections.emptyList(); + } + // De-duplicate while preserving insertion order so the SQL bind order is stable. 
+ LinkedHashSet typeNumbers = new LinkedHashSet<>(); + for (MetadataProto.File.Type type : types) + { + if (type != null) + { + typeNumbers.add(type.getNumber()); + } + } + if (typeNumbers.isEmpty()) + { + return Collections.emptyList(); + } + + StringBuilder sql = new StringBuilder("SELECT * FROM FILES WHERE "); + if (pathId != null) + { + sql.append("PATHS_PATH_ID = ? AND "); + } + sql.append("FILE_TYPE IN (") + .append(String.join(",", Collections.nCopies(typeNumbers.size(), "?"))) + .append(") ORDER BY FILE_ID"); + Connection conn = db.getConnection(); - try (Statement st = conn.createStatement()) + try (PreparedStatement pst = conn.prepareStatement(sql.toString())) { - // Issue #932: Add empty file markers and ignore empty files when retrieving file lists. - ResultSet rs = st.executeQuery("SELECT * FROM FILES WHERE FILE_TYPE <> 0 AND PATHS_PATH_ID=" + pathId); - List files = new ArrayList<>(); - while (rs.next()) + int index = 1; + if (pathId != null) + { + pst.setLong(index++, pathId); + } + for (Integer number : typeNumbers) + { + pst.setInt(index++, number); + } + try (ResultSet rs = pst.executeQuery()) { - MetadataProto.File.Builder builder = MetadataProto.File.newBuilder() - .setId(rs.getLong("FILE_ID")) - .setTypeValue(rs.getInt("FILE_TYPE")) - .setName(rs.getString("FILE_NAME")) - .setNumRowGroup(rs.getInt("FILE_NUM_RG")) - .setMinRowId(rs.getLong("FILE_MIN_ROW_ID")) - .setMaxRowId(rs.getLong("FILE_MAX_ROW_ID")) - .setPathId(rs.getLong("PATHS_PATH_ID")); - files.add(builder.build()); + List files = new ArrayList<>(); + while (rs.next()) + { + files.add(buildFile(rs)); + } + return files; } - return files; } catch (SQLException e) { - log.error("getAllByPathId in RdbFileDao", e); + log.error("getFilesByType in RdbFileDao", e); } return null; @@ -101,22 +173,17 @@ public List getAllByPathId(long pathId) public MetadataProto.File getByPathIdAndFileName(long pathId, String fileName) { Connection conn = db.getConnection(); - String sql = "SELECT FILE_ID, 
FILE_TYPE, FILE_NUM_RG, FILE_MIN_ROW_ID, FILE_MAX_ROW_ID FROM FILES WHERE PATHS_PATH_ID=? AND FILE_NAME=?"; + String sql = "SELECT * FROM FILES WHERE PATHS_PATH_ID=? AND FILE_NAME=?"; try (PreparedStatement st = conn.prepareStatement(sql)) { st.setLong(1, pathId); st.setString(2, fileName); - ResultSet rs = st.executeQuery(); - if (rs.next()) + try (ResultSet rs = st.executeQuery()) { - return MetadataProto.File.newBuilder() - .setId(rs.getLong("FILE_ID")) - .setName(fileName) - .setTypeValue(rs.getInt("FILE_TYPE")) - .setNumRowGroup(rs.getInt("FILE_NUM_RG")) - .setMinRowId(rs.getLong("FILE_MIN_ROW_ID")) - .setMaxRowId(rs.getLong("FILE_MAX_ROW_ID")) - .setPathId(pathId).build(); + if (rs.next()) + { + return buildFile(rs); + } } } catch (SQLException e) { @@ -133,10 +200,12 @@ public boolean exists(MetadataProto.File file) try (Statement st = conn.createStatement()) { String sql = "SELECT 1 FROM FILES WHERE FILE_ID=" + file.getId(); - ResultSet rs = st.executeQuery(sql); - if (rs.next()) + try (ResultSet rs = st.executeQuery(sql)) { - return true; + if (rs.next()) + { + return true; + } } } catch (SQLException e) { @@ -156,7 +225,8 @@ public long insert(MetadataProto.File file) "`FILE_NUM_RG`," + "`FILE_MIN_ROW_ID`," + "`FILE_MAX_ROW_ID`," + - "`PATHS_PATH_ID`) VALUES (?,?,?,?,?,?)"; + "`PATHS_PATH_ID`," + + "`FILE_CLEANUP_AT`) VALUES (?,?,?,?,?,?,?)"; try (PreparedStatement pst = conn.prepareStatement(sql)) { pst.setString(1, file.getName()); @@ -165,16 +235,19 @@ public long insert(MetadataProto.File file) pst.setLong(4, file.getMinRowId()); pst.setLong(5, file.getMaxRowId()); pst.setLong(6, file.getPathId()); + setCleanupAt(pst, 7, file); if (pst.executeUpdate() == 1) { - ResultSet rs = pst.executeQuery("SELECT LAST_INSERT_ID()"); - if (rs.next()) - { - return rs.getLong(1); - } - else + try (ResultSet rs = pst.executeQuery("SELECT LAST_INSERT_ID()")) { - return -1; + if (rs.next()) + { + return rs.getLong(1); + } + else + { + return -1; + } } } else @@ -199,7 
+272,8 @@ public boolean insertBatch(List files) "`FILE_NUM_RG`," + "`FILE_MIN_ROW_ID`," + "`FILE_MAX_ROW_ID`," + - "`PATHS_PATH_ID`) VALUES (?,?,?,?,?,?)"; + "`PATHS_PATH_ID`," + + "`FILE_CLEANUP_AT`) VALUES (?,?,?,?,?,?,?)"; try (PreparedStatement pst = conn.prepareStatement(sql)) { for (MetadataProto.File file : files) @@ -210,6 +284,7 @@ public boolean insertBatch(List files) pst.setLong(4, file.getMinRowId()); pst.setLong(5, file.getMaxRowId()); pst.setLong(6, file.getPathId()); + setCleanupAt(pst, 7, file); pst.addBatch(); } pst.executeBatch(); @@ -230,7 +305,8 @@ public boolean update(MetadataProto.File file) "`FILE_TYPE` = ?," + "`FILE_NUM_RG` = ?," + "`FILE_MIN_ROW_ID` = ?," + - "`FILE_MAX_ROW_ID` = ?\n" + + "`FILE_MAX_ROW_ID` = ?," + + "`FILE_CLEANUP_AT` = ?\n" + "WHERE `FILE_ID` = ?"; try (PreparedStatement pst = conn.prepareStatement(sql)) { @@ -239,7 +315,8 @@ public boolean update(MetadataProto.File file) pst.setInt(3, file.getNumRowGroup()); pst.setLong(4, file.getMinRowId()); pst.setLong(5, file.getMaxRowId()); - pst.setLong(6, file.getId()); + setCleanupAt(pst, 6, file); + pst.setLong(7, file.getId()); return pst.executeUpdate() == 1; } catch (SQLException e) { @@ -272,28 +349,33 @@ public boolean deleteByIds(List ids) } @Override - public boolean atomicSwapFiles(long newFileId, List oldFileIds) + public boolean atomicSwapFiles(long newFileId, List oldFileIds, long cleanupAt) { Connection conn = db.getConnection(); try { conn.setAutoCommit(false); try (PreparedStatement pst = conn.prepareStatement( - "UPDATE FILES SET FILE_TYPE=? 
WHERE FILE_ID=?")) + "UPDATE FILES SET FILE_TYPE=?, FILE_CLEANUP_AT=NULL WHERE FILE_ID=?")) { pst.setInt(1, MetadataProto.File.Type.REGULAR.getNumber()); pst.setLong(2, newFileId); pst.executeUpdate(); } - String inClause = oldFileIds.stream().map(id -> "?").collect(Collectors.joining(",")); - try (PreparedStatement pst = conn.prepareStatement( - "DELETE FROM FILES WHERE FILE_ID IN (" + inClause + ")")) + if (oldFileIds != null && !oldFileIds.isEmpty()) { - for (int i = 0; i < oldFileIds.size(); i++) + try (PreparedStatement pst = conn.prepareStatement( + "UPDATE FILES SET FILE_TYPE=?, FILE_CLEANUP_AT=? WHERE FILE_ID=?")) { - pst.setLong(i + 1, oldFileIds.get(i)); + for (Long oldFileId : oldFileIds) + { + pst.setInt(1, MetadataProto.File.Type.RETIRED.getNumber()); + pst.setLong(2, cleanupAt); + pst.setLong(3, oldFileId); + pst.addBatch(); + } + pst.executeBatch(); } - pst.executeUpdate(); } conn.commit(); return true; diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/retina/RetinaServerImpl.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/retina/RetinaServerImpl.java index 09218beef5..1bc27d61e3 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/retina/RetinaServerImpl.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/retina/RetinaServerImpl.java @@ -24,37 +24,43 @@ import com.google.protobuf.ByteString; import com.sun.management.OperatingSystemMXBean; import io.grpc.stub.StreamObserver; +import io.pixelsdb.pixels.common.error.ErrorCode; import io.pixelsdb.pixels.common.exception.IndexException; +import io.pixelsdb.pixels.common.exception.MetadataException; import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.index.IndexOption; +import io.pixelsdb.pixels.common.index.ResolvedPrimary; import io.pixelsdb.pixels.common.index.service.IndexService; import io.pixelsdb.pixels.common.index.service.IndexServiceProvider; import 
io.pixelsdb.pixels.common.metadata.MetadataService; import io.pixelsdb.pixels.common.metadata.domain.*; -import io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.utils.ConfigFactory; import io.pixelsdb.pixels.common.utils.IndexUtils; +import io.pixelsdb.pixels.daemon.heartbeat.HeartbeatWorker; +import io.pixelsdb.pixels.daemon.heartbeat.NodeStatus; import io.pixelsdb.pixels.index.IndexProto; import io.pixelsdb.pixels.retina.RGVisibility; +import io.pixelsdb.pixels.retina.RecoveryCheckpoint; +import io.pixelsdb.pixels.retina.RecoveryCheckpoint.Body; +import io.pixelsdb.pixels.retina.RecoveryCheckpoint.LoadedCheckpoint; +import io.pixelsdb.pixels.retina.RecoveryCheckpoint.PendingSegmentEntry; +import io.pixelsdb.pixels.retina.RecoveryCheckpoint.VisibilityEntry; import io.pixelsdb.pixels.retina.RetinaProto; +import io.pixelsdb.pixels.retina.RetinaProto.RetinaState; import io.pixelsdb.pixels.retina.RetinaResourceManager; import io.pixelsdb.pixels.retina.RetinaWorkerServiceGrpc; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - +import io.pixelsdb.pixels.retina.StorageGcWal; import java.lang.management.ManagementFactory; import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Lock; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; -import static com.google.common.base.Preconditions.checkArgument; import static java.util.Objects.requireNonNull; /** @@ -68,6 +74,7 @@ public class RetinaServerImpl extends RetinaWorkerServiceGrpc.RetinaWorkerServic private final IndexService 
indexService; private final RetinaResourceManager retinaResourceManager; private final Striped updateLocks = Striped.lock(1024); + private volatile RetinaStatus status; private IndexOption[] indexOptionPool; /** @@ -75,9 +82,18 @@ public class RetinaServerImpl extends RetinaWorkerServiceGrpc.RetinaWorkerServic */ public RetinaServerImpl() { - this.metadataService = MetadataService.Instance(); - this.indexService = IndexServiceProvider.getService(IndexServiceProvider.ServiceMode.local); - this.retinaResourceManager = RetinaResourceManager.Instance(); + this(MetadataService.Instance(), + IndexServiceProvider.getService(IndexServiceProvider.ServiceMode.local), + RetinaResourceManager.Instance()); + } + + RetinaServerImpl(MetadataService metadataService, IndexService indexService, + RetinaResourceManager retinaResourceManager) + { + this.metadataService = requireNonNull(metadataService, "metadataService is null"); + this.indexService = requireNonNull(indexService, "indexService is null"); + this.retinaResourceManager = requireNonNull(retinaResourceManager, "retinaResourceManager is null"); + int totalBuckets = Integer.parseInt(ConfigFactory.Instance().getProperty("index.bucket.num")); this.indexOptionPool = new IndexOption[totalBuckets]; for (int i = 0; i < totalBuckets; i++) @@ -86,118 +102,351 @@ public RetinaServerImpl() this.indexOptionPool[i].setVNodeId(i); } - startRetinaMetricsLogThread(); try { - logger.info("Pre-loading checkpoints..."); - this.retinaResourceManager.recoverCheckpoints(); + RecoveryContext recoveryContext = prepareRecoveryContext(); + RecoveryResult recoveryResult = recoverRetinaState(recoveryContext); + initializeRecoveredResources(); + publishStartupLifecycle(recoveryContext, recoveryResult); + startRetinaMetricsLogThread(); + boolean ready = this.status.getState() == RetinaState.READY; + logger.info(ready ? 
"Retina service is ready" : "Retina service is recovering"); + } + catch (Exception e) + { + RetinaStatus base = this.status; + RetinaStatus.Builder builder = base == null + ? RetinaStatus.newBuilder() + : base.toBuilder(); + this.status = builder + .setState(RetinaState.FAILED) + .build(); + this.retinaResourceManager.setRecovering(false); + HeartbeatWorker.setCurrentStatus(NodeStatus.INIT); + logger.error("Error while initializing RetinaServerImpl", e); + throw new IllegalStateException("Failed to initialize RetinaServerImpl", e); + } + } + + private RecoveryContext prepareRecoveryContext() throws RetinaException + { + String recoveryEpoch = UUID.randomUUID().toString(); + + RecoveryCheckpoint recoveryCheckpoint = RecoveryCheckpoint.createFromConfig(); + int virtualNodesPerNode = recoveryCheckpoint.getVirtualNodesPerNode(); + if (virtualNodesPerNode <= 0) + { + throw new RetinaException("virtualNodesPerNode must be positive, got " + virtualNodesPerNode); + } + // Config only provides vnode count; expected vnode ids are derived as [0, N). 
+ Set expectedVnodes = new HashSet<>(virtualNodesPerNode); + for (int i = 0; i < virtualNodesPerNode; i++) + { + expectedVnodes.add(i); + } + + StorageGcWal storageGcWal = retinaResourceManager.getStorageGcWal(); + StorageGcWal.RecoveryHandler storageGcWalRecoveryHandler = new StorageGcWal.RecoveryHandler( + storageGcWal, metadataService, indexService); - List schemas = this.metadataService.getSchemas(); - for (Schema schema : schemas) + return new RecoveryContext( + recoveryEpoch, + storageGcWal, + storageGcWalRecoveryHandler, + recoveryCheckpoint.load(), + expectedVnodes); + } + + private RecoveryResult recoverRetinaState(RecoveryContext context) throws RetinaException + { + LoadedCheckpoint loaded = context.loadedCheckpoint; + if (loaded == null) + { + context.storageGcWalRecoveryHandler.recover(Collections.emptySet()); + try { - List tables = this.metadataService.getTables(schema.getName()); - for (Table table : tables) + if (!metadataService.getFilesByType(EnumSet.of(File.Type.REGULAR)).isEmpty()) { - List layouts = this.metadataService.getLayouts(schema.getName(), table.getName()); - List files = new LinkedList<>(); - for (Layout layout : layouts) - { - if (layout.isReadable()) - { - /* - * Issue #946: always add visibility to all files - */ - // add visibility for ordered files - List orderedPaths = layout.getOrderedPaths(); - validateOrderedOrCompactPaths(orderedPaths); - List orderedFiles = this.metadataService.getFiles(orderedPaths.get(0).getId()); - files.addAll(orderedFiles.stream() - .map(file -> orderedPaths.get(0).getUri() + "/" + file.getName()) - .collect(Collectors.toList())); - - // add visibility for compact files - List compactPaths = layout.getCompactPaths(); - validateOrderedOrCompactPaths(compactPaths); - List compactFiles = this.metadataService.getFiles(compactPaths.get(0).getId()); - files.addAll(compactFiles.stream() - .map(file -> compactPaths.get(0).getUri() + "/" + file.getName()) - .collect(Collectors.toList())); - } - } + throw new 
RetinaException("Recovery aborted: no checkpoint body found but catalog has REGULAR files"); + } + } + catch (MetadataException e) + { + throw new RetinaException("Recovery catalog probe failed", e); + } + return new RecoveryResult(true, computeReplay(0L, Collections.emptyList(), context.expectedVnodes)); + } - int threadNum = Integer.parseInt - (ConfigFactory.Instance().getProperty("retina.service.init.threads")); - ExecutorService executorService = Executors.newFixedThreadPool(threadNum); - AtomicBoolean success = new AtomicBoolean(true); - AtomicReference e = new AtomicReference<>(); - try - { - for (String filePath : files) - { - executorService.submit(() -> - { - try - { - this.retinaResourceManager.addVisibility(filePath); - } - catch (Exception ex) - { - success.set(false); - e.set(ex); - } - }); - } - } - finally - { - executorService.shutdown(); - } + Body body = loaded.body; + logger.info("Recovery: applying checkpoint body={} (checkpointAppliedTs={})", + loaded.bodyPath, body.getCheckpointAppliedTs()); - if (success.get()) - { - executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); - } + int warnCount = 0; + Map catalogByFileId = new HashMap<>(); + Set droppedFileIds = new HashSet<>(); + List validRgEntries = new ArrayList<>(body.getRgEntries().size()); + Set recoverableFileIds = new HashSet<>(); - if (!success.get()) - { - throw new RetinaException("Can't add visibility", e.get()); - } + for (VisibilityEntry ve : body.getRgEntries()) + { + long fileId = ve.getFileId(); + if (droppedFileIds.contains(fileId)) + { + warnCount++; + continue; + } + File catalog = catalogByFileId.get(fileId); + if (catalog == null) + { + try + { + catalog = metadataService.getFileById(fileId); + } + catch (MetadataException e) + { + throw new RetinaException("Recovery cleanser: catalog lookup failed for fileId=" + fileId, e); + } + if (catalog == null) + { + logger.warn("Recovery cleanser: dropping VisibilityEntry fileId={}, rgId={} because fileId is not in 
catalog", + fileId, ve.getRgId()); + droppedFileIds.add(fileId); + warnCount++; + continue; + } + if (catalog.getType() != File.Type.REGULAR) + { + logger.warn("Recovery cleanser: dropping VisibilityEntry fileId={}, rgId={} because catalog type is {}", + fileId, ve.getRgId(), catalog.getType()); + droppedFileIds.add(fileId); + warnCount++; + continue; + } + if (catalog.getMinRowId() < 0 || catalog.getMaxRowId() < catalog.getMinRowId()) + { + logger.warn("Recovery cleanser: dropping VisibilityEntry fileId={}, rgId={} because catalog hull is invalid: min={}, max={}", + fileId, ve.getRgId(), catalog.getMinRowId(), catalog.getMaxRowId()); + droppedFileIds.add(fileId); + warnCount++; + continue; + } + catalogByFileId.put(fileId, catalog); + } + if (ve.getRgId() < 0 || ve.getRecordNum() <= 0 || catalog.getNumRowGroup() <= ve.getRgId()) + { + logger.warn("Recovery cleanser: dropping VisibilityEntry fileId={}, rgId={}, recordNum={}, catalogRowGroups={}", + fileId, ve.getRgId(), ve.getRecordNum(), catalog.getNumRowGroup()); + warnCount++; + continue; + } + validRgEntries.add(ve); + recoverableFileIds.add(fileId); + } + + context.storageGcWalRecoveryHandler.recover(recoverableFileIds); + + for (VisibilityEntry ve : validRgEntries) + { + retinaResourceManager.addVisibility(ve.getFileId(), ve.getRgId(), + ve.getRecordNum(), ve.getBaseTimestamp(), ve.getBitmap(), true); + } + + ReplayResult replay = computeReplay( + body.getCheckpointAppliedTs(), + body.getSegmentEntries(), + context.expectedVnodes); + + long cleanupAt = System.currentTimeMillis(); + Set pendingJournalFileIds; + try + { + pendingJournalFileIds = context.storageGcWal.collectPendingFileIds(); + } + catch (RuntimeException e) + { + throw new RetinaException("Recovery retirer: failed to load Storage GC journal tasks", e); + } + + List catalogRegulars; + try + { + catalogRegulars = metadataService.getFilesByType(EnumSet.of(File.Type.REGULAR)); + } + catch (MetadataException e) + { + throw new 
RetinaException("Recovery retirer: catalog-wide REGULAR scan failed", e); + } + + int retiredCount = 0; + int protectedCount = 0; + for (File catalog : catalogRegulars) + { + long fileId = catalog.getId(); + if (recoverableFileIds.contains(fileId)) + { + continue; + } + if (pendingJournalFileIds.contains(fileId)) + { + protectedCount++; + continue; + } + catalog.setType(File.Type.RETIRED); + catalog.setCleanupAt(cleanupAt); + try + { + if (!metadataService.updateFile(catalog)) + { + throw new RetinaException("Recovery retirer: updateFile returned false for fileId=" + fileId); + } + retiredCount++; + } + catch (MetadataException e) + { + throw new RetinaException("Recovery retirer: failed to retire fileId=" + fileId, e); + } + } - this.retinaResourceManager.addWriteBuffer(schema.getName(), table.getName()); + logger.info("Recovery complete: checkpointId={}, checkpointAppliedTs={}, baselineFiles={}, retired={}, protected={}, nodeReplayFromTs={}, warns={}", + loaded.bodyPath, + body.getCheckpointAppliedTs(), + recoverableFileIds.size(), + retiredCount, + protectedCount, + replay.nodeReplayFromTs, + warnCount); + + // Clean up terminal tasks related to this checkpoint to prevent blocking future recoveries + try + { + List terminalTasks = context.storageGcWal.listTerminalTasks(); + if (!terminalTasks.isEmpty()) + { + List tasksToDelete = new ArrayList<>(terminalTasks.size()); + for (StorageGcWal.Task task : terminalTasks) + { + tasksToDelete.add(task.getTaskId()); } + context.storageGcWal.deleteTerminalTasks(tasksToDelete); + logger.info("Cleaned up {} terminal Storage GC WAL tasks after checkpoint ts={}", + tasksToDelete.size(), body.getCheckpointAppliedTs()); } - logger.info("Retina service is ready"); } catch (Exception e) { - logger.error("Error while initializing RetinaServerImpl", e); + logger.warn("Failed to cleanup terminal Storage GC journal tasks: {}", e.getMessage()); } + + return new RecoveryResult(false, replay); } - /** - * Check if the order or compact 
paths from pixels metadata is valid. - * - * @param paths the order or compact paths from pixels metadata. - */ - public static void validateOrderedOrCompactPaths(List paths) throws RetinaException + private void initializeRecoveredResources() throws Exception { - requireNonNull(paths, "paths is null"); - checkArgument(!paths.isEmpty(), "paths must contain at least one valid directory"); - try + this.retinaResourceManager.recoverOffloadCheckpoints(); + List schemas = this.metadataService.getSchemas(); + for (Schema schema : schemas) { - Storage.Scheme firstScheme = Storage.Scheme.fromPath(paths.get(0).getUri()); - assert firstScheme != null; - for (int i = 1; i < paths.size(); ++i) + List
tables = this.metadataService.getTables(schema.getName()); + for (Table table : tables) { - Storage.Scheme scheme = Storage.Scheme.fromPath(paths.get(i).getUri()); - checkArgument(firstScheme.equals(scheme), - "all the directories in the paths must have the same storage scheme"); + this.retinaResourceManager.addWriteBuffer(schema.getName(), table.getName()); } } - catch (Throwable e) + } + + private void publishStartupLifecycle(RecoveryContext context, RecoveryResult result) throws RetinaException + { + this.retinaResourceManager.setRecovering(true); + HeartbeatWorker.setCurrentStatus(NodeStatus.INIT); + this.status = RetinaStatus.newBuilder() + .setState(RetinaState.RECOVERING) + .setRecoveryEpoch(context.recoveryEpoch) + .putAllVnodeReplayStarts(result.replay.vnodeReplayStarts) + .build(); + + if (result.freshDeployment) + { + retinaResourceManager.startBackgroundGc(); + this.status = this.status.toBuilder() + .setState(RetinaState.READY) + .build(); + this.retinaResourceManager.setRecovering(false); + HeartbeatWorker.setCurrentStatus(NodeStatus.READY); + } + } + + private static final class RecoveryContext + { + final String recoveryEpoch; + final StorageGcWal storageGcWal; + final StorageGcWal.RecoveryHandler storageGcWalRecoveryHandler; + final LoadedCheckpoint loadedCheckpoint; + final Set expectedVnodes; + + RecoveryContext(String recoveryEpoch, + StorageGcWal storageGcWal, + StorageGcWal.RecoveryHandler storageGcWalRecoveryHandler, + LoadedCheckpoint loadedCheckpoint, + Set expectedVnodes) + { + this.recoveryEpoch = recoveryEpoch; + this.storageGcWal = storageGcWal; + this.storageGcWalRecoveryHandler = storageGcWalRecoveryHandler; + this.loadedCheckpoint = loadedCheckpoint; + this.expectedVnodes = expectedVnodes; + } + } + + private static final class RecoveryResult + { + final boolean freshDeployment; + final ReplayResult replay; + + RecoveryResult(boolean freshDeployment, ReplayResult replay) + { + this.freshDeployment = freshDeployment; + this.replay = 
replay; + } + } + + private static final class ReplayResult + { + final Map vnodeReplayStarts; + final long nodeReplayFromTs; + + ReplayResult(Map vnodeReplayStarts, long nodeReplayFromTs) + { + this.vnodeReplayStarts = Collections.unmodifiableMap(new HashMap<>(vnodeReplayStarts)); + this.nodeReplayFromTs = nodeReplayFromTs; + } + } + + private static ReplayResult computeReplay(long checkpointAppliedTs, + List segmentEntries, + Set expectedVnodes) + { + Map vnodeReplayStarts = new HashMap<>(); + for (PendingSegmentEntry se : segmentEntries) + { + long ts = se.getMinCommitTs() == Long.MAX_VALUE + ? checkpointAppliedTs + : Math.min(checkpointAppliedTs, se.getMinCommitTs()); + vnodeReplayStarts.merge(se.getVirtualNodeId(), ts, Math::min); + } + for (Integer vnode : expectedVnodes) + { + vnodeReplayStarts.putIfAbsent(vnode, checkpointAppliedTs); + } + long nodeReplayFromTs = checkpointAppliedTs; + if (!vnodeReplayStarts.isEmpty()) { - throw new RetinaException("Failed to parse storage scheme from paths", e); + nodeReplayFromTs = Long.MAX_VALUE; + for (Long v : vnodeReplayStarts.values()) + { + nodeReplayFromTs = Math.min(nodeReplayFromTs, v); + } } + return new ReplayResult(vnodeReplayStarts, nodeReplayFromTs); } private static String getRetinaMetrics(OperatingSystemMXBean osBean) @@ -275,10 +524,18 @@ public void updateRecord(RetinaProto.UpdateRecordRequest request, .setHeader(headerBuilder.build()) .build()); } - catch (RetinaException | IndexException e) + catch (RetinaException e) + { + logger.error("updateRecord failed for schema={} (retina)", request.getSchemaName(), e); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Retina: " + e.getMessage()); + responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() + .setHeader(headerBuilder.build()) + .build()); + } + catch (IndexException e) { - logger.error("updateRecord failed for schema={}", request.getSchemaName(), e); - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); 
+ logger.error("updateRecord failed for schema={} (index)", request.getSchemaName(), e); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Index: " + e.getMessage()); responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -310,7 +567,7 @@ public void onNext(RetinaProto.UpdateRecordRequest request) } catch (RetinaException e) { - headerBuilder.setErrorCode(1).setErrorMsg("Retina: " + e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Retina: " + e.getMessage()); responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -318,7 +575,7 @@ public void onNext(RetinaProto.UpdateRecordRequest request) } catch (IndexException e) { - headerBuilder.setErrorCode(2).setErrorMsg("Index: " + e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Index: " + e.getMessage()); responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -326,7 +583,7 @@ public void onNext(RetinaProto.UpdateRecordRequest request) } catch (Exception e) { - headerBuilder.setErrorCode(3).setErrorMsg("Internal error: " + e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Internal error: " + e.getMessage()); responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -334,7 +591,7 @@ public void onNext(RetinaProto.UpdateRecordRequest request) } catch (Throwable t) { - headerBuilder.setErrorCode(4).setErrorMsg("Fatal error: " + t.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_UPDATE_FAILED).setErrorMsg("Fatal error: " + t.getMessage()); responseObserver.onNext(RetinaProto.UpdateRecordResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -386,7 +643,7 @@ private List> transposeIndexKeys(List dataList, 
private void executeParallelByBucket( List dataList, java.util.function.Function keyExtractor, - BucketProcessor processor) throws RetinaException + BucketProcessor processor) throws RetinaException, IndexException { if (dataList == null || dataList.isEmpty()) { @@ -398,27 +655,47 @@ private void executeParallelByBucket( .collect(Collectors.groupingBy(d -> IndexUtils.getBucketIdFromByteBuffer(keyExtractor.apply(d).getKey()))); - // 2. Parallel Execution: Process each bucket in parallel + // 2. Parallel Execution: Process each bucket in parallel // This utilizes the common ForkJoinPool to execute RPCs and logic simultaneously - bucketMap.entrySet().parallelStream().forEach(entry -> + try { - int bucketId = entry.getKey(); - List subList = entry.getValue(); + bucketMap.entrySet().parallelStream().forEach(entry -> + { + int bucketId = entry.getKey(); + List subList = entry.getValue(); // Fetch the pre-initialized IndexOption from the pool (Zero allocation) - IndexOption option = this.indexOptionPool[bucketId]; + IndexOption option = this.indexOptionPool[bucketId]; - try - { + try + { // Execute the specific Delete/Insert/Update logic - processor.process(bucketId, subList, option); + processor.process(bucketId, subList, option); + } + catch (Exception e) + { + // Wrap checked exceptions to propagate through the parallel stream + throw new RuntimeException("Failure during parallel index processing for Bucket: " + bucketId, e); + } + }); + } + catch (RuntimeException e) + { + Throwable cause = e; + while (cause instanceof RuntimeException && cause.getCause() != null) + { + cause = cause.getCause(); } - catch (Exception e) + if (cause instanceof RetinaException) { - // Wrap checked exceptions to propagate through the parallel stream - throw new RuntimeException("Failure during parallel index processing for Bucket: " + bucketId, e); + throw (RetinaException) cause; } - }); + if (cause instanceof IndexException) + { + throw (IndexException) cause; + } + throw e; + } } /** 
@@ -456,6 +733,200 @@ private void processSecondaryIndexes( } } + /** + * Delete phase for one bucket. Hide existing rows before removing primary entries; + * secondary cleanup is best effort. + */ + private void executeStagedDeletePhase( + List subList, + java.util.function.Function> keyListExtractor, + long primaryIndexId, long timestamp, IndexOption option) throws IndexException, RetinaException + { + List> keysList = transposeIndexKeys(subList, keyListExtractor::apply); + List primaryKeys = keysList.get(0); + long tableId = primaryKeys.get(0).getTableId(); + + List> resolved = + indexService.resolvePrimary(tableId, primaryIndexId, primaryKeys, option); + List foundKeys = new ArrayList<>(primaryKeys.size()); + for (int i = 0; i < primaryKeys.size(); i++) + { + Optional r = resolved.get(i); + if (r.isPresent()) + { + this.retinaResourceManager.deleteRecord(r.get().getRowLocation(), timestamp); + foundKeys.add(primaryKeys.get(i)); + } + // Missing primary keys are no-op deletes. + } + if (!foundKeys.isEmpty()) + { + indexService.deletePrimaryIndexEntriesOnly(tableId, primaryIndexId, foundKeys, option); + } + + for (int i = 1; i < keysList.size(); ++i) + { + try + { + indexService.deleteSecondaryIndexEntries(tableId, + keysList.get(i).get(0).getIndexId(), keysList.get(i), option); + } + catch (IndexException e) + { + logger.warn("Best-effort staged secondary delete failed for tableId={}, indexId={}", + tableId, keysList.get(i).get(0).getIndexId(), e); + } + } + } + + /** + * Insert phase for one bucket. Write main index entries before primary entries + * so new primary mappings point to resolvable row locations. 
+ */ + private void executeStagedInsertPhase( + String schemaName, String tableName, int virtualNodeId, + List subList, + java.util.function.Function> keyListExtractor, + java.util.function.Function> colValuesExtractor, + long primaryIndexId, long timestamp, IndexOption option) throws Exception + { + List primaryEntries = new ArrayList<>(subList.size()); + List rowIds = new ArrayList<>(subList.size()); + List insertedLocations = new ArrayList<>(subList.size()); + + try + { + for (T data : subList) + { + byte[][] values = colValuesExtractor.apply(data).stream() + .map(ByteString::toByteArray).toArray(byte[][]::new); + IndexProto.PrimaryIndexEntry.Builder builder = retinaResourceManager.insertRecord( + schemaName, tableName, values, timestamp, virtualNodeId); + builder.setIndexKey(keyListExtractor.apply(data).get(0)); + IndexProto.PrimaryIndexEntry entry = builder.build(); + primaryEntries.add(entry); + rowIds.add(entry.getRowId()); + insertedLocations.add(entry.getRowLocation()); + } + + long tableId = primaryEntries.get(0).getIndexKey().getTableId(); + indexService.putMainIndexEntriesOnly(tableId, primaryEntries); + indexService.putPrimaryIndexEntriesOnly(tableId, primaryIndexId, primaryEntries, option); + + processSecondaryIndexes(subList, keyListExtractor::apply, rowIds, option, false); + } + catch (Exception e) + { + for (IndexProto.RowLocation loc : insertedLocations) + { + try + { + this.retinaResourceManager.deleteRecord(loc, timestamp); + } + catch (Exception rollbackEx) + { + logger.error("Failed to roll back visibility for inserted row at fileId={}, rgId={}, rgRowOffset={}", + loc.getFileId(), loc.getRgId(), loc.getRgRowOffset(), rollbackEx); + } + } + throw e; + } + } + + /** + * Update phase for one bucket. Resolve current rows, append replacements, + * write main index entries, switch primary entries, then hide old rows. 
+ */ + private void executeStagedUpdatePhase( + String schemaName, String tableName, int virtualNodeId, + int bucketId, + List subList, + java.util.function.Function> keyListExtractor, + java.util.function.Function> colValuesExtractor, + long primaryIndexId, long timestamp, IndexOption option) throws Exception + { + List primaryEntries = new ArrayList<>(subList.size()); + List rowIds = new ArrayList<>(subList.size()); + List insertedLocations = new ArrayList<>(subList.size()); + String lockKey = "v_" + virtualNodeId + "_b_" + bucketId + "_i_" + primaryIndexId; + Lock lock = updateLocks.get(lockKey); + + try + { + lock.lock(); + try + { + List> keysList = transposeIndexKeys(subList, keyListExtractor::apply); + List primaryKeys = keysList.get(0); + long tableId = primaryKeys.get(0).getTableId(); + + List> resolved = + indexService.resolvePrimary(tableId, primaryIndexId, primaryKeys, option); + if (resolved.size() != primaryKeys.size()) + { + throw new IndexException("Resolved primary count mismatch for tableId=" + + tableId + ", indexId=" + primaryIndexId); + } + + List previousLocations = new ArrayList<>(primaryKeys.size()); + for (int i = 0; i < primaryKeys.size(); i++) + { + Optional r = resolved.get(i); + if (!r.isPresent()) + { + throw new IndexException("Primary index entry not found for update, tableId=" + + tableId + ", indexId=" + primaryIndexId); + } + previousLocations.add(r.get().getRowLocation()); + } + + for (T data : subList) + { + byte[][] values = colValuesExtractor.apply(data).stream() + .map(ByteString::toByteArray).toArray(byte[][]::new); + IndexProto.PrimaryIndexEntry.Builder builder = retinaResourceManager.insertRecord( + schemaName, tableName, values, timestamp, virtualNodeId); + builder.setIndexKey(keyListExtractor.apply(data).get(0)); + IndexProto.PrimaryIndexEntry entry = builder.build(); + primaryEntries.add(entry); + rowIds.add(entry.getRowId()); + insertedLocations.add(entry.getRowLocation()); + } + + // TODO: replace this JVM-local lock 
with an index API that updates only when the + // resolved old rowIds still match, so concurrent writers can avoid bucket serialization. + indexService.putMainIndexEntriesOnly(tableId, primaryEntries); + indexService.updatePrimaryIndexEntriesOnly(tableId, primaryIndexId, primaryEntries, option); + for (IndexProto.RowLocation loc : previousLocations) + { + this.retinaResourceManager.deleteRecord(loc, timestamp); + } + } + finally + { + lock.unlock(); + } + + processSecondaryIndexes(subList, keyListExtractor::apply, rowIds, option, true); + } + catch (Exception e) + { + for (IndexProto.RowLocation loc : insertedLocations) + { + try + { + this.retinaResourceManager.deleteRecord(loc, timestamp); + } + catch (Exception rollbackEx) + { + logger.error("Failed to roll back visibility for inserted row at fileId={}, rgId={}, rgRowOffset={}", + loc.getFileId(), loc.getRgId(), loc.getRgRowOffset(), rollbackEx); + } + } + throw e; + } + } + /** * Common method to process updates for both normal and streaming rpc. * @@ -484,31 +955,11 @@ private void processUpdateRequest(RetinaProto.UpdateRecordRequest request) throw List deleteDataList = tableUpdateData.getDeleteDataList(); if (!deleteDataList.isEmpty()) { - // 1a. Validate the delete data validateIndexData(deleteDataList, d -> d.getIndexKeysList(), primaryIndexId, "Delete"); executeParallelByBucket(deleteDataList, d -> d.getIndexKeys(0), (bucketId, subList, option) -> - { - // 1b. Transpose the index keys - List> keysList = transposeIndexKeys(subList, RetinaProto.DeleteData::getIndexKeysList); - List primaryKeys = keysList.get(0); - long tableId = primaryKeys.get(0).getTableId(); - - // 1c. Delete primary index entries - List rowLocations = indexService.deletePrimaryIndexEntries(tableId, primaryIndexId, primaryKeys, option); - - // 1d. Delete records - for (IndexProto.RowLocation loc : rowLocations) - { - this.retinaResourceManager.deleteRecord(loc, timestamp); - } - - // 1e. 
Delete secondary index entries - for (int i = 1; i < keysList.size(); ++i) - { - indexService.deleteSecondaryIndexEntries(tableId, keysList.get(i).get(0).getIndexId(), keysList.get(i), option); - } - }); + executeStagedDeletePhase(subList, RetinaProto.DeleteData::getIndexKeysList, + primaryIndexId, timestamp, option)); } // ================================================================= @@ -517,81 +968,30 @@ private void processUpdateRequest(RetinaProto.UpdateRecordRequest request) throw List insertDataList = tableUpdateData.getInsertDataList(); if (!insertDataList.isEmpty()) { - // 2a. Validate the insert data validateIndexData(insertDataList, d -> d.getIndexKeysList(), primaryIndexId, "Insert"); executeParallelByBucket(insertDataList, d -> d.getIndexKeys(0), (bucketId, subList, option) -> - { - List primaryEntries = new ArrayList<>(subList.size()); - List rowIds = new ArrayList<>(subList.size()); - - // 2c. Insert records - for (RetinaProto.InsertData data : subList) - { - byte[][] values = data.getColValuesList().stream().map(ByteString::toByteArray).toArray(byte[][]::new); - IndexProto.PrimaryIndexEntry.Builder builder = retinaResourceManager.insertRecord(schemaName, tableName, values, timestamp, virtualNodeId); - builder.setIndexKey(data.getIndexKeys(0)); - IndexProto.PrimaryIndexEntry entry = builder.build(); - primaryEntries.add(entry); - rowIds.add(entry.getRowId()); - } - - // 2d. Put primary index entries - long tableId = primaryEntries.get(0).getIndexKey().getTableId(); - indexService.putPrimaryIndexEntries(tableId, primaryIndexId, primaryEntries, option); - - // 2e. 
Put secondary index entries - processSecondaryIndexes(subList, RetinaProto.InsertData::getIndexKeysList, rowIds, option, false); - }); + executeStagedInsertPhase(schemaName, tableName, virtualNodeId, subList, + RetinaProto.InsertData::getIndexKeysList, + RetinaProto.InsertData::getColValuesList, + primaryIndexId, timestamp, option)); } // ================================================================= // 3. Process Update Data + // + // UpdateData keeps primary-index update semantics; new row locations + // are written before primary entries are switched. // ================================================================= List updateDataList = tableUpdateData.getUpdateDataList(); if (!updateDataList.isEmpty()) { - // 3a. Validate the update data validateIndexData(updateDataList, d -> d.getIndexKeysList(), primaryIndexId, "Update"); executeParallelByBucket(updateDataList, d -> d.getIndexKeys(0), (bucketId, subList, option) -> - { - List primaryEntries = new ArrayList<>(subList.size()); - List rowIds = new ArrayList<>(subList.size()); - - // 3c. Insert new records - for (RetinaProto.UpdateData data : subList) - { - byte[][] values = data.getColValuesList().stream().map(ByteString::toByteArray).toArray(byte[][]::new); - IndexProto.PrimaryIndexEntry.Builder builder = retinaResourceManager.insertRecord(schemaName, tableName, values, timestamp, virtualNodeId); - builder.setIndexKey(data.getIndexKeys(0)); - IndexProto.PrimaryIndexEntry entry = builder.build(); - primaryEntries.add(entry); - rowIds.add(entry.getRowId()); - } - - // 3d. Update primary index entries with fine-grained locking - long tableId = primaryEntries.get(0).getIndexKey().getTableId(); - String lockKey = "v_" + virtualNodeId + "_b_" + bucketId + "_i_" + primaryIndexId; - Lock lock = updateLocks.get(lockKey); - - lock.lock(); - try - { - List prevLocs = indexService.updatePrimaryIndexEntries(tableId, primaryIndexId, primaryEntries, option); - // 3e. 
Delete previous records - for (IndexProto.RowLocation loc : prevLocs) - { - this.retinaResourceManager.deleteRecord(loc, timestamp); - } - } - finally - { - lock.unlock(); - } - - // 3f. Update secondary index entries - processSecondaryIndexes(subList, RetinaProto.UpdateData::getIndexKeysList, rowIds, option, true); - }); + executeStagedUpdatePhase(schemaName, tableName, virtualNodeId, bucketId, subList, + RetinaProto.UpdateData::getIndexKeysList, + RetinaProto.UpdateData::getColValuesList, + primaryIndexId, timestamp, option)); } } } @@ -614,6 +1014,69 @@ private void validateIndexData(List dataList, java.util.function.Function throw new RetinaException("Primary index id mismatch or inconsistent index key list size in " + opType); } + @Override + public void getRetinaStatus(RetinaProto.GetRetinaStatusRequest request, + StreamObserver responseObserver) + { + RetinaProto.ResponseHeader.Builder headerBuilder = RetinaProto.ResponseHeader.newBuilder() + .setToken(request.getHeader().getToken()); + RetinaProto.GetRetinaStatusResponse.Builder response = + RetinaProto.GetRetinaStatusResponse.newBuilder().setHeader(headerBuilder); + RetinaStatus current = this.status; + response.setState(current.getState()) + .setRecoveryEpoch(current.getRecoveryEpoch()); + // vnodeReplayStarts is only meaningful while RECOVERING; CDC drives + // replay from those timestamps. Once published, the plan is ready by + // construction, so no separate readiness flag is needed. 
+ if (current.getState() == RetinaState.RECOVERING) + { + current.getVnodeReplayStartsMap().entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach(entry -> response.addVnodeReplayStarts(RetinaProto.VnodeReplayStart.newBuilder() + .setVirtualNodeId(entry.getKey()) + .setStartTs(entry.getValue()) + .build())); + } + responseObserver.onNext(response.build()); + responseObserver.onCompleted(); + } + + @Override + public void markReady(RetinaProto.MarkReadyRequest request, + StreamObserver responseObserver) + { + RetinaProto.ResponseHeader.Builder headerBuilder = RetinaProto.ResponseHeader.newBuilder() + .setToken(request.getHeader().getToken()); + try + { + RetinaStatus current = this.status; + if (!current.getRecoveryEpoch().equals(request.getRecoveryEpoch())) + { + throw new RetinaException("MarkReady recoveryEpoch mismatch"); + } + if (current.getState() != RetinaState.RECOVERING && current.getState() != RetinaState.READY) + { + throw new RetinaException("Retina is " + current.getState() + "; cannot mark READY"); + } + + // Start GC before publishing READY so a failed scheduler start remains retryable. 
+ retinaResourceManager.startBackgroundGc(); + this.status = current.toBuilder() + .setState(RetinaState.READY) + .build(); + this.retinaResourceManager.setRecovering(false); + HeartbeatWorker.setCurrentStatus(NodeStatus.READY); + } + catch (RetinaException e) + { + headerBuilder.setErrorCode(ErrorCode.RETINA_MARK_READY_FAILED).setErrorMsg(e.getMessage()); + } + responseObserver.onNext(RetinaProto.MarkReadyResponse.newBuilder() + .setHeader(headerBuilder.build()) + .build()); + responseObserver.onCompleted(); + } + @Override public void addVisibility(RetinaProto.AddVisibilityRequest request, StreamObserver responseObserver) @@ -633,7 +1096,7 @@ public void addVisibility(RetinaProto.AddVisibilityRequest request, } catch (RetinaException e) { - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_VISIBILITY_FAILED).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.AddVisibilityResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -659,7 +1122,7 @@ public void queryVisibility(RetinaProto.QueryVisibilityRequest request, .newBuilder() .setHeader(headerBuilder.build()); - String checkpointPath = this.retinaResourceManager.getCheckpointPath(timestamp); + String checkpointPath = this.retinaResourceManager.getOffloadCheckpointPath(timestamp); if (checkpointPath != null) { responseBuilder.setCheckpointPath(checkpointPath); @@ -680,7 +1143,7 @@ public void queryVisibility(RetinaProto.QueryVisibilityRequest request, } catch (RetinaException e) { - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_VISIBILITY_FAILED).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.QueryVisibilityResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -712,7 +1175,7 @@ public void reclaimVisibility(RetinaProto.ReclaimVisibilityRequest request, } catch (RetinaException e) { - 
headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_VISIBILITY_FAILED).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.ReclaimVisibilityResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -737,7 +1200,7 @@ public void addWriteBuffer(RetinaProto.AddWriteBufferRequest request, } catch (RetinaException e) { - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_NOT_READY).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.AddWriteBufferResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -763,7 +1226,7 @@ public void getWriteBuffer(RetinaProto.GetWriteBufferRequest request, } catch (RetinaException e) { - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_NOT_READY).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.GetWriteBufferResponse.newBuilder() .setHeader(headerBuilder.build()) .build()); @@ -788,7 +1251,7 @@ public void registerOffload(RetinaProto.RegisterOffloadRequest request, { logger.error("registerOffload failed for timestamp={}", request.getTimestamp(), e); - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_NOT_READY).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.RegisterOffloadResponse.newBuilder() .setHeader(headerBuilder.build()).build()); } @@ -815,7 +1278,7 @@ public void unregisterOffload(RetinaProto.UnregisterOffloadRequest request, { logger.error("unregisterOffload failed for timestamp={}", request.getTimestamp(), e); - headerBuilder.setErrorCode(1).setErrorMsg(e.getMessage()); + headerBuilder.setErrorCode(ErrorCode.RETINA_NOT_READY).setErrorMsg(e.getMessage()); responseObserver.onNext(RetinaProto.UnregisterOffloadResponse.newBuilder() .setHeader(headerBuilder.build()).build()); } @@ -942,4 +1405,58 @@ public int size() } } } + + /** + * 
In-memory lifecycle status of this Retina server. This struct is intentionally + * kept inside RetinaServerImpl because it is never transmitted: outbound RPCs + * project the relevant subset onto GetRetinaStatusResponse. + */ + private static final class RetinaStatus + { + private final RetinaState state; + private final String recoveryEpoch; + private final Map vnodeReplayStarts; + + private RetinaStatus(Builder b) + { + this.state = b.state; + this.recoveryEpoch = b.recoveryEpoch; + this.vnodeReplayStarts = Collections.unmodifiableMap(new LinkedHashMap<>(b.vnodeReplayStarts)); + } + + static Builder newBuilder() + { + return new Builder(); + } + + Builder toBuilder() + { + Builder b = new Builder(); + b.state = this.state; + b.recoveryEpoch = this.recoveryEpoch; + b.vnodeReplayStarts = new LinkedHashMap<>(this.vnodeReplayStarts); + return b; + } + + String getRecoveryEpoch() { return recoveryEpoch; } + Map getVnodeReplayStartsMap() { return vnodeReplayStarts; } + RetinaState getState() { return state; } + + static final class Builder + { + private RetinaState state = RetinaState.UNKNOWN; + private String recoveryEpoch = ""; + private Map vnodeReplayStarts = new LinkedHashMap<>(); + + Builder setState(RetinaState v) { this.state = v; return this; } + Builder setRecoveryEpoch(String v) { this.recoveryEpoch = v; return this; } + Builder putAllVnodeReplayStarts(Map m) + { + this.vnodeReplayStarts.putAll(m); + return this; + } + + RetinaStatus build() { return new RetinaStatus(this); } + } + } } diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServer.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServer.java index 2e3be1a464..05dd64192a 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServer.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServer.java @@ -19,13 +19,28 @@ */ package io.pixelsdb.pixels.daemon.transaction; -import 
io.grpc.ServerBuilder; -import io.pixelsdb.pixels.common.server.Server; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; + import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.io.IOException; -import java.util.concurrent.TimeUnit; +import io.etcd.jetcd.KeyValue; +import io.grpc.ServerBuilder; +import io.pixelsdb.pixels.common.server.Server; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.common.utils.Constants; +import io.pixelsdb.pixels.common.utils.EtcdUtil; +import io.pixelsdb.pixels.daemon.heartbeat.NodeStatus; /** * @author hank @@ -35,6 +50,13 @@ public class TransServer implements Server { private static final Logger log = LogManager.getLogger(TransServer.class); + /** + * Default time to wait for all expected Retina nodes to reach READY before giving up + * and aborting the trans server boot. Overridable by {@code trans.server.retina.readiness.timeout.ms}. + */ + private static final long DEFAULT_RETINA_READINESS_TIMEOUT_MS = 10 * 60 * 1000L; + private static final long RETINA_READINESS_POLL_INTERVAL_MS = 1_000L; + private boolean running = false; private final io.grpc.Server rpcServer; @@ -69,6 +91,7 @@ public void run() { try { + awaitRetinaReady(); this.rpcServer.start(); this.running = true; this.rpcServer.awaitTermination(); @@ -83,4 +106,126 @@ public void run() this.shutdown(); } } + + /** + * Boot-time gate. When {@code retina.enable=true}, blocks until every node listed in + * {@code $PIXELS_HOME/etc/retina} reports {@code NodeStatus.READY} via heartbeat. When + * {@code retina.enable=false}, returns immediately. 
On timeout, throws so that + * {@link #run()} aborts and the supervisor can restart the process. + * + *

This is intentionally a one-shot check executed before the gRPC server starts. + * Once the trans server is serving, it does not re-check Retina lifecycle state. + */ + private void awaitRetinaReady() + { + ConfigFactory config = ConfigFactory.Instance(); + if (!Boolean.parseBoolean(config.getProperty("retina.enable"))) + { + return; + } + + // Load expected Retina nodes from $PIXELS_HOME/etc/retina. + Path retinaFile = Paths.get(config.getProperty("pixels.home"), "etc", "retina"); + if (!Files.isRegularFile(retinaFile)) + { + throw new IllegalStateException(retinaFile + " is missing"); + } + Set expected = new LinkedHashSet<>(); + try + { + for (String raw : Files.readAllLines(retinaFile, StandardCharsets.UTF_8)) + { + String line = raw.trim(); + if (line.isEmpty() || line.startsWith("#")) + { + continue; + } + String host = line.split("\\s+", 2)[0]; + expected.add(host); + } + } catch (IOException e) + { + throw new IllegalStateException("Failed to load expected Retina nodes from " + + "$PIXELS_HOME/etc/retina", e); + } + if (expected.isEmpty()) + { + throw new IllegalStateException( + "retina.enable=true but $PIXELS_HOME/etc/retina has no nodes"); + } + + long deadline = System.currentTimeMillis() + DEFAULT_RETINA_READINESS_TIMEOUT_MS; + EtcdUtil etcd = EtcdUtil.Instance(); + String prefix = Constants.HEARTBEAT_RETINA_LITERAL; + int prefixLen = prefix.length(); + log.info("Waiting for {} Retina node(s) to report READY (timeout {} ms)", + expected.size(), DEFAULT_RETINA_READINESS_TIMEOUT_MS); + while (true) + { + String reason = null; + // Poll all Retina heartbeat keys once and check whether every expected node is READY. 
+ Map observed; + try + { + List all = etcd.getKeyValuesByPrefix(prefix); + observed = new HashMap<>(all.size() * 2); + for (KeyValue kv : all) + { + String key = kv.getKey().toString(StandardCharsets.UTF_8); + if (key.length() > prefixLen) + { + observed.put(key.substring(prefixLen), kv); + } + } + } catch (RuntimeException e) + { + observed = null; + reason = "etcd heartbeat read failed: " + e.getMessage(); + } + if (reason == null) + { + for (String host : expected) + { + KeyValue kv = observed.get(host); + if (kv == null) + { + reason = "Retina node " + host + " has no heartbeat status"; + break; + } + if (kv.getLease() <= 0) + { + reason = "Retina node " + host + " has heartbeat status without lease"; + break; + } + String status = kv.getValue().toString(StandardCharsets.UTF_8).trim(); + if (!String.valueOf(NodeStatus.READY.StatusCode).equals(status)) + { + reason = "Retina node " + host + " heartbeat status is " + status; + break; + } + } + } + if (reason == null) + { + log.info("All Retina nodes are READY, starting trans server"); + return; + } + if (System.currentTimeMillis() >= deadline) + { + throw new IllegalStateException( + "Timed out waiting for Retina readiness after " + + DEFAULT_RETINA_READINESS_TIMEOUT_MS + + " ms; last reason: " + reason); + } + try + { + Thread.sleep(RETINA_READINESS_POLL_INTERVAL_MS); + } catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + throw new IllegalStateException( + "Interrupted while waiting for Retina readiness", e); + } + } + } } diff --git a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServiceImpl.java b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServiceImpl.java index 94a7d7b958..06d49f464a 100644 --- a/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServiceImpl.java +++ b/pixels-daemon/src/main/java/io/pixelsdb/pixels/daemon/transaction/TransServiceImpl.java @@ -629,14 +629,18 @@ public void 
dumpTrans(TransProto.DumpTransRequest request, } @Override - public void getSafeGcTimestamp(com.google.protobuf.Empty request, - StreamObserver responseObserver) + public void getSafeVisibilityFoldingTimestamp(TransProto.GetSafeVisibilityFoldingTimestampRequest request, + StreamObserver responseObserver) { - long safeTs = Math.max(0, lowWatermark.get() - 1); - TransProto.GetSafeGcTimestampResponse response = TransProto.GetSafeGcTimestampResponse.newBuilder() - .setErrorCode(ErrorCode.SUCCESS) - .setTimestamp(safeTs) - .build(); + long writerSafeTs = Math.max(0, highWatermark.get() - 1); + long safeTs = request.getIncludeRunningQueries() + ? Math.min(lowWatermark.get(), writerSafeTs) + : writerSafeTs; + TransProto.GetSafeVisibilityFoldingTimestampResponse response = + TransProto.GetSafeVisibilityFoldingTimestampResponse.newBuilder() + .setErrorCode(ErrorCode.SUCCESS) + .setTimestamp(safeTs) + .build(); responseObserver.onNext(response); responseObserver.onCompleted(); } diff --git a/pixels-daemon/src/main/resources/pixels_metadata.mwb b/pixels-daemon/src/main/resources/pixels_metadata.mwb index 3a9176fa93..0874f98600 100644 Binary files a/pixels-daemon/src/main/resources/pixels_metadata.mwb and b/pixels-daemon/src/main/resources/pixels_metadata.mwb differ diff --git a/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/metadata/dao/TestRdbFileDao.java b/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/metadata/dao/TestRdbFileDao.java new file mode 100644 index 0000000000..02197516c3 --- /dev/null +++ b/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/metadata/dao/TestRdbFileDao.java @@ -0,0 +1,607 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.daemon.metadata.dao; + +import io.pixelsdb.pixels.common.utils.MetaDBUtil; +import io.pixelsdb.pixels.daemon.MetadataProto; +import io.pixelsdb.pixels.daemon.metadata.dao.impl.RdbFileDao; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.mockito.ArgumentCaptor; + +import java.lang.reflect.Field; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.sql.Types; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link RdbFileDao} cleanup-at handling and typed file enumeration. 
+ */ +public class TestRdbFileDao +{ + private static final MetadataProto.File.Type REGULAR = MetadataProto.File.Type.REGULAR; + private static final MetadataProto.File.Type RETIRED = MetadataProto.File.Type.RETIRED; + private static final MetadataProto.File.Type TEMPORARY_INGEST = + MetadataProto.File.Type.TEMPORARY_INGEST; + private static final MetadataProto.File.Type TEMPORARY_GC = + MetadataProto.File.Type.TEMPORARY_GC; + + private static final int REGULAR_VALUE = REGULAR.getNumber(); + private static final int RETIRED_VALUE = RETIRED.getNumber(); + private static final int TEMPORARY_INGEST_VALUE = TEMPORARY_INGEST.getNumber(); + private static final int TEMPORARY_GC_VALUE = TEMPORARY_GC.getNumber(); + + private Connection mockConn; + private Connection originalConn; + private RdbFileDao dao; + + @Before + public void setUp() throws Exception + { + mockConn = mock(Connection.class); + // Keep lazy reconnect on the mock connection. + when(mockConn.isValid(anyInt())).thenReturn(true); + originalConn = swapConnection(mockConn); + dao = new RdbFileDao(); + } + + @After + public void tearDown() throws Exception + { + swapConnection(originalConn); + } + + // ========================================================================= + // INSERT / UPDATE cleanup-at binding + // ========================================================================= + + /** + * Non-RETIRED rows bind {@code FILE_CLEANUP_AT} as {@code NULL}. 
+ */ + @Test + public void insert_nonRetired_withoutCleanupAt_bindsNull() throws Exception + { + PreparedStatement pstRegular = stubPreparedStatementForInsert(); + dao.insert(baseFile("a.pxl", REGULAR_VALUE).build()); + verify(pstRegular).setNull(7, Types.BIGINT); + verify(pstRegular, never()).setLong(eq(7), anyLong()); + + PreparedStatement pstIngest = stubPreparedStatementForInsert(); + dao.insert(baseFile("ingest_unset.pxl", TEMPORARY_INGEST_VALUE).build()); + verify(pstIngest).setNull(7, Types.BIGINT); + verify(pstIngest, never()).setLong(eq(7), anyLong()); + + PreparedStatement pstGc = stubPreparedStatementForInsert(); + dao.insert(baseFile("gc_unset.pxl", TEMPORARY_GC_VALUE).build()); + verify(pstGc).setNull(7, Types.BIGINT); + verify(pstGc, never()).setLong(eq(7), anyLong()); + } + + /** + * Non-RETIRED rows with {@code cleanupAt} are rejected before writing. + */ + @Test + public void insert_nonRetired_withCleanupAt_failsFast() throws Exception + { + PreparedStatement pst = stubPreparedStatementForInsert(); + long unwanted = 123_456_789L; + long id = dao.insert(baseFile("a.pxl", REGULAR_VALUE).setCleanupAt(unwanted).build()); + assertEquals("DAO must surface the invariant violation as the -1 failure sentinel", -1L, id); + verify(pst, never()).setLong(eq(7), anyLong()); + verify(pst, never()).setNull(eq(7), anyInt()); + verify(pst, never()).executeUpdate(); + + PreparedStatement pst2 = stubPreparedStatementForInsert(); + long id2 = dao.insert(baseFile("t.pxl", TEMPORARY_GC_VALUE).setCleanupAt(24L).build()); + assertEquals(-1L, id2); + verify(pst2, never()).executeUpdate(); + } + + /** + * RETIRED rows bind the provided cleanup deadline. 
+ */ + @Test + public void insert_retiredFile_bindingScenarios() throws Exception + { + PreparedStatement pst1 = stubPreparedStatementForInsert(); + long deadline = 1_700_000_000_000L; + dao.insert(baseFile("retired.pxl", RETIRED_VALUE).setCleanupAt(deadline).build()); + verify(pst1).setLong(7, deadline); + verify(pst1, never()).setNull(eq(7), anyInt()); + + PreparedStatement pst2 = stubPreparedStatementForInsert(); + dao.insert(baseFile("retired_zero.pxl", RETIRED_VALUE).setCleanupAt(0L).build()); + verify(pst2).setLong(7, 0L); + verify(pst2, never()).setNull(eq(7), anyInt()); + } + + /** + * RETIRED rows without {@code cleanupAt} are rejected. + */ + @Test + public void insert_retired_withoutCleanupAt_failsFast() throws Exception + { + PreparedStatement pst = stubPreparedStatementForInsert(); + long id = dao.insert(baseFile("nd.pxl", RETIRED_VALUE).build()); + assertEquals(-1L, id); + verify(pst, never()).executeUpdate(); + } + + @Test + public void insertBatch_mixedTypes_bindsCleanupAtPerRow() throws Exception + { + PreparedStatement pst = stubPreparedStatementForInsert(); + + MetadataProto.File regular = baseFile("r.pxl", REGULAR_VALUE).build(); + MetadataProto.File temporaryNoDeadline = baseFile("t.pxl", TEMPORARY_GC_VALUE).build(); + MetadataProto.File ingestNoDeadline = baseFile("i.pxl", TEMPORARY_INGEST_VALUE).build(); + MetadataProto.File retiredWithDeadline = baseFile("d.pxl", RETIRED_VALUE) + .setCleanupAt(42L).build(); + + assertTrue(dao.insertBatch( + Arrays.asList(regular, temporaryNoDeadline, ingestNoDeadline, retiredWithDeadline))); + + // Three non-RETIRED rows bind NULL; the single RETIRED row binds its deadline. + verify(pst, times(3)).setNull(7, Types.BIGINT); + verify(pst, times(1)).setLong(7, 42L); + verify(pst).executeBatch(); + } + + /** + * Any invalid cleanup-at row rejects the whole batch. 
+ */ + @Test + public void insertBatch_invariantViolation_rejectsWholeBatch() throws Exception + { + PreparedStatement pst = stubPreparedStatementForInsert(); + + // Mix one legal RETIRED with one illegal TEMPORARY_GC+cleanupAt. + MetadataProto.File legal = baseFile("d.pxl", RETIRED_VALUE).setCleanupAt(42L).build(); + MetadataProto.File illegal = baseFile("t.pxl", TEMPORARY_GC_VALUE).setCleanupAt(24L).build(); + + assertFalse(dao.insertBatch(Arrays.asList(legal, illegal))); + verify(pst, never()).executeBatch(); + } + + /** + * UPDATE binds cleanup-at at index 6 and the WHERE id at index 7. + */ + @Test + public void update_bindingScenarios() throws Exception + { + PreparedStatement pst1 = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst1); + when(pst1.executeUpdate()).thenReturn(1); + assertTrue(dao.update(baseFile("u.pxl", REGULAR_VALUE).setId(7L).build())); + verify(pst1).setNull(6, Types.BIGINT); + verify(pst1).setLong(7, 7L); + + PreparedStatement pst2 = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst2); + when(pst2.executeUpdate()).thenReturn(1); + long deadline = 1_700_000_000_999L; + assertTrue(dao.update(baseFile("u.pxl", RETIRED_VALUE).setId(8L) + .setCleanupAt(deadline).build())); + verify(pst2).setLong(6, deadline); + verify(pst2).setLong(7, 8L); + } + + /** + * Invalid cleanup-at combinations are rejected on UPDATE. 
+ */ + @Test + public void update_invariantViolations_failFast() throws Exception + { + PreparedStatement pst1 = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst1); + assertFalse(dao.update(baseFile("u.pxl", TEMPORARY_GC_VALUE).setId(8L) + .setCleanupAt(99L).build())); + verify(pst1, never()).executeUpdate(); + + PreparedStatement pst2 = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst2); + assertFalse(dao.update(baseFile("u.pxl", RETIRED_VALUE).setId(9L).build())); + verify(pst2, never()).executeUpdate(); + } + + // ========================================================================= + // atomicSwapFiles transactional behaviour + // ========================================================================= + + /** + * Promoting a file clears {@code FILE_CLEANUP_AT}; retiring old files writes the shared deadline. + */ + @Test + public void atomicSwapFiles_promotesNewFileAndRetiresOldFilesWithCleanupAt() throws Exception + { + PreparedStatement promotePst = mock(PreparedStatement.class); + PreparedStatement retirePst = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(promotePst).thenReturn(retirePst); + + long cleanupAt = 1_700_000_001_234L; + assertTrue(dao.atomicSwapFiles(101L, Arrays.asList(11L, 12L), cleanupAt)); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn, times(2)).prepareStatement(sqlCaptor.capture()); + String promoteSql = sqlCaptor.getAllValues().get(0); + String retireSql = sqlCaptor.getAllValues().get(1); + assertTrue("promote SQL must update FILE_TYPE", + promoteSql.contains("FILE_TYPE=?")); + assertTrue("promote SQL must clear FILE_CLEANUP_AT to NULL", + promoteSql.contains("FILE_CLEANUP_AT=NULL")); + assertTrue("retire SQL must update FILE_TYPE", + retireSql.contains("FILE_TYPE=?")); + assertTrue("retire SQL must bind FILE_CLEANUP_AT", + retireSql.contains("FILE_CLEANUP_AT=?")); 
+ assertTrue("retire SQL must address old files by FILE_ID", + retireSql.contains("WHERE FILE_ID=?")); + + verify(promotePst).setInt(1, REGULAR_VALUE); + verify(promotePst).setLong(2, 101L); + verify(promotePst).executeUpdate(); + + verify(retirePst, times(2)).setInt(1, RETIRED_VALUE); + verify(retirePst, times(2)).setLong(2, cleanupAt); + verify(retirePst).setLong(3, 11L); + verify(retirePst).setLong(3, 12L); + verify(retirePst, times(2)).addBatch(); + verify(retirePst).executeBatch(); + + verify(mockConn).setAutoCommit(false); + verify(mockConn).commit(); + verify(mockConn).setAutoCommit(true); + } + + @Test + public void atomicSwapFiles_withNoOldFiles_onlyPromotesNewFile() throws Exception + { + PreparedStatement promotePst = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(promotePst); + + assertTrue(dao.atomicSwapFiles(202L, Collections.emptyList(), 1_700_000_002_000L)); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn).prepareStatement(sqlCaptor.capture()); + String promoteSql = sqlCaptor.getValue(); + assertTrue("promote SQL must update FILE_TYPE", + promoteSql.contains("FILE_TYPE=?")); + assertTrue("promote SQL must clear FILE_CLEANUP_AT to NULL", + promoteSql.contains("FILE_CLEANUP_AT=NULL")); + + verify(promotePst).setInt(1, REGULAR_VALUE); + verify(promotePst).setLong(2, 202L); + verify(promotePst).executeUpdate(); + verify(mockConn).setAutoCommit(false); + verify(mockConn).commit(); + verify(mockConn).setAutoCommit(true); + } + + @Test + public void atomicSwapFiles_rollsBackOnSqlException() throws Exception + { + when(mockConn.prepareStatement(anyString())).thenThrow(new SQLException("boom")); + + assertFalse("atomicSwapFiles must report failure when the JDBC layer throws", + dao.atomicSwapFiles(1L, Collections.singletonList(2L), 42L)); + verify(mockConn).setAutoCommit(false); + verify(mockConn).rollback(); + verify(mockConn).setAutoCommit(true); + verify(mockConn, 
never()).commit(); + } + + // ========================================================================= + // SELECT cleanup-at round-trip + // ========================================================================= + + /** + * SQL {@code NULL} cleanup-at values surface as unset proto fields. + */ + @Test + public void getById_cleanupAtRoundTripScenarios() throws Exception + { + // Scenario 1: non-NULL deadline must surface as hasCleanupAt() == true + Statement st1 = mock(Statement.class); + ResultSet rs1 = mock(ResultSet.class); + when(mockConn.createStatement()).thenReturn(st1); + when(st1.executeQuery(anyString())).thenReturn(rs1); + when(rs1.next()).thenReturn(true).thenReturn(false); + stubFileRow(rs1, 99L, "x.pxl", RETIRED_VALUE, 5L, 1_700_000_000_000L, /*wasNull*/ false); + + MetadataProto.File proto1 = dao.getById(99L); + assertNotNull(proto1); + assertEquals(99L, proto1.getId()); + assertEquals(RETIRED, proto1.getType()); + assertTrue("non-NULL FILE_CLEANUP_AT column must surface as hasCleanupAt()", + proto1.hasCleanupAt()); + assertEquals(1_700_000_000_000L, proto1.getCleanupAt()); + + // Scenario 2: NULL column must surface as !hasCleanupAt() + Statement st2 = mock(Statement.class); + ResultSet rs2 = mock(ResultSet.class); + when(mockConn.createStatement()).thenReturn(st2); + when(st2.executeQuery(anyString())).thenReturn(rs2); + when(rs2.next()).thenReturn(true).thenReturn(false); + stubFileRow(rs2, 1L, "r.pxl", REGULAR_VALUE, 1L, 0L, /*wasNull*/ true); + + MetadataProto.File proto2 = dao.getById(1L); + assertNotNull(proto2); + assertFalse("NULL FILE_CLEANUP_AT column must surface as !hasCleanupAt()", + proto2.hasCleanupAt()); + } + + // ========================================================================= + // getFilesByType + // ========================================================================= + + /** + * Single-path queries bind path id first, then requested file types. 
+ */ + @Test + public void getFilesByType_singlePath_bindsPathIdAndRequestedTypes() throws Exception + { + PreparedStatement pst = stubEmptyQuery(); + + dao.getFilesByType(9L, Arrays.asList(TEMPORARY_INGEST, RETIRED)); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn).prepareStatement(sqlCaptor.capture()); + String sql = sqlCaptor.getValue(); + assertTrue("single-path enumeration must filter by PATHS_PATH_ID", + sql.contains("PATHS_PATH_ID = ?")); + assertTrue("enumeration must filter by FILE_TYPE IN (...)", + sql.contains("FILE_TYPE IN (")); + assertTrue("enumeration must order by FILE_ID for stable iteration", + sql.contains("ORDER BY FILE_ID")); + + verify(pst).setLong(1, 9L); + verify(pst).setInt(2, TEMPORARY_INGEST_VALUE); + verify(pst).setInt(3, RETIRED_VALUE); + } + + /** + * Cross-path queries omit the path predicate and bind types from index 1. + */ + @Test + public void getFilesByType_crossPath_omitsPathPredicateAndBindsTypesAtIndexOne() + throws Exception + { + PreparedStatement pst = stubEmptyQuery(); + + dao.getFilesByType(/*pathId*/ null, Arrays.asList(TEMPORARY_INGEST, TEMPORARY_GC)); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn).prepareStatement(sqlCaptor.capture()); + String sql = sqlCaptor.getValue(); + assertFalse("cross-path enumeration must NOT include the PATHS_PATH_ID predicate", + sql.contains("PATHS_PATH_ID")); + assertTrue("cross-path enumeration must still filter by FILE_TYPE IN (...)", + sql.contains("FILE_TYPE IN (")); + assertTrue("cross-path enumeration must order by FILE_ID", + sql.contains("ORDER BY FILE_ID")); + + // No path bind — type numbers start at index 1. + verify(pst, never()).setLong(eq(1), anyLong()); + verify(pst).setInt(1, TEMPORARY_INGEST_VALUE); + verify(pst).setInt(2, TEMPORARY_GC_VALUE); + } + + /** + * Repeated file types share one SQL placeholder. 
+ */ + @Test + public void getFilesByType_dedupesRepeatedTypes() throws Exception + { + PreparedStatement pst = stubEmptyQuery(); + + dao.getFilesByType(2L, Arrays.asList(REGULAR, REGULAR, REGULAR)); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn).prepareStatement(sqlCaptor.capture()); + String sql = sqlCaptor.getValue(); + int inStart = sql.indexOf("FILE_TYPE IN ("); + int inEnd = sql.indexOf(")", inStart); + String inClause = sql.substring(inStart, inEnd); + assertEquals("duplicate types must be deduped to a single placeholder", + 1, countOccurrences(inClause, '?')); + + verify(pst).setLong(1, 2L); + verify(pst).setInt(2, REGULAR_VALUE); + verify(pst, never()).setInt(eq(3), anyInt()); + } + + /** + * Empty or null type lists return an empty result without querying JDBC. + */ + @Test + public void getFilesByType_emptyTypes_returnsEmptyWithoutQuerying() throws Exception + { + // Single-path empty / null + List emptyResult = dao.getFilesByType(5L, Collections.emptyList()); + assertNotNull(emptyResult); + assertTrue(emptyResult.isEmpty()); + + List nullResult = dao.getFilesByType(5L, null); + assertNotNull(nullResult); + assertTrue(nullResult.isEmpty()); + + // Cross-path empty / null + List crossEmpty = dao.getFilesByType(null, Collections.emptyList()); + assertNotNull(crossEmpty); + assertTrue(crossEmpty.isEmpty()); + + List crossNull = dao.getFilesByType(null, null); + assertNotNull(crossNull); + assertTrue(crossNull.isEmpty()); + + verify(mockConn, never()).prepareStatement(anyString()); + verify(mockConn, never()).createStatement(); + } + + /** + * SQL exceptions return {@code null} on single-path queries. 
+ */ + @Test + public void getFilesByType_singlePath_sqlException_returnsNull() throws Exception + { + when(mockConn.prepareStatement(anyString())).thenThrow(new SQLException("boom")); + + List failure = + dao.getFilesByType(7L, Collections.singletonList(REGULAR)); + assertNull("SQL exception on single-path enumeration must surface as null", failure); + } + + /** + * SQL exceptions return {@code null} on cross-path queries. + */ + @Test + public void getFilesByType_crossPath_sqlException_returnsNull() throws Exception + { + when(mockConn.prepareStatement(anyString())).thenThrow(new SQLException("boom")); + + List failure = + dao.getFilesByType(null, Collections.singletonList(RETIRED)); + assertNull("SQL exception on cross-path enumeration must surface as null", failure); + } + + // ========================================================================= + // deleteByIds + // ========================================================================= + + /** + * deleteByIds batches {@code FILE_ID} deletes with one SQL template. + */ + @Test + public void deleteByIds_batchesBindsAndIssuesSingleSqlTemplate() throws Exception + { + PreparedStatement pst = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst); + + assertTrue(dao.deleteByIds(Arrays.asList(11L, 22L, 33L))); + + ArgumentCaptor sqlCaptor = ArgumentCaptor.forClass(String.class); + verify(mockConn).prepareStatement(sqlCaptor.capture()); + String sql = sqlCaptor.getValue(); + assertEquals("deleteByIds must use a positional FILE_ID=? 
template (batched)", + "DELETE FROM FILES WHERE FILE_ID=?", sql); + + verify(pst).setLong(1, 11L); + verify(pst).setLong(1, 22L); + verify(pst).setLong(1, 33L); + verify(pst, times(3)).addBatch(); + verify(pst).executeBatch(); + } + + // ========================================================================= + // helpers + // ========================================================================= + + private PreparedStatement stubPreparedStatementForInsert() throws SQLException + { + PreparedStatement pst = mock(PreparedStatement.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst); + when(pst.executeUpdate()).thenReturn(1); + // Stub LAST_INSERT_ID() on the insert statement. + ResultSet idRs = mock(ResultSet.class); + when(pst.executeQuery(anyString())).thenReturn(idRs); + when(idRs.next()).thenReturn(true); + when(idRs.getLong(1)).thenReturn(1L); + return pst; + } + + private PreparedStatement stubEmptyQuery() throws SQLException + { + PreparedStatement pst = mock(PreparedStatement.class); + ResultSet rs = mock(ResultSet.class); + when(mockConn.prepareStatement(anyString())).thenReturn(pst); + when(pst.executeQuery()).thenReturn(rs); + when(rs.next()).thenReturn(false); + return pst; + } + + private static MetadataProto.File.Builder baseFile(String name, int typeValue) + { + return MetadataProto.File.newBuilder() + .setName(name) + .setTypeValue(typeValue) + .setNumRowGroup(1) + .setMinRowId(0L) + .setMaxRowId(0L) + .setPathId(1L); + } + + private static void stubFileRow(ResultSet rs, long id, String name, int typeValue, + long pathId, long cleanupAt, boolean cleanupAtWasNull) + throws SQLException + { + when(rs.getLong("FILE_ID")).thenReturn(id); + when(rs.getString("FILE_NAME")).thenReturn(name); + when(rs.getInt("FILE_TYPE")).thenReturn(typeValue); + when(rs.getInt("FILE_NUM_RG")).thenReturn(1); + when(rs.getLong("FILE_MIN_ROW_ID")).thenReturn(0L); + when(rs.getLong("FILE_MAX_ROW_ID")).thenReturn(0L); + 
when(rs.getLong("PATHS_PATH_ID")).thenReturn(pathId); + when(rs.getLong("FILE_CLEANUP_AT")).thenReturn(cleanupAt); + when(rs.wasNull()).thenReturn(cleanupAtWasNull); + } + + private static int countOccurrences(String haystack, char needle) + { + int n = 0; + for (int i = 0; i < haystack.length(); i++) + { + if (haystack.charAt(i) == needle) n++; + } + return n; + } + + /** + * Swap the {@link MetaDBUtil} singleton connection for this test. + */ + private static Connection swapConnection(Connection replacement) throws Exception + { + Field f = MetaDBUtil.class.getDeclaredField("connection"); + f.setAccessible(true); + Connection previous = (Connection) f.get(MetaDBUtil.Instance()); + f.set(MetaDBUtil.Instance(), replacement); + return previous; + } +} diff --git a/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/retina/TestRetinaServer.java b/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/retina/TestRetinaServer.java index 1167cf6e86..6e3e360326 100644 --- a/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/retina/TestRetinaServer.java +++ b/pixels-daemon/src/test/java/io/pixelsdb/pixels/daemon/retina/TestRetinaServer.java @@ -19,12 +19,56 @@ */ package io.pixelsdb.pixels.daemon.retina; +import com.google.protobuf.ByteString; +import io.grpc.stub.StreamObserver; +import io.pixelsdb.pixels.common.exception.IndexException; +import io.pixelsdb.pixels.common.exception.MetadataException; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.index.ResolvedPrimary; +import io.pixelsdb.pixels.common.index.service.IndexService; +import io.pixelsdb.pixels.common.index.service.LocalIndexService; +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.Layout; +import io.pixelsdb.pixels.common.metadata.domain.Path; +import io.pixelsdb.pixels.common.metadata.domain.Permission; +import 
io.pixelsdb.pixels.common.metadata.domain.Schema; +import io.pixelsdb.pixels.common.metadata.domain.Table; import io.pixelsdb.pixels.daemon.ServerContainer; import io.pixelsdb.pixels.daemon.metadata.MetadataServer; +import io.pixelsdb.pixels.index.IndexProto; +import io.pixelsdb.pixels.retina.RetinaProto; +import io.pixelsdb.pixels.retina.RetinaResourceManager; +import org.junit.Ignore; import org.junit.Test; +import org.mockito.ArgumentMatchers; +import org.mockito.InOrder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicReference; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.anyLong; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.eq; +import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; public class TestRetinaServer { + @Ignore("Integration test requires real metadata server, metadata DB, and fixed local ports.") @Test public void testRetinaServer() { @@ -34,4 +78,592 @@ public void testRetinaServer() RetinaServer retinaServer = new RetinaServer(18890); container.addServer("retina server", retinaServer); } + + @Test + public void testRetinaServerImplInitializationFailureIsFailClosed() throws Exception + { + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(LocalIndexService.class); + RetinaResourceManager resourceManager = mock(RetinaResourceManager.class); + + when(metadataService.getSchemas()).thenThrow(new MetadataException("metadata unavailable")); 
+ + try + { + RetinaServerImpl server = new RetinaServerImpl(metadataService, indexService, resourceManager); + fail("RetinaServerImpl must fail closed when initialization fails: " + server); + } + catch (IllegalStateException e) + { + assertTrue(e.getMessage().contains("Failed to initialize RetinaServerImpl")); + } + + verify(resourceManager).recoverOffloadCheckpoints(); + verify(resourceManager, never()).startBackgroundGc(); + } + + @Test + public void testRetinaServerImplStartsBackgroundGcAfterSuccessfulInitialization() throws Exception + { + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(LocalIndexService.class); + RetinaResourceManager resourceManager = mock(RetinaResourceManager.class); + + Schema schema = new Schema(); + schema.setName("gc_schema"); + Table table = new Table(); + table.setName("gc_table"); + Path orderedPath = new Path(); + orderedPath.setId(11L); + orderedPath.setUri("file:///tmp/pixels/ordered"); + Path compactPath = new Path(); + compactPath.setId(12L); + compactPath.setUri("file:///tmp/pixels/compact"); + Layout layout = new Layout(); + layout.setPermission(Permission.READ_WRITE); + layout.setOrderedPaths(Collections.singletonList(orderedPath)); + layout.setCompactPaths(Collections.singletonList(compactPath)); + File orderedFile = new File(); + orderedFile.setName("ordered.pxl"); + File compactFile = new File(); + compactFile.setName("compact.pxl"); + List lifecycleEvents = Collections.synchronizedList(new ArrayList<>()); + + when(metadataService.getSchemas()).thenReturn(Collections.singletonList(schema)); + when(metadataService.getTables(schema.getName())).thenReturn(Collections.singletonList(table)); + when(metadataService.getLayouts(schema.getName(), table.getName())).thenReturn(Collections.singletonList(layout)); + when(metadataService.getRegularFiles(orderedPath.getId())).thenReturn(Collections.singletonList(orderedFile)); + 
when(metadataService.getRegularFiles(compactPath.getId())).thenReturn(Collections.singletonList(compactFile)); + doAnswer(invocation -> { + lifecycleEvents.add("recover"); + return null; + }).when(resourceManager).recoverOffloadCheckpoints(); + doAnswer(invocation -> { + lifecycleEvents.add("visibility:" + invocation.getArgument(0)); + return null; + }).when(resourceManager).addVisibility(org.mockito.ArgumentMatchers.anyString()); + doAnswer(invocation -> { + lifecycleEvents.add("writeBuffer"); + return null; + }).when(resourceManager).addWriteBuffer(schema.getName(), table.getName()); + doAnswer(invocation -> { + lifecycleEvents.add("startGc"); + return null; + }).when(resourceManager).startBackgroundGc(); + + new RetinaServerImpl(metadataService, indexService, resourceManager); + + assertTrue(lifecycleEvents.indexOf("recover") >= 0); + assertTrue(lifecycleEvents.contains("visibility:file:///tmp/pixels/ordered/ordered.pxl")); + assertTrue(lifecycleEvents.contains("visibility:file:///tmp/pixels/compact/compact.pxl")); + int writeBufferIndex = lifecycleEvents.indexOf("writeBuffer"); + assertTrue(writeBufferIndex > lifecycleEvents.indexOf("recover")); + assertTrue(writeBufferIndex > lifecycleEvents.indexOf("visibility:file:///tmp/pixels/ordered/ordered.pxl")); + assertTrue(writeBufferIndex > lifecycleEvents.indexOf("visibility:file:///tmp/pixels/compact/compact.pxl")); + assertTrue(lifecycleEvents.indexOf("startGc") > writeBufferIndex); + verify(resourceManager).addVisibility("file:///tmp/pixels/ordered/ordered.pxl"); + verify(resourceManager).addVisibility("file:///tmp/pixels/compact/compact.pxl"); + verify(resourceManager).startBackgroundGc(); + } + + @Test + public void testRetinaServerImplBackgroundGcStartFailureIsFailClosed() throws Exception + { + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(LocalIndexService.class); + RetinaResourceManager resourceManager = mock(RetinaResourceManager.class); + + 
when(metadataService.getSchemas()).thenReturn(Collections.emptyList()); + doThrow(new RetinaException("gc disabled by invalid lifecycle")) + .when(resourceManager).startBackgroundGc(); + + try + { + RetinaServerImpl server = new RetinaServerImpl(metadataService, indexService, resourceManager); + fail("RetinaServerImpl must fail closed when background GC cannot start: " + server); + } + catch (IllegalStateException e) + { + assertTrue(e.getMessage().contains("Failed to initialize RetinaServerImpl")); + } + + InOrder inOrder = inOrder(resourceManager); + inOrder.verify(resourceManager).recoverOffloadCheckpoints(); + inOrder.verify(resourceManager).startBackgroundGc(); + } + + // ===================================================================== + // UpdateRecord write paths. + // ===================================================================== + + /** + * Build a RetinaServerImpl with the bare-minimum mocks needed to reach updateRecord + * without performing real metadata work or any background initialisation. 
+ */ + private RetinaServerImpl buildServerWithLocalIndex(LocalIndexService localIndex, + RetinaResourceManager rm) throws Exception + { + MetadataService metadataService = mock(MetadataService.class); + when(metadataService.getSchemas()).thenReturn(Collections.emptyList()); + return new RetinaServerImpl(metadataService, localIndex, rm); + } + + private static IndexProto.IndexKey makeKey(long tableId, long indexId, String key, long ts) + { + return IndexProto.IndexKey.newBuilder() + .setTableId(tableId).setIndexId(indexId) + .setKey(ByteString.copyFromUtf8(key)) + .setTimestamp(ts) + .build(); + } + + private static IndexProto.RowLocation makeLoc(long fileId, int rgId, int rgRowOffset) + { + return IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(rgId).setRgRowOffset(rgRowOffset) + .build(); + } + + private static RetinaProto.UpdateRecordRequest makeDeleteRequest(long tableId, long indexId, + String schema, String table, + long ts, String... keys) + { + RetinaProto.TableUpdateData.Builder tud = RetinaProto.TableUpdateData.newBuilder() + .setTableName(table) + .setPrimaryIndexId(indexId) + .setTimestamp(ts); + for (String k : keys) + { + tud.addDeleteData(RetinaProto.DeleteData.newBuilder() + .addIndexKeys(makeKey(tableId, indexId, k, ts))); + } + return RetinaProto.UpdateRecordRequest.newBuilder() + .setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName(schema) + .addTableUpdateData(tud) + .build(); + } + + private static RetinaProto.UpdateRecordRequest makeInsertRequest(long tableId, long indexId, + String schema, String table, + long ts, String... 
keys) + { + RetinaProto.TableUpdateData.Builder tud = RetinaProto.TableUpdateData.newBuilder() + .setTableName(table) + .setPrimaryIndexId(indexId) + .setTimestamp(ts); + for (String k : keys) + { + tud.addInsertData(RetinaProto.InsertData.newBuilder() + .addIndexKeys(makeKey(tableId, indexId, k, ts)) + .addColValues(ByteString.copyFromUtf8("v-" + k))); + } + return RetinaProto.UpdateRecordRequest.newBuilder() + .setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName(schema) + .addTableUpdateData(tud) + .build(); + } + + private static RetinaProto.UpdateRecordRequest makeDeleteWithSecondaryRequest( + long tableId, long primaryIndexId, long secondaryIndexId, + String schema, String table, long ts, String key) + { + RetinaProto.TableUpdateData.Builder tud = RetinaProto.TableUpdateData.newBuilder() + .setTableName(table) + .setPrimaryIndexId(primaryIndexId) + .setTimestamp(ts) + .addDeleteData(RetinaProto.DeleteData.newBuilder() + .addIndexKeys(makeKey(tableId, primaryIndexId, key, ts)) + .addIndexKeys(makeKey(tableId, secondaryIndexId, "sec-" + key, ts))); + return RetinaProto.UpdateRecordRequest.newBuilder() + .setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName(schema) + .addTableUpdateData(tud) + .build(); + } + + private static IndexProto.PrimaryIndexEntry.Builder makePrimaryEntryBuilder( + IndexProto.IndexKey key, long rowId, IndexProto.RowLocation location) + { + return IndexProto.PrimaryIndexEntry.newBuilder() + .setIndexKey(key) + .setRowId(rowId) + .setRowLocation(location); + } + + @Test + public void testStagedDeleteCallsResolveBeforeDeleteRecordThenTombstone() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 12345L; + IndexProto.IndexKey foundKey = makeKey(tableId, indexId, "k-found", ts); + IndexProto.IndexKey missKey = makeKey(tableId, indexId, 
"k-miss", ts); + IndexProto.RowLocation foundLoc = makeLoc(7L, 0, 3); + + when(localIndex.resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any())) + .thenReturn(Arrays.asList( + Optional.of(new ResolvedPrimary(42L, foundLoc)), + Optional.empty())); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + + AtomicReference respHolder = new AtomicReference<>(); + StreamObserver observer = new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }; + + server.updateRecord(makeDeleteRequest(tableId, indexId, "s", "tbl", ts, "k-found", "k-miss"), observer); + assertNotNull(respHolder.get()); + assertEquals(0, respHolder.get().getHeader().getErrorCode()); + + InOrder inOrder = inOrder(localIndex, rm); + inOrder.verify(localIndex).resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + // Only the FOUND key triggers deleteRecord and contributes to the tombstone list. 
+ inOrder.verify(rm).deleteRecord(eq(foundLoc), eq(ts)); + inOrder.verify(localIndex).deletePrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + eq(Collections.singletonList(foundKey)), any()); + + verify(localIndex, never()).deletePrimaryIndexEntries(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedDeleteAllNotFoundProducesNoTombstone() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 1L; + when(localIndex.resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any())) + .thenReturn(Collections.singletonList(Optional.empty())); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(makeDeleteRequest(tableId, indexId, "s", "tbl", ts, "absent"), + new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(0, respHolder.get().getHeader().getErrorCode()); + verify(rm, never()).deleteRecord(any(IndexProto.RowLocation.class), anyLong()); + verify(localIndex, never()).deletePrimaryIndexEntriesOnly(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedDeleteSecondaryFailureIsBestEffort() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long primaryIndexId = 100L; + long secondaryIndexId = 200L; + long ts = 9L; + IndexProto.RowLocation loc = makeLoc(7L, 0, 3); + when(localIndex.resolvePrimary(eq(tableId), eq(primaryIndexId), + ArgumentMatchers.>any(), any())) + 
.thenReturn(Collections.singletonList(Optional.of(new ResolvedPrimary(42L, loc)))); + doThrow(new IndexException("secondary already tombstoned")) + .when(localIndex).deleteSecondaryIndexEntries(eq(tableId), eq(secondaryIndexId), + ArgumentMatchers.>any(), any()); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(makeDeleteWithSecondaryRequest(tableId, primaryIndexId, secondaryIndexId, + "s", "tbl", ts, "k"), + new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(0, respHolder.get().getHeader().getErrorCode()); + verify(rm).deleteRecord(eq(loc), eq(ts)); + verify(localIndex).deletePrimaryIndexEntriesOnly(eq(tableId), eq(primaryIndexId), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedInsertWritesMainBeforePrimary() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 123L; + IndexProto.IndexKey key = makeKey(tableId, indexId, "k-insert", ts); + IndexProto.RowLocation loc = makeLoc(70L, 0, 4); + when(rm.insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), eq(ts), eq(0))) + .thenReturn(makePrimaryEntryBuilder(key, 51L, loc)); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(makeInsertRequest(tableId, indexId, "s", "tbl", ts, "k-insert"), + new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { 
} + }); + + assertNotNull(respHolder.get()); + assertEquals(0, respHolder.get().getHeader().getErrorCode()); + InOrder inOrder = inOrder(localIndex); + inOrder.verify(localIndex).putMainIndexEntriesOnly(eq(tableId), + ArgumentMatchers.>any()); + inOrder.verify(localIndex).putPrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedInsertPrimaryFailureMasksInsertedRows() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 124L; + IndexProto.IndexKey key0 = makeKey(tableId, indexId, "k0", ts); + IndexProto.IndexKey key1 = makeKey(tableId, indexId, "k1", ts); + IndexProto.RowLocation loc0 = makeLoc(71L, 0, 0); + IndexProto.RowLocation loc1 = makeLoc(71L, 0, 1); + when(rm.insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), eq(ts), eq(0))) + .thenReturn(makePrimaryEntryBuilder(key0, 61L, loc0), + makePrimaryEntryBuilder(key1, 62L, loc1)); + doThrow(new IndexException("primary write failed")) + .when(localIndex).putPrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(makeInsertRequest(tableId, indexId, "s", "tbl", ts, "k0", "k1"), + new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(2, respHolder.get().getHeader().getErrorCode()); + verify(rm).deleteRecord(eq(loc0), eq(ts)); + verify(rm).deleteRecord(eq(loc1), eq(ts)); + } + + @Test + public void testUpdateDataUsesStagedUpdateIndexPath() throws Exception + { + LocalIndexService localIndex 
= mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long secondaryIndexId = 200L; + long ts = 7L; + IndexProto.IndexKey key = makeKey(tableId, indexId, "k-upd", ts); + IndexProto.IndexKey secondaryKey = makeKey(tableId, secondaryIndexId, "sec-k-upd", ts); + IndexProto.RowLocation prevLoc = makeLoc(7L, 0, 3); + IndexProto.RowLocation newLoc = makeLoc(70L, 0, 4); + + when(localIndex.resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any())) + .thenReturn(Collections.singletonList(Optional.of(new ResolvedPrimary(42L, prevLoc)))); + when(rm.insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), eq(ts), eq(0))) + .thenReturn(makePrimaryEntryBuilder(key, 99L, newLoc)); + + RetinaProto.UpdateRecordRequest req = RetinaProto.UpdateRecordRequest.newBuilder() + .setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName("s") + .addTableUpdateData(RetinaProto.TableUpdateData.newBuilder() + .setTableName("tbl") + .setPrimaryIndexId(indexId) + .setTimestamp(ts) + .addUpdateData(RetinaProto.UpdateData.newBuilder() + .addIndexKeys(key) + .addIndexKeys(secondaryKey) + .addColValues(ByteString.copyFromUtf8("v")))) + .build(); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(req, new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(0, respHolder.get().getHeader().getErrorCode()); + + InOrder inOrder = inOrder(localIndex, rm); + inOrder.verify(localIndex).resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + inOrder.verify(rm).insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), + eq(ts), eq(0)); 
+ inOrder.verify(localIndex).putMainIndexEntriesOnly(eq(tableId), + ArgumentMatchers.>any()); + inOrder.verify(localIndex).updatePrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + inOrder.verify(rm).deleteRecord(eq(prevLoc), eq(ts)); + inOrder.verify(localIndex).updateSecondaryIndexEntries(eq(tableId), eq(secondaryIndexId), + ArgumentMatchers.>any(), any()); + + verify(localIndex, never()).deletePrimaryIndexEntriesOnly(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + verify(localIndex, never()).putPrimaryIndexEntriesOnly(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + verify(localIndex, never()).updatePrimaryIndexEntries(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedUpdatePrimaryFailureMasksInsertedRows() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 8L; + IndexProto.IndexKey key = makeKey(tableId, indexId, "k-upd-fail", ts); + IndexProto.RowLocation prevLoc = makeLoc(7L, 0, 3); + IndexProto.RowLocation newLoc = makeLoc(70L, 0, 4); + + when(localIndex.resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any())) + .thenReturn(Collections.singletonList(Optional.of(new ResolvedPrimary(42L, prevLoc)))); + when(rm.insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), eq(ts), eq(0))) + .thenReturn(makePrimaryEntryBuilder(key, 99L, newLoc)); + doThrow(new IndexException("primary update failed")) + .when(localIndex).updatePrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + + RetinaProto.UpdateRecordRequest req = RetinaProto.UpdateRecordRequest.newBuilder() + .setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName("s") + .addTableUpdateData(RetinaProto.TableUpdateData.newBuilder() + .setTableName("tbl") + .setPrimaryIndexId(indexId) + 
.setTimestamp(ts) + .addUpdateData(RetinaProto.UpdateData.newBuilder() + .addIndexKeys(key) + .addColValues(ByteString.copyFromUtf8("v")))) + .build(); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(req, new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(2, respHolder.get().getHeader().getErrorCode()); + + InOrder inOrder = inOrder(localIndex, rm); + inOrder.verify(localIndex).resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + inOrder.verify(rm).insertRecord(eq("s"), eq("tbl"), ArgumentMatchers.any(), + eq(ts), eq(0)); + inOrder.verify(localIndex).putMainIndexEntriesOnly(eq(tableId), + ArgumentMatchers.>any()); + inOrder.verify(localIndex).updatePrimaryIndexEntriesOnly(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any()); + inOrder.verify(rm).deleteRecord(eq(newLoc), eq(ts)); + verify(rm, never()).deleteRecord(eq(prevLoc), eq(ts)); + verify(localIndex, never()).putPrimaryIndexEntriesOnly(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testStagedUpdateMissingPrimaryFailsBeforeAppend() throws Exception + { + LocalIndexService localIndex = mock(LocalIndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + + long tableId = 1L; + long indexId = 100L; + long ts = 9L; + IndexProto.IndexKey key = makeKey(tableId, indexId, "k-upd-missing", ts); + + when(localIndex.resolvePrimary(eq(tableId), eq(indexId), + ArgumentMatchers.>any(), any())) + .thenReturn(Collections.singletonList(Optional.empty())); + + RetinaProto.UpdateRecordRequest req = RetinaProto.UpdateRecordRequest.newBuilder() + 
.setHeader(RetinaProto.RequestHeader.newBuilder().setToken("t")) + .setSchemaName("s") + .addTableUpdateData(RetinaProto.TableUpdateData.newBuilder() + .setTableName("tbl") + .setPrimaryIndexId(indexId) + .setTimestamp(ts) + .addUpdateData(RetinaProto.UpdateData.newBuilder() + .addIndexKeys(key) + .addColValues(ByteString.copyFromUtf8("v")))) + .build(); + + RetinaServerImpl server = buildServerWithLocalIndex(localIndex, rm); + AtomicReference respHolder = new AtomicReference<>(); + server.updateRecord(req, new StreamObserver() + { + @Override public void onNext(RetinaProto.UpdateRecordResponse v) { respHolder.set(v); } + @Override public void onError(Throwable t) { fail(t.getMessage()); } + @Override public void onCompleted() { } + }); + + assertNotNull(respHolder.get()); + assertEquals(2, respHolder.get().getHeader().getErrorCode()); + verify(rm, never()).insertRecord(ArgumentMatchers.anyString(), ArgumentMatchers.anyString(), + ArgumentMatchers.any(), ArgumentMatchers.anyLong(), ArgumentMatchers.anyInt()); + verify(localIndex, never()).putMainIndexEntriesOnly(anyLong(), + ArgumentMatchers.>any()); + verify(localIndex, never()).updatePrimaryIndexEntriesOnly(anyLong(), anyLong(), + ArgumentMatchers.>any(), any()); + } + + @Test + public void testFailsClosedOnNonLocalIndexService() throws Exception + { + // UpdateRecord uses LocalIndexService-only primary-index operations. 
+ IndexService nonLocal = mock(IndexService.class); + RetinaResourceManager rm = mock(RetinaResourceManager.class); + MetadataService md = mock(MetadataService.class); + try + { + new RetinaServerImpl(md, nonLocal, rm); + fail("RetinaServerImpl must require LocalIndexService"); + } + catch (IllegalStateException e) + { + assertTrue(e.getMessage().contains("LocalIndexService") + || (e.getCause() != null && e.getCause().getMessage() != null + && e.getCause().getMessage().contains("LocalIndexService"))); + } + } } diff --git a/pixels-index/pixels-index-main-sqlite/README.md b/pixels-index/pixels-index-main-sqlite/README.md new file mode 100644 index 0000000000..74d53c74aa --- /dev/null +++ b/pixels-index/pixels-index-main-sqlite/README.md @@ -0,0 +1,156 @@ +# SQLite MainIndex + +This module implements the SQLite-backed `MainIndex`. It stores +`rowId -> RowLocation` mappings as row-id ranges in SQLite and uses a per-file +durable marker to make file-scoped persistence retryable. + +The primary table is `row_id_ranges`. A file-scoped persistence operation writes +the ranges for one file and one row in `row_id_range_flush_markers` in the same +SQLite transaction. The marker records the `file_id`, entry count, range count, +and a deterministic SHA-256 hash of the persisted ranges. + +If a later retry sees a matching marker, the file's ranges are already durable. +If it sees conflicting marker metadata, or ranges without a matching marker, the +backend fails closed instead of silently accepting ambiguous index state. + +## Test Setup + +Commands below assume they are run from the repository root: + +```bash +cd /path/to/pixels +``` + +If you are currently in this module directory, run: + +```bash +cd ../.. +``` + +The root `pom.xml` configures Surefire with `skipTests=true`, so +`mvn test -Dtest=...` still reports `Tests are skipped` for this module. To run +only a few SQLite tests without changing the POM, compile the module first and +then invoke Maven Failsafe directly. 
Failsafe is not bound by the inherited +Surefire `skipTests=true` setting. + +## Compile The Module + +```bash +mvn -pl pixels-index/pixels-index-main-sqlite -am \ + test-compile +``` + +This compiles the module and its reactor dependencies, including test classes, +but does not execute the JUnit tests. + +## Correctness Tests + +Run the main correctness suite: + +```bash +mvn -pl pixels-index/pixels-index-main-sqlite -am \ + test-compile \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:integration-test \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:verify \ + -Dit.test=TestSqliteMainIndex \ + -DfailIfNoTests=false +``` + +This covers normal put/get/delete behavior and the durable flush marker cases: + +- missing `fileId` flush is a no-op success; +- normal put -> flush -> lookup/delete; +- matching durable marker is accepted as an idempotent retry; +- marker metadata/hash conflicts fail closed and leave the buffer retryable; +- dirty ranges without a marker fail closed and leave the buffer retryable; +- marker insert failure rolls back the range inserts; +- close/reopen flushes cached ranges and keeps rows readable. + +Run the JDBC range query correctness test: + +```bash +mvn -pl pixels-index/pixels-index-main-sqlite -am \ + test-compile \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:integration-test \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:verify \ + -Dit.test=TestSqliteMainIndexQuery \ + -DfailIfNoTests=false +``` + +This test writes a small file-scoped set of entries, flushes it, queries +`row_id_ranges` through JDBC, and asserts the persisted ranges are correct. + +## Performance Benchmark + +The benchmark is not a correctness gate. 
It is disabled by default and only runs +when explicitly enabled: + +```bash +mvn -pl pixels-index/pixels-index-main-sqlite -am \ + test-compile \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:integration-test \ + org.apache.maven.plugins:maven-failsafe-plugin:2.22.2:verify \ + -Dit.test=TestSqliteMainIndexBenchmark \ + -DfailIfNoTests=false \ + -Dpixels.sqlite.main.index.benchmark=true \ + -Dpixels.sqlite.main.index.benchmark.contiguousRows=1000000 \ + -Dpixels.sqlite.main.index.benchmark.fragmentedRows=10000 +``` + +Parameters: + +- `pixels.sqlite.main.index.benchmark`: must be `true` to run the benchmark. +- `pixels.sqlite.main.index.benchmark.contiguousRows`: row count for contiguous + rowId workloads. Default: `1000000`. +- `pixels.sqlite.main.index.benchmark.fragmentedRows`: row count for fragmented + rowId workloads. Default: `100000`. + +The benchmark prints a parameter block first, for example: + +```text +SQLite MainIndex benchmark parameters + -Dpixels.sqlite.main.index.benchmark=true + -Dpixels.sqlite.main.index.benchmark.contiguousRows=1000000 + -Dpixels.sqlite.main.index.benchmark.fragmentedRows=10000 + index.sqlite.path=/tmp/sqlite + java.version=23.0.2 + os.name=Linux + os.arch=amd64 +``` + +Then it prints a summary table: + +```text +SQLite MainIndex benchmark summary +rows = logical MainIndex entries; ranges = persisted row_id_ranges. +markerRetry = retry when a matching per-file durable marker already exists. +emptyRetry = immediate second flush after marker retry discarded the buffer. +workload shape rows ranges markers put(ms) put rows/s flush(ms) flush ranges/s markerRetry(ms) emptyRetry(ms) get(ms) get rows/s +hot put/get path contiguous, pre-flush get 1,000,000 1 1 ... +contiguous first flush contiguous rows -> 1 range 1,000,000 1 1 ... +fragmented first flush 1-row gaps -> many ranges 10,000 10,000 1 ... +marker-hit retry flush matching marker already durable 10,000 10,000 1 ... 
+``` + +How to read the table: + +- `rows`: logical entries inserted into `MainIndex`. +- `ranges`: persisted `row_id_ranges` count after flush. +- `markers`: persisted `row_id_range_flush_markers` count. +- `put(ms)` / `put rows/s`: in-memory `putEntry` hot path. +- `flush(ms)` / `flush ranges/s`: first durable flush path. +- `markerRetry(ms)`: retry path when SQLite already has a matching durable marker. +- `emptyRetry(ms)`: immediate second flush after the marker retry discarded the buffer. +- `get(ms)` / `get rows/s`: lookup cost after the workload setup. + +For durable flush marker overhead, focus on: + +- `contiguous first flush` `flush(ms)`: best-case file flush; many rows become one + range plus one marker. +- `fragmented first flush` `flush(ms)`: many persisted ranges plus one marker. +- `marker-hit retry flush` `markerRetry(ms)`: crash/retry path after the previous + transaction committed but the in-memory buffer was not discarded. + +Large fragmented workloads can take much longer than contiguous workloads. That +is expected because `N` fragmented rows produce `N` SQLite ranges, while +contiguous rows often collapse into a single range. 
diff --git a/pixels-index/pixels-index-main-sqlite/src/main/java/io/pixelsdb/pixels/index/main/sqlite/SqliteMainIndex.java b/pixels-index/pixels-index-main-sqlite/src/main/java/io/pixelsdb/pixels/index/main/sqlite/SqliteMainIndex.java index be60cbf016..35581dc2be 100644 --- a/pixels-index/pixels-index-main-sqlite/src/main/java/io/pixelsdb/pixels/index/main/sqlite/SqliteMainIndex.java +++ b/pixels-index/pixels-index-main-sqlite/src/main/java/io/pixelsdb/pixels/index/main/sqlite/SqliteMainIndex.java @@ -36,7 +36,10 @@ import java.io.File; import java.io.IOException; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.sql.*; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -64,6 +67,13 @@ public class SqliteMainIndex implements MainIndex "(row_id_start BIGINT NOT NULL, row_id_end BIGINT NOT NULL, file_id BIGINT NOT NULL, rg_id INT NOT NULL," + "rg_row_offset_start INT NOT NULL, rg_row_offset_end INT NOT NULL, PRIMARY KEY (row_id_start, row_id_end))"; + /** + * The SQL statement to create the per-file flush marker table. + */ + private static final String createFlushMarkerTableSql = "CREATE TABLE IF NOT EXISTS row_id_range_flush_markers " + + "(file_id BIGINT NOT NULL PRIMARY KEY, entry_count BIGINT NOT NULL, range_count BIGINT NOT NULL, " + + "range_hash BLOB NOT NULL, committed_at_ms BIGINT NOT NULL)"; + /** * The SQL statement to query the row id range that covers the given row id (the two ? are of the same value). */ @@ -85,6 +95,42 @@ public class SqliteMainIndex implements MainIndex */ private static final String insertRangeSql = "INSERT INTO row_id_ranges VALUES(?, ?, ?, ?, ?, ?)"; + /** + * The SQL statement to query a per-file flush marker. 
+ */ + private static final String queryFlushMarkerSql = + "SELECT entry_count, range_count, range_hash FROM row_id_range_flush_markers WHERE file_id = ?"; + + /** + * The SQL statement to insert a per-file flush marker. + */ + private static final String insertFlushMarkerSql = + "INSERT INTO row_id_range_flush_markers VALUES(?, ?, ?, ?, ?)"; + + private static final class FlushMarker + { + private final long fileId; + private final long entryCount; + private final long rangeCount; + private final byte[] rangeHash; + + private FlushMarker(long fileId, long entryCount, long rangeCount, byte[] rangeHash) + { + this.fileId = fileId; + this.entryCount = entryCount; + this.rangeCount = rangeCount; + this.rangeHash = rangeHash; + } + + private boolean matches(MainIndexBuffer.FlushSnapshot snapshot, byte[] snapshotHash) + { + return this.fileId == snapshot.getFileId() + && this.entryCount == snapshot.getEntryCount() + && this.rangeCount == snapshot.getRowIdRanges().size() + && Arrays.equals(this.rangeHash, snapshotHash); + } + } + private final long tableId; private final String sqlitePath; private final MainIndexBuffer indexBuffer; @@ -116,6 +162,7 @@ public SqliteMainIndex(long tableId, String sqlitePath) throws MainIndexExceptio try (Statement statement = connection.createStatement()) { statement.execute(createTableSql); + statement.execute(createFlushMarkerTableSql); } } catch (SQLException e) @@ -194,12 +241,9 @@ public IndexProto.RowLocation getLocation(long rowId) throws MainIndexException } if (location == null) { + // Return null when the rowId has no mapping in either the buffer or + // SQLite, leaving the caller to decide how to handle the miss. 
location = getRowLocationFromSqlite(rowId); - if (location == null) - { - throw new MainIndexException("Failed to get row location for rowId=" + rowId - + " (tableId=" + tableId + ")"); - } } return location; } @@ -213,18 +257,18 @@ public List getLocations(List rowIds) throws MainI { for (long rowId : rowIds) { - IndexProto.RowLocation location; - location = this.indexBuffer.lookup(rowId); + IndexProto.RowLocation location = this.indexBuffer.lookup(rowId); if (location == null) { location = getRowLocationFromSqlite(rowId); - if (location == null) - { - throw new MainIndexException("Failed to get row location for rowId=" + rowId - + " (tableId=" + tableId + ")"); - } } - builder.add(location); + // Skip rowIds that have no mapping in either the buffer or SQLite; + // the returned list contains only the resolvable locations and the + // caller decides how to handle the missing ones. + if (location != null) + { + builder.add(location); + } } } finally @@ -312,31 +356,68 @@ public List putEntries(List primaryEntrie @Override public boolean deleteRowIdRange(RowIdRange rowIdRange) throws MainIndexException { + long rowIdStart = rowIdRange.getRowIdStart(); + long rowIdEnd = rowIdRange.getRowIdEnd(); + if (rowIdEnd <= rowIdStart) + { + throw new MainIndexException("Invalid row id range to delete: [" + rowIdStart + ", " + rowIdEnd + ")"); + } + this.dbRwLock.writeLock().lock(); - try (PreparedStatement pst = connection.prepareStatement(deleteRangesSql)) - { - long rowIdStart = rowIdRange.getRowIdStart(); - long rowIdEnd = rowIdRange.getRowIdEnd(); - pst.setLong(1, rowIdStart); - pst.setLong(2, rowIdEnd); - RowIdRange leftBorderRange = getRowIdRangeFromSqlite(rowIdStart); - RowIdRange rightBorderRange = getRowIdRangeFromSqlite(rowIdEnd - 1); - boolean res = true; - if (leftBorderRange != null) + try + { + boolean originalAutoCommit = this.connection.getAutoCommit(); + try + { + this.connection.setAutoCommit(false); + RowIdRange leftBorderRange = 
getRowIdRangeFromSqlite(rowIdStart); + RowIdRange rightBorderRange = getRowIdRangeFromSqlite(rowIdEnd - 1); + boolean res = true; + try (PreparedStatement pst = connection.prepareStatement(deleteRangesSql)) + { + pst.setLong(1, rowIdStart); + pst.setLong(2, rowIdEnd); + pst.executeUpdate(); + } + if (leftBorderRange != null && rightBorderRange != null && + leftBorderRange.getRowIdStart() == rightBorderRange.getRowIdStart() && + leftBorderRange.getRowIdEnd() == rightBorderRange.getRowIdEnd()) + { + res &= trimSingleOverlappingRange(leftBorderRange, rowIdStart, rowIdEnd); + } + else + { + if (leftBorderRange != null && leftBorderRange.getRowIdStart() < rowIdStart && + rowIdStart < leftBorderRange.getRowIdEnd()) + { + int width = (int) (rowIdStart - leftBorderRange.getRowIdStart()); + RowIdRange newLeftBorderRange = leftBorderRange.toBuilder() + .setRowIdEnd(rowIdStart) + .setRgRowOffsetEnd(leftBorderRange.getRgRowOffsetStart() + width).build(); + res &= updateRowIdRangeWidth(leftBorderRange, newLeftBorderRange); + } + if (rightBorderRange != null && rightBorderRange.getRowIdStart() < rowIdEnd && + rowIdEnd < rightBorderRange.getRowIdEnd()) + { + int width = (int) (rightBorderRange.getRowIdEnd() - rowIdEnd); + RowIdRange newRightBorderRange = rightBorderRange.toBuilder() + .setRowIdStart(rowIdEnd) + .setRgRowOffsetStart(rightBorderRange.getRgRowOffsetEnd() - width).build(); + res &= updateRowIdRangeWidth(rightBorderRange, newRightBorderRange); + } + } + this.connection.commit(); + return res; + } + catch (SQLException | RowIdException e) { - int width = (int) (rowIdStart - leftBorderRange.getRowIdStart()); - RowIdRange newLeftBorderRange = leftBorderRange.toBuilder() - .setRowIdEnd(rowIdStart).setRgRowOffsetEnd(leftBorderRange.getRgRowOffsetStart() + width).build(); - res &= updateRowIdRangeWidth(leftBorderRange, newLeftBorderRange); + rollbackQuietly(e); + throw e; } - if (rightBorderRange != null) + finally { - int width = (int) (rightBorderRange.getRowIdEnd() - 
rowIdEnd); - RowIdRange newRightBorderRange = rightBorderRange.toBuilder() - .setRowIdStart(rowIdEnd).setRgRowOffsetStart(rightBorderRange.getRgRowOffsetEnd() - width).build(); - res &= updateRowIdRangeWidth(rightBorderRange, newRightBorderRange); + this.connection.setAutoCommit(originalAutoCommit); } - return res; } catch (SQLException | RowIdException e) { @@ -350,6 +431,46 @@ public boolean deleteRowIdRange(RowIdRange rowIdRange) throws MainIndexException } } + private boolean trimSingleOverlappingRange(RowIdRange range, long rowIdStart, long rowIdEnd) + throws RowIdException, SQLException + { + if (range.getRowIdStart() < rowIdStart && rowIdEnd < range.getRowIdEnd()) + { + int leftWidth = (int) (rowIdStart - range.getRowIdStart()); + RowIdRange newLeftRange = range.toBuilder() + .setRowIdEnd(rowIdStart) + .setRgRowOffsetEnd(range.getRgRowOffsetStart() + leftWidth).build(); + int rightWidth = (int) (range.getRowIdEnd() - rowIdEnd); + RowIdRange newRightRange = range.toBuilder() + .setRowIdStart(rowIdEnd) + .setRgRowOffsetStart(range.getRgRowOffsetEnd() - rightWidth).build(); + boolean res = updateRowIdRangeWidth(range, newLeftRange); + try (PreparedStatement pst = this.connection.prepareStatement(insertRangeSql)) + { + bindRangeInsertStatement(pst, newRightRange); + res &= pst.executeUpdate() > 0; + } + return res; + } + if (range.getRowIdStart() < rowIdStart && rowIdStart < range.getRowIdEnd()) + { + int width = (int) (rowIdStart - range.getRowIdStart()); + RowIdRange newLeftRange = range.toBuilder() + .setRowIdEnd(rowIdStart) + .setRgRowOffsetEnd(range.getRgRowOffsetStart() + width).build(); + return updateRowIdRangeWidth(range, newLeftRange); + } + if (range.getRowIdStart() < rowIdEnd && rowIdEnd < range.getRowIdEnd()) + { + int width = (int) (range.getRowIdEnd() - rowIdEnd); + RowIdRange newRightRange = range.toBuilder() + .setRowIdStart(rowIdEnd) + .setRgRowOffsetStart(range.getRgRowOffsetEnd() - width).build(); + return updateRowIdRangeWidth(range, 
newRightRange); + } + return true; + } + /** * Get the row id range that contains the given row id from sqlite. * @param rowId the given row id @@ -392,6 +513,16 @@ private RowIdRange getRowIdRangeFromSqlite (long rowId) throws RowIdException } } + private static void bindRangeInsertStatement(PreparedStatement pst, RowIdRange range) throws SQLException + { + pst.setLong(1, range.getRowIdStart()); + pst.setLong(2, range.getRowIdEnd()); + pst.setLong(3, range.getFileId()); + pst.setInt(4, range.getRgId()); + pst.setInt(5, range.getRgRowOffsetStart()); + pst.setInt(6, range.getRgRowOffsetEnd()); + } + /** * Update the width of an existing row id range. * @param oldRange the old row id range @@ -424,22 +555,52 @@ public boolean flushCache(long fileId) throws MainIndexException this.dbRwLock.writeLock().lock(); try { - List rowIdRanges = this.indexBuffer.flush(fileId); - try (PreparedStatement pst = this.connection.prepareStatement(insertRangeSql)) + MainIndexBuffer.FlushSnapshot snapshot = this.indexBuffer.snapshotForFlush(fileId); + if (snapshot.isEmpty()) { - for (RowIdRange range : rowIdRanges) + return true; + } + + byte[] snapshotHash = buildRangeHash(snapshot.getRowIdRanges()); + FlushMarker marker = readFlushMarker(snapshot.getFileId()); + if (marker != null) + { + if (!marker.matches(snapshot, snapshotHash)) { - pst.setLong(1, range.getRowIdStart()); - pst.setLong(2, range.getRowIdEnd()); - pst.setLong(3, range.getFileId()); - pst.setInt(4, range.getRgId()); - pst.setInt(5, range.getRgRowOffsetStart()); - pst.setInt(6, range.getRgRowOffsetEnd()); - pst.addBatch(); + throw new MainIndexException("Conflicting flush marker already exists for fileId=" + fileId); } - pst.executeBatch(); + this.indexBuffer.discardFlushed(snapshot); return true; } + + boolean originalAutoCommit = this.connection.getAutoCommit(); + try + { + this.connection.setAutoCommit(false); + try (PreparedStatement pst = this.connection.prepareStatement(insertRangeSql)) + { + for (RowIdRange range 
: snapshot.getRowIdRanges()) + { + bindRangeInsertStatement(pst, range); + pst.addBatch(); + } + pst.executeBatch(); + } + insertFlushMarker(snapshot, snapshotHash); + this.connection.commit(); + } + catch (SQLException e) + { + rollbackQuietly(e); + throw e; + } + finally + { + this.connection.setAutoCommit(originalAutoCommit); + } + + this.indexBuffer.discardFlushed(snapshot); + return true; } catch (MainIndexException | SQLException e) { @@ -452,6 +613,86 @@ public boolean flushCache(long fileId) throws MainIndexException } } + private FlushMarker readFlushMarker(long fileId) throws SQLException + { + try (PreparedStatement pst = this.connection.prepareStatement(queryFlushMarkerSql)) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + if (!rs.next()) + { + return null; + } + return new FlushMarker(fileId, rs.getLong("entry_count"), + rs.getLong("range_count"), rs.getBytes("range_hash")); + } + } + } + + private void insertFlushMarker(MainIndexBuffer.FlushSnapshot snapshot, byte[] rangeHash) throws SQLException + { + try (PreparedStatement pst = this.connection.prepareStatement(insertFlushMarkerSql)) + { + pst.setLong(1, snapshot.getFileId()); + pst.setLong(2, snapshot.getEntryCount()); + pst.setLong(3, snapshot.getRowIdRanges().size()); + pst.setBytes(4, rangeHash); + pst.setLong(5, System.currentTimeMillis()); + pst.executeUpdate(); + } + } + + private byte[] buildRangeHash(List rowIdRanges) throws MainIndexException + { + try + { + MessageDigest digest = MessageDigest.getInstance("SHA-256"); + for (RowIdRange range : rowIdRanges) + { + updateLong(digest, range.getRowIdStart()); + updateLong(digest, range.getRowIdEnd()); + updateLong(digest, range.getFileId()); + updateInt(digest, range.getRgId()); + updateInt(digest, range.getRgRowOffsetStart()); + updateInt(digest, range.getRgRowOffsetEnd()); + } + return digest.digest(); + } + catch (NoSuchAlgorithmException e) + { + throw new MainIndexException("Failed to build range hash for main 
index flush", e); + } + } + + private static void updateLong(MessageDigest digest, long value) + { + for (int shift = 56; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private static void updateInt(MessageDigest digest, int value) + { + for (int shift = 24; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private void rollbackQuietly(Exception failure) + { + try + { + this.connection.rollback(); + } + catch (SQLException rollbackException) + { + failure.addSuppressed(rollbackException); + } + } + @Override public void close() throws IOException { @@ -517,4 +758,4 @@ public boolean closeAndRemove() throws MainIndexException } return true; } -} \ No newline at end of file +} diff --git a/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndex.java b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndex.java index ddf1a0aae3..e16b8fdf48 100644 --- a/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndex.java +++ b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndex.java @@ -29,11 +29,21 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; +import java.security.MessageDigest; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.Statement; +import java.time.Duration; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; @@ -42,16 +52,19 @@ public class 
TestSqliteMainIndex { - long tableId = 100L; + private static long nextTableId = 100L; + long tableId; + String sqlitePath; MainIndex mainIndex; @BeforeEach public void setUp() throws MainIndexException { + tableId = nextTableId++; // Create SQLite Directory try { - String sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); + sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); FileUtils.forceMkdir(new File(sqlitePath)); } catch (IOException e) @@ -65,12 +78,11 @@ public void setUp() throws MainIndexException @AfterEach public void tearDown() throws Exception { - mainIndex.close(); + MainIndexFactory.Instance().closeIndex(tableId, true); // Clear SQLite Directory try { - String sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); FileUtils.deleteDirectory(new File(sqlitePath)); } catch (IOException e) @@ -79,6 +91,428 @@ public void tearDown() throws Exception } } + @Test + public void testFlushCacheMissingFileIsNoop() throws MainIndexException + { + Assertions.assertTrue(mainIndex.flushCache(987654321L)); + } + + @Test + public void testFlushCacheAcceptsMatchingCommittedMarker() throws Exception + { + long fileId = 42L; + RowIdRange firstRange = new RowIdRange(5000L, 5002L, fileId, 0, 0, 2); + RowIdRange secondRange = new RowIdRange(5010L, 5011L, fileId, 1, 0, 1); + List ranges = new ArrayList<>(); + ranges.add(firstRange); + ranges.add(secondRange); + putMainIndexEntry(5000L, fileId, 0, 0); + putMainIndexEntry(5001L, fileId, 0, 1); + putMainIndexEntry(5010L, fileId, 1, 0); + + insertRange(firstRange); + insertRange(secondRange); + insertFlushMarker(fileId, 3, ranges); + Assertions.assertEquals(2, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + assertLocation(5000L, fileId, 0, 0); + assertLocation(5001L, fileId, 0, 1); + assertLocation(5010L, 
fileId, 1, 0); + + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + } + + @Test + public void testFlushCacheConflictingMarkerKeepsBufferRetryable() throws Exception + { + long fileId = 43L; + putMainIndexEntry(6000L, fileId, 0, 0); + putMainIndexEntry(6001L, fileId, 0, 1); + putMainIndexEntry(6010L, fileId, 1, 0); + + insertFlushMarker(fileId, 3, new ArrayList<>()); + + Assertions.assertThrows(MainIndexException.class, () -> mainIndex.flushCache(fileId)); + Assertions.assertEquals(0, countExactRanges(6010L, 6011L)); + assertLocation(6000L, fileId, 0, 0); + assertLocation(6010L, fileId, 1, 0); + + deleteFlushMarker(fileId); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + assertLocation(6000L, fileId, 0, 0); + assertLocation(6010L, fileId, 1, 0); + } + + @Test + public void testFlushCacheRangeWithoutMarkerFailsAndKeepsBufferRetryable() throws Exception + { + long fileId = 44L; + putMainIndexEntry(7000L, fileId, 0, 0); + putMainIndexEntry(7001L, fileId, 0, 1); + putMainIndexEntry(7010L, fileId, 1, 0); + + insertRange(new RowIdRange(7000L, 7002L, fileId, 0, 0, 2)); + + Assertions.assertThrows(MainIndexException.class, () -> mainIndex.flushCache(fileId)); + Assertions.assertEquals(0, countExactRanges(7010L, 7011L)); + Assertions.assertEquals(0, countFlushMarkersForFile(fileId)); + assertLocation(7000L, fileId, 0, 0); + assertLocation(7010L, fileId, 1, 0); + + deleteExactRange(7000L, 7002L); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + } + + @Test + public void testFlushCacheRejectsFlushMarkerMetadataMismatches() throws Exception + { + long fileId = 45L; + putMainIndexEntry(8000L, fileId, 0, 0); + putMainIndexEntry(8001L, fileId, 0, 1); + + List 
ranges = Arrays.asList(new RowIdRange(8000L, 8002L, fileId, 0, 0, 2)); + byte[] rangeHash = buildRangeHash(ranges); + + insertFlushMarker(fileId, 1, ranges.size(), rangeHash); + assertFlushFailsAndBufferSurvives(fileId, 8000L, 8001L); + + deleteFlushMarker(fileId); + insertFlushMarker(fileId, 2, ranges.size() + 1, rangeHash); + assertFlushFailsAndBufferSurvives(fileId, 8000L, 8001L); + + deleteFlushMarker(fileId); + byte[] badHash = rangeHash.clone(); + badHash[0] = (byte) (badHash[0] ^ 0x7f); + insertFlushMarker(fileId, 2, ranges.size(), badHash); + assertFlushFailsAndBufferSurvives(fileId, 8000L, 8001L); + + deleteFlushMarker(fileId); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(1, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + } + + @Test + public void testFlushCacheRollsBackRangesWhenMarkerInsertFails() throws Exception + { + long fileId = 46L; + putMainIndexEntry(9000L, fileId, 0, 0); + putMainIndexEntry(9001L, fileId, 0, 1); + putMainIndexEntry(9010L, fileId, 1, 0); + + createFailingFlushMarkerTrigger(fileId); + Assertions.assertThrows(MainIndexException.class, () -> mainIndex.flushCache(fileId)); + Assertions.assertEquals(0, countRangesForFile(fileId)); + Assertions.assertEquals(0, countFlushMarkersForFile(fileId)); + assertLocation(9000L, fileId, 0, 0); + assertLocation(9010L, fileId, 1, 0); + + dropFailingFlushMarkerTrigger(); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + } + + @Test + public void testFlushCacheConvergesAfterUnknownCommittedStateWithOutOfOrderBuffer() throws Exception + { + long fileId = 48L; + List committedRanges = Arrays.asList( + new RowIdRange(11000L, 11003L, fileId, 0, 0, 3), + new RowIdRange(11010L, 11012L, fileId, 1, 7, 9)); + + putMainIndexEntry(11002L, fileId, 0, 2); + putMainIndexEntry(11000L, fileId, 0, 0); + 
putMainIndexEntry(11010L, fileId, 1, 7); + putMainIndexEntry(11001L, fileId, 0, 1); + putMainIndexEntry(11011L, fileId, 1, 8); + + for (RowIdRange range : committedRanges) + { + insertRange(range); + } + insertFlushMarker(fileId, 5, committedRanges); + + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertEquals(2, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + assertNoInvalidRanges(fileId); + assertLocation(11000L, fileId, 0, 0); + assertLocation(11002L, fileId, 0, 2); + assertLocation(11011L, fileId, 1, 8); + } + + @Test + public void testFlushCacheFailureForOneFileDoesNotDiscardOtherFileBuffers() throws Exception + { + long failingFileId = 49L; + long healthyFileId = 50L; + putMainIndexEntry(12000L, failingFileId, 0, 0); + putMainIndexEntry(12001L, failingFileId, 0, 1); + putMainIndexEntry(12100L, healthyFileId, 0, 0); + putMainIndexEntry(12101L, healthyFileId, 0, 1); + + createFailingFlushMarkerTrigger(failingFileId); + Assertions.assertThrows(MainIndexException.class, () -> mainIndex.flushCache(failingFileId)); + Assertions.assertEquals(0, countRangesForFile(failingFileId)); + Assertions.assertEquals(0, countFlushMarkersForFile(failingFileId)); + assertLocation(12000L, failingFileId, 0, 0); + + Assertions.assertTrue(mainIndex.flushCache(healthyFileId)); + Assertions.assertEquals(1, countRangesForFile(healthyFileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(healthyFileId)); + assertLocation(12101L, healthyFileId, 0, 1); + + dropFailingFlushMarkerTrigger(); + Assertions.assertTrue(mainIndex.flushCache(failingFileId)); + Assertions.assertEquals(1, countRangesForFile(failingFileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(failingFileId)); + } + + @Test + public void testPutEntriesFlushesDurableRangesAndLocations() throws Exception + { + long fileId = 51L; + List entries = Arrays.asList( + primaryEntry(13002L, fileId, 0, 2), + primaryEntry(13000L, fileId, 0, 0), + 
primaryEntry(13001L, fileId, 0, 1), + primaryEntry(13020L, fileId, 2, 4), + primaryEntry(13021L, fileId, 2, 5)); + + assertAllTrue(mainIndex.putEntries(entries)); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(2, ranges.size()); + assertRange(ranges.get(0), 13000L, 13003L, fileId, 0, 0, 3); + assertRange(ranges.get(1), 13020L, 13022L, fileId, 2, 4, 6); + assertNoInvalidRanges(fileId); + + List locations = mainIndex.getLocations(Arrays.asList(13000L, 13002L, 13021L)); + Assertions.assertEquals(3, locations.size()); + Assertions.assertEquals(0, locations.get(0).getRgRowOffset()); + Assertions.assertEquals(2, locations.get(1).getRgRowOffset()); + Assertions.assertEquals(5, locations.get(2).getRgRowOffset()); + } + + @Test + public void testCloseConvergesWhenPreviousFlushCommittedButBufferSurvived() throws Exception + { + long fileId = 52L; + RowIdRange committedRange = new RowIdRange(14000L, 14002L, fileId, 0, 0, 2); + putMainIndexEntry(14000L, fileId, 0, 0); + putMainIndexEntry(14001L, fileId, 0, 1); + + insertRange(committedRange); + insertFlushMarker(fileId, 2, Arrays.asList(committedRange)); + + MainIndexFactory.Instance().closeIndex(tableId, false); + mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + + Assertions.assertEquals(1, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + assertLocation(14000L, fileId, 0, 0); + assertLocation(14001L, fileId, 0, 1); + } + + @Test + public void testDeleteRowIdRangeRemovesExactRangeWithoutInvalidResidue() throws Exception + { + long fileId = 53L; + putContiguousEntries(fileId, 0, 15000L, 15004L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(15000L, 15004L, fileId, 0, 0, 4))); + + Assertions.assertEquals(0, countRangesForFile(fileId)); + assertNoInvalidRanges(fileId); + for (long rowId = 15000L; rowId < 15004L; 
rowId++) + { + assertLocationMissing(rowId); + } + } + + @Test + public void testDeleteRowIdRangeSplitsMiddleRangeForRecoveryCleanup() throws Exception + { + long fileId = 54L; + putContiguousEntries(fileId, 0, 16000L, 16010L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(16003L, 16007L, fileId, 0, 3, 7))); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(2, ranges.size()); + assertRange(ranges.get(0), 16000L, 16003L, fileId, 0, 0, 3); + assertRange(ranges.get(1), 16007L, 16010L, fileId, 0, 7, 10); + assertNoInvalidRanges(fileId); + assertLocation(16002L, fileId, 0, 2); + assertLocationMissing(16003L); + assertLocationMissing(16006L); + assertLocation(16007L, fileId, 0, 7); + } + + @Test + public void testDeleteRowIdRangeTrimsBordersAndDeletesCoveredRanges() throws Exception + { + long fileId = 55L; + putContiguousEntries(fileId, 0, 17000L, 17005L, 0); + putContiguousEntries(fileId, 1, 17010L, 17015L, 0); + putContiguousEntries(fileId, 2, 17020L, 17025L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(17003L, 17022L, fileId, 0, 3, 22))); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(2, ranges.size()); + assertRange(ranges.get(0), 17000L, 17003L, fileId, 0, 0, 3); + assertRange(ranges.get(1), 17022L, 17025L, fileId, 2, 2, 5); + assertNoInvalidRanges(fileId); + assertLocation(17002L, fileId, 0, 2); + assertLocationMissing(17010L); + assertLocationMissing(17021L); + assertLocation(17022L, fileId, 2, 2); + } + + @Test + public void testDeleteRowIdRangeLeftAlignedTrimsLeadingPortionOfSingleRange() throws Exception + { + long fileId = 60L; + putContiguousEntries(fileId, 0, 21000L, 21010L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + // Delete [21000, 21003) which shares its left edge with the existing range [21000, 21010). 
+ // Expected to trim the leading portion and keep [21003, 21010). + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(21000L, 21003L, fileId, 0, 0, 3))); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(1, ranges.size()); + assertRange(ranges.get(0), 21003L, 21010L, fileId, 0, 3, 10); + assertNoInvalidRanges(fileId); + assertLocationMissing(21000L); + assertLocationMissing(21002L); + assertLocation(21003L, fileId, 0, 3); + assertLocation(21009L, fileId, 0, 9); + } + + @Test + public void testDeleteRowIdRangeRightAlignedTrimsTrailingPortionOfSingleRange() throws Exception + { + long fileId = 61L; + putContiguousEntries(fileId, 0, 22000L, 22010L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + // Delete [22007, 22010) which shares its right edge with the existing range [22000, 22010). + // Expected to trim the trailing portion and keep [22000, 22007). + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(22007L, 22010L, fileId, 0, 7, 10))); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(1, ranges.size()); + assertRange(ranges.get(0), 22000L, 22007L, fileId, 0, 0, 7); + assertNoInvalidRanges(fileId); + assertLocation(22000L, fileId, 0, 0); + assertLocation(22006L, fileId, 0, 6); + assertLocationMissing(22007L); + assertLocationMissing(22009L); + } + + @Test + public void testDeleteRowIdRangeFullyContainsSingleRangeRemovesItWithoutResidue() throws Exception + { + long fileId = 62L; + // Single committed range [23000, 23004) sitting in isolation. + putContiguousEntries(fileId, 0, 23000L, 23004L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + // Delete [22990, 23010) which strictly contains the entire range. + // No border range is partially overlapped, so the bulk DELETE clause should remove the range + // and leave no residue or split-out ranges. 
+ Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(22990L, 23010L, fileId, 0, 0, 20))); + + Assertions.assertEquals(0, countRangesForFile(fileId)); + assertNoInvalidRanges(fileId); + for (long rowId = 23000L; rowId < 23004L; rowId++) + { + assertLocationMissing(rowId); + } + } + + @Test + public void testDeleteRowIdRangeMissingAllRangesIsNoop() throws Exception + { + long fileId = 63L; + // Persist a single range [24000, 24004) so the table is non-empty. + putContiguousEntries(fileId, 0, 24000L, 24004L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + // Delete a row id window that does not overlap any committed range; should be a no-op. + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(30000L, 30010L, fileId, 0, 0, 10))); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(1, ranges.size()); + assertRange(ranges.get(0), 24000L, 24004L, fileId, 0, 0, 4); + assertNoInvalidRanges(fileId); + assertLocation(24000L, fileId, 0, 0); + assertLocation(24003L, fileId, 0, 3); + // Row ids inside the deleted (but never committed) window remain unknown. 
+ assertLocationMissing(30000L); + assertLocationMissing(30009L); + } + + @Test + public void testDeleteRowIdRangeRollsBackSplitWhenRightRangeInsertFails() throws Exception + { + long fileId = 57L; + putContiguousEntries(fileId, 0, 19000L, 19010L, 0); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + + createFailingRangeInsertTrigger(19007L); + Assertions.assertThrows(MainIndexException.class, + () -> mainIndex.deleteRowIdRange(new RowIdRange(19003L, 19007L, fileId, 0, 3, 7))); + dropFailingRangeInsertTrigger(); + + List ranges = listRangesForFile(fileId); + Assertions.assertEquals(1, ranges.size()); + assertRange(ranges.get(0), 19000L, 19010L, fileId, 0, 0, 10); + assertNoInvalidRanges(fileId); + assertLocation(19003L, fileId, 0, 3); + assertLocation(19007L, fileId, 0, 7); + } + + @Test + public void testDeleteRowIdRangeRejectsEmptyOrReversedRange() throws Exception + { + Assertions.assertThrows(MainIndexException.class, + () -> mainIndex.deleteRowIdRange(new RowIdRange(20000L, 20000L, 58L, 0, 0, 0))); + Assertions.assertThrows(MainIndexException.class, + () -> mainIndex.deleteRowIdRange(new RowIdRange(20001L, 20000L, 58L, 0, 1, 0))); + } + + @Test + public void testCloseFlushesCacheWithMarkerAndReopenReadsRows() throws Exception + { + long fileId = 47L; + putMainIndexEntry(10000L, fileId, 0, 0); + putMainIndexEntry(10001L, fileId, 0, 1); + + MainIndexFactory.Instance().closeIndex(tableId, false); + mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + + Assertions.assertEquals(1, countRangesForFile(fileId)); + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + assertLocation(10000L, fileId, 0, 0); + assertLocation(10001L, fileId, 0, 1); + } + @Test public void testPutAndGetLocation() throws MainIndexException { @@ -95,7 +529,7 @@ public void testPutAndGetLocation() throws MainIndexException } @Test - public void testFlushCacheAndDeleteEntry() throws MainIndexException + public void testFlushCacheAndDeleteEntry() throws Exception { 
long rowId = 2000L; IndexProto.RowLocation location = IndexProto.RowLocation.newBuilder() @@ -107,52 +541,107 @@ public void testFlushCacheAndDeleteEntry() throws MainIndexException Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(rowId, rowId + 1, 2, 2, 0, 1))); - Assertions.assertNull(mainIndex.getLocation(rowId)); + assertLocationMissing(rowId); + Assertions.assertEquals(0, countRangesForFile(2)); + location = location.toBuilder().setFileId(3).build(); Assertions.assertTrue(mainIndex.putEntry(rowId, location)); Assertions.assertNotNull(mainIndex.getLocation(rowId)); - Assertions.assertTrue(mainIndex.flushCache(2)); + Assertions.assertTrue(mainIndex.flushCache(3)); Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(rowId - 1, rowId + 1, - 2, 2, 0, 2))); - Assertions.assertNull(mainIndex.getLocation(rowId)); + 3, 2, 0, 2))); + assertLocationMissing(rowId); + Assertions.assertEquals(0, countRangesForFile(3)); + location = location.toBuilder().setFileId(4).build(); Assertions.assertTrue(mainIndex.putEntry(rowId, location)); Assertions.assertNotNull(mainIndex.getLocation(rowId)); - Assertions.assertTrue(mainIndex.flushCache(2)); + Assertions.assertTrue(mainIndex.flushCache(4)); Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(rowId - 1, rowId, - 2, 2, 0, 1))); + 4, 2, 0, 1))); Assertions.assertNotNull(mainIndex.getLocation(rowId)); + } - Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(rowId, rowId + 1, - 2, 2, 0, 1))); + @Test + @Tag("performance") + public void testFlushCachePerformanceSmoke() throws Exception + { + int entryCount = Integer.getInteger("sqlite.main.index.perf.smoke.entries", 50_000); + long timeoutSeconds = Long.getLong("sqlite.main.index.perf.smoke.timeout.sec", 30L); + long fileId = 56L; + long rowIdBase = 18000L; + long[] elapsedMs = new long[4]; + + Assertions.assertTimeout(Duration.ofSeconds(timeoutSeconds), () -> { + IndexProto.RowLocation.Builder locationBuilder = 
IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(0); + long start = System.nanoTime(); + for (int i = 0; i < entryCount; i++) + { + Assertions.assertTrue(mainIndex.putEntry(rowIdBase + i, + locationBuilder.setRgRowOffset(i).build())); + } + elapsedMs[0] = nanosToMillis(System.nanoTime() - start); + + start = System.nanoTime(); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + elapsedMs[1] = nanosToMillis(System.nanoTime() - start); + + start = System.nanoTime(); + int sampleStep = Math.max(1, entryCount / 100); + for (int i = 0; i < entryCount; i += sampleStep) + { + IndexProto.RowLocation location = mainIndex.getLocation(rowIdBase + i); + Assertions.assertEquals(fileId, location.getFileId()); + Assertions.assertEquals(i, location.getRgRowOffset()); + } + elapsedMs[2] = nanosToMillis(System.nanoTime() - start); + + start = System.nanoTime(); + Assertions.assertTrue(mainIndex.flushCache(fileId)); + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange( + rowIdBase, rowIdBase + entryCount, fileId, 0, 0, entryCount))); + elapsedMs[3] = nanosToMillis(System.nanoTime() - start); + }); + + Assertions.assertEquals(1, countFlushMarkersForFile(fileId)); + Assertions.assertEquals(0, countRangesForFile(fileId)); + System.out.println("sqlite main index perf smoke entries=" + entryCount + + ", putMs=" + elapsedMs[0] + + ", flushMs=" + elapsedMs[1] + + ", sampledGetMs=" + elapsedMs[2] + + ", idempotentFlushAndDeleteMs=" + elapsedMs[3]); } @Test + @Disabled("Manual performance smoke test; not a correctness gate.") + @Tag("performance") public void testPutAndGetPerformance() throws MainIndexException { final long rowIdBase = 0L; + final int entryCount = Integer.getInteger("sqlite.main.index.perf.entries", 10_000_000); IndexProto.RowLocation.Builder locationBuilder = IndexProto.RowLocation.newBuilder() .setFileId(1L).setRgId(0); long start = System.currentTimeMillis(); - for (int i = 0; i < 10000000; i++) + for (int i = 0; i < entryCount; i++) { 
mainIndex.putEntry(rowIdBase + i, locationBuilder.setRgRowOffset(i).build()); } - System.out.println("put 10M entries in " + (System.currentTimeMillis() - start) + " ms"); + System.out.println("put " + entryCount + " entries in " + (System.currentTimeMillis() - start) + " ms"); start = System.currentTimeMillis(); - for (int i = 0; i < 10000000; i++) + for (int i = 0; i < entryCount; i++) { mainIndex.getLocation(rowIdBase + i); } - System.out.println("get 10M entries in " + (System.currentTimeMillis() - start) + " ms"); + System.out.println("get " + entryCount + " entries in " + (System.currentTimeMillis() - start) + " ms"); start = System.currentTimeMillis(); mainIndex.flushCache(1); System.out.println("flush cache in " + (System.currentTimeMillis() - start) + " ms"); start = System.currentTimeMillis(); mainIndex.deleteRowIdRange(new RowIdRange( - 0L, 10_000_000L, 1L, 0, 0, 10_000_000)); + 0L, entryCount, 1L, 0, 0, entryCount)); System.out.println("delete all entries in " + (System.currentTimeMillis() - start) + " ms"); } @@ -261,10 +750,10 @@ public void testConcurrentPutAndDeleteRowIds() throws Exception { mainIndex.flushCache(threadNum); RowIdRange range = ranges.get(threadNum); - Assertions.assertTrue(mainIndex.deleteRowIdRange(range)); - for (long id = range.getRowIdStart(); id <= range.getRowIdEnd(); id++) + mainIndex.deleteRowIdRange(range); + for (long id = range.getRowIdStart(); id < range.getRowIdEnd(); id++) { - Assertions.assertNull(mainIndex.getLocation(id)); + assertLocationMissing(id); } } finally @@ -282,4 +771,304 @@ public void testConcurrentPutAndDeleteRowIds() throws Exception } executor.shutdown(); } -} \ No newline at end of file + + private void putMainIndexEntry(long rowId, long fileId, int rgId, int rgRowOffset) + { + Assertions.assertTrue(mainIndex.putEntry(rowId, IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(rgId).setRgRowOffset(rgRowOffset).build())); + } + + private void putContiguousEntries(long fileId, int rgId, 
long rowIdStart, long rowIdEnd, int rgRowOffsetStart) + { + int offset = rgRowOffsetStart; + for (long rowId = rowIdStart; rowId < rowIdEnd; rowId++) + { + putMainIndexEntry(rowId, fileId, rgId, offset++); + } + } + + private IndexProto.PrimaryIndexEntry primaryEntry(long rowId, long fileId, int rgId, int rgRowOffset) + { + return IndexProto.PrimaryIndexEntry.newBuilder() + .setRowId(rowId) + .setRowLocation(IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(rgId).setRgRowOffset(rgRowOffset).build()) + .build(); + } + + private void assertAllTrue(List results) + { + for (Boolean result : results) + { + Assertions.assertTrue(result); + } + } + + private void assertLocation(long rowId, long fileId, int rgId, int rgRowOffset) throws MainIndexException + { + IndexProto.RowLocation location = mainIndex.getLocation(rowId); + Assertions.assertNotNull(location); + Assertions.assertEquals(fileId, location.getFileId()); + Assertions.assertEquals(rgId, location.getRgId()); + Assertions.assertEquals(rgRowOffset, location.getRgRowOffset()); + } + + private void assertLocationMissing(long rowId) throws MainIndexException + { + // A missing rowId is reported as null so the caller can treat the absence + // as a logical not-found rather than a failure. 
+ Assertions.assertNull(mainIndex.getLocation(rowId)); + } + + private void assertFlushFailsAndBufferSurvives(long fileId, long firstRowId, long secondRowId) throws Exception + { + Assertions.assertThrows(MainIndexException.class, () -> mainIndex.flushCache(fileId)); + Assertions.assertEquals(0, countRangesForFile(fileId)); + assertLocation(firstRowId, fileId, 0, 0); + assertLocation(secondRowId, fileId, 0, 1); + } + + private void assertRange(RowIdRange range, long rowIdStart, long rowIdEnd, long fileId, + int rgId, int rgRowOffsetStart, int rgRowOffsetEnd) + { + Assertions.assertEquals(rowIdStart, range.getRowIdStart()); + Assertions.assertEquals(rowIdEnd, range.getRowIdEnd()); + Assertions.assertEquals(fileId, range.getFileId()); + Assertions.assertEquals(rgId, range.getRgId()); + Assertions.assertEquals(rgRowOffsetStart, range.getRgRowOffsetStart()); + Assertions.assertEquals(rgRowOffsetEnd, range.getRgRowOffsetEnd()); + } + + private void assertNoInvalidRanges(long fileId) throws Exception + { + Assertions.assertEquals(0, countInvalidRangesForFile(fileId)); + } + + private Connection openMainIndexConnection() throws Exception + { + String path = sqlitePath.endsWith("/") ? 
sqlitePath : sqlitePath + "/"; + return DriverManager.getConnection("jdbc:sqlite:" + path + tableId + ".main.index.db"); + } + + private void insertRange(RowIdRange range) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement("INSERT INTO row_id_ranges VALUES(?, ?, ?, ?, ?, ?)")) + { + pst.setLong(1, range.getRowIdStart()); + pst.setLong(2, range.getRowIdEnd()); + pst.setLong(3, range.getFileId()); + pst.setInt(4, range.getRgId()); + pst.setInt(5, range.getRgRowOffsetStart()); + pst.setInt(6, range.getRgRowOffsetEnd()); + pst.executeUpdate(); + } + } + + private void deleteExactRange(long rowIdStart, long rowIdEnd) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "DELETE FROM row_id_ranges WHERE row_id_start = ? AND row_id_end = ?")) + { + pst.setLong(1, rowIdStart); + pst.setLong(2, rowIdEnd); + pst.executeUpdate(); + } + } + + private void insertFlushMarker(long fileId, long entryCount, List ranges) throws Exception + { + insertFlushMarker(fileId, entryCount, ranges.size(), buildRangeHash(ranges)); + } + + private void insertFlushMarker(long fileId, long entryCount, long rangeCount, byte[] rangeHash) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "INSERT INTO row_id_range_flush_markers VALUES(?, ?, ?, ?, ?)")) + { + pst.setLong(1, fileId); + pst.setLong(2, entryCount); + pst.setLong(3, rangeCount); + pst.setBytes(4, rangeHash); + pst.setLong(5, System.currentTimeMillis()); + pst.executeUpdate(); + } + } + + private void deleteFlushMarker(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "DELETE FROM row_id_range_flush_markers WHERE file_id = ?")) + { + pst.setLong(1, fileId); + pst.executeUpdate(); + } + } + + 
private void createFailingFlushMarkerTrigger(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + Statement statement = connection.createStatement()) + { + statement.executeUpdate("DROP TRIGGER IF EXISTS fail_marker_insert"); + statement.executeUpdate("CREATE TRIGGER fail_marker_insert BEFORE INSERT ON row_id_range_flush_markers " + + "WHEN NEW.file_id = " + fileId + " BEGIN SELECT RAISE(ABORT, 'forced marker failure'); END"); + } + } + + private void dropFailingFlushMarkerTrigger() throws Exception + { + try (Connection connection = openMainIndexConnection(); + Statement statement = connection.createStatement()) + { + statement.executeUpdate("DROP TRIGGER IF EXISTS fail_marker_insert"); + } + } + + private void createFailingRangeInsertTrigger(long rowIdStart) throws Exception + { + try (Connection connection = openMainIndexConnection(); + Statement statement = connection.createStatement()) + { + statement.executeUpdate("DROP TRIGGER IF EXISTS fail_range_insert"); + statement.executeUpdate("CREATE TRIGGER fail_range_insert BEFORE INSERT ON row_id_ranges " + + "WHEN NEW.row_id_start = " + rowIdStart + " " + + "BEGIN SELECT RAISE(ABORT, 'forced range insert failure'); END"); + } + } + + private void dropFailingRangeInsertTrigger() throws Exception + { + try (Connection connection = openMainIndexConnection(); + Statement statement = connection.createStatement()) + { + statement.executeUpdate("DROP TRIGGER IF EXISTS fail_range_insert"); + } + } + + private List listRangesForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "SELECT * FROM row_id_ranges WHERE file_id = ? 
ORDER BY row_id_start")) + { + pst.setLong(1, fileId); + List ranges = new ArrayList<>(); + try (ResultSet rs = pst.executeQuery()) + { + while (rs.next()) + { + ranges.add(new RowIdRange( + rs.getLong("row_id_start"), + rs.getLong("row_id_end"), + rs.getLong("file_id"), + rs.getInt("rg_id"), + rs.getInt("rg_row_offset_start"), + rs.getInt("rg_row_offset_end"))); + } + } + return ranges; + } + } + + private int countRangesForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement("SELECT COUNT(*) FROM row_id_ranges WHERE file_id = ?")) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getInt(1); + } + } + } + + private int countFlushMarkersForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "SELECT COUNT(*) FROM row_id_range_flush_markers WHERE file_id = ?")) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getInt(1); + } + } + } + + private int countExactRanges(long rowIdStart, long rowIdEnd) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "SELECT COUNT(*) FROM row_id_ranges WHERE row_id_start = ? AND row_id_end = ?")) + { + pst.setLong(1, rowIdStart); + pst.setLong(2, rowIdEnd); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getInt(1); + } + } + } + + private int countInvalidRangesForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "SELECT COUNT(*) FROM row_id_ranges WHERE file_id = ? 
AND " + + "(row_id_end <= row_id_start OR " + + "(row_id_end - row_id_start) != (rg_row_offset_end - rg_row_offset_start))")) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getInt(1); + } + } + } + + private byte[] buildRangeHash(List ranges) throws Exception + { + MessageDigest digest = MessageDigest.getInstance("SHA-256"); + for (RowIdRange range : ranges) + { + updateLong(digest, range.getRowIdStart()); + updateLong(digest, range.getRowIdEnd()); + updateLong(digest, range.getFileId()); + updateInt(digest, range.getRgId()); + updateInt(digest, range.getRgRowOffsetStart()); + updateInt(digest, range.getRgRowOffsetEnd()); + } + return digest.digest(); + } + + private static void updateLong(MessageDigest digest, long value) + { + for (int shift = 56; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private static void updateInt(MessageDigest digest, int value) + { + for (int shift = 24; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private long nanosToMillis(long nanos) + { + return nanos / 1_000_000L; + } +} diff --git a/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexBenchmark.java b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexBenchmark.java new file mode 100644 index 0000000000..d4b07de060 --- /dev/null +++ b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexBenchmark.java @@ -0,0 +1,462 @@ +/* + * Copyright 2025 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. 
+ * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.index.main.sqlite; + +import io.pixelsdb.pixels.common.exception.MainIndexException; +import io.pixelsdb.pixels.common.index.MainIndex; +import io.pixelsdb.pixels.common.index.MainIndexFactory; +import io.pixelsdb.pixels.common.index.RowIdRange; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.index.IndexProto; +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +import java.io.File; +import java.io.IOException; +import java.security.MessageDigest; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +@Tag("benchmark") +public class TestSqliteMainIndexBenchmark +{ + private static final String ENABLE_PROPERTY = "pixels.sqlite.main.index.benchmark"; + private static final long NOT_APPLICABLE = -1L; + private static final int CONTIGUOUS_ROWS = Integer.getInteger( + "pixels.sqlite.main.index.benchmark.contiguousRows", 1_000_000); + private static final int FRAGMENTED_ROWS = Integer.getInteger( + "pixels.sqlite.main.index.benchmark.fragmentedRows", 100_000); + private static long nextTableId = 900_000L; + + private String sqlitePath; + private long tableId; + private MainIndex mainIndex; + + @BeforeEach + public void setUp() + { + 
Assumptions.assumeTrue(Boolean.getBoolean(ENABLE_PROPERTY), + "Set -D" + ENABLE_PROPERTY + "=true to run manual sqlite main-index benchmarks."); + } + + @AfterEach + public void tearDown() throws Exception + { + closeAndRemoveIndex(); + } + + @Test + public void benchmarkPutGetAndFlushPaths() throws Exception + { + System.out.println(); + printBenchmarkParameters(); + List results = new ArrayList<>(); + results.add(benchmarkHotPutGetPath()); + results.add(benchmarkContiguousFlush()); + results.add(benchmarkFragmentedFlush()); + results.add(benchmarkMarkerHitRetry()); + printBenchmarkSummary(results); + } + + private BenchmarkResult benchmarkHotPutGetPath() throws Exception + { + openFreshIndex(); + long fileId = 1L; + long rowIdBase = 1_000_000_000L; + + long putNs = elapsedNanos(() -> putContiguousEntries(CONTIGUOUS_ROWS, fileId, rowIdBase)); + long getNs = elapsedNanos(() -> getContiguousEntries(CONTIGUOUS_ROWS, rowIdBase)); + long cleanupFlushNs = elapsedNanos(() -> Assertions.assertTrue(mainIndex.flushCache(fileId))); + long ranges = countRangesForFile(fileId); + long markers = countFlushMarkersForFile(fileId); + + closeAndRemoveIndex(); + return new BenchmarkResult("hot put/get path", "contiguous, pre-flush get", + CONTIGUOUS_ROWS, ranges, markers, putNs, cleanupFlushNs, + NOT_APPLICABLE, NOT_APPLICABLE, getNs); + } + + private BenchmarkResult benchmarkContiguousFlush() throws Exception + { + openFreshIndex(); + long fileId = 2L; + long rowIdBase = 2_000_000_000L; + + long putNs = elapsedNanos(() -> putContiguousEntries(CONTIGUOUS_ROWS, fileId, rowIdBase)); + long flushNs = elapsedNanos(() -> Assertions.assertTrue(mainIndex.flushCache(fileId))); + long getNs = elapsedNanos(() -> getContiguousEntries(CONTIGUOUS_ROWS, rowIdBase)); + long ranges = countRangesForFile(fileId); + long markers = countFlushMarkersForFile(fileId); + + Assertions.assertEquals(1L, ranges); + Assertions.assertEquals(1L, markers); + closeAndRemoveIndex(); + return new 
BenchmarkResult("contiguous first flush", "contiguous rows -> 1 range", + CONTIGUOUS_ROWS, ranges, markers, putNs, flushNs, + NOT_APPLICABLE, NOT_APPLICABLE, getNs); + } + + private BenchmarkResult benchmarkFragmentedFlush() throws Exception + { + openFreshIndex(); + long fileId = 3L; + long rowIdBase = 3_000_000_000L; + + long putNs = elapsedNanos(() -> putFragmentedEntries(FRAGMENTED_ROWS, fileId, rowIdBase)); + long flushNs = elapsedNanos(() -> Assertions.assertTrue(mainIndex.flushCache(fileId))); + long getNs = elapsedNanos(() -> getFragmentedEntries(FRAGMENTED_ROWS, rowIdBase)); + long ranges = countRangesForFile(fileId); + long markers = countFlushMarkersForFile(fileId); + + Assertions.assertEquals(FRAGMENTED_ROWS, ranges); + Assertions.assertEquals(1L, markers); + closeAndRemoveIndex(); + return new BenchmarkResult("fragmented first flush", "1-row gaps -> many ranges", + FRAGMENTED_ROWS, ranges, markers, putNs, flushNs, + NOT_APPLICABLE, NOT_APPLICABLE, getNs); + } + + private BenchmarkResult benchmarkMarkerHitRetry() throws Exception + { + openFreshIndex(); + long fileId = 4L; + long rowIdBase = 4_000_000_000L; + List ranges = buildFragmentedRanges(FRAGMENTED_ROWS, fileId, rowIdBase); + + insertRangesAndMarker(fileId, FRAGMENTED_ROWS, ranges); + long putNs = elapsedNanos(() -> putFragmentedEntries(FRAGMENTED_ROWS, fileId, rowIdBase)); + long markerRetryNs = elapsedNanos(() -> Assertions.assertTrue(mainIndex.flushCache(fileId))); + long emptyRetryNs = elapsedNanos(() -> Assertions.assertTrue(mainIndex.flushCache(fileId))); + long getNs = elapsedNanos(() -> getFragmentedEntries(FRAGMENTED_ROWS, rowIdBase)); + long storedRanges = countRangesForFile(fileId); + long markers = countFlushMarkersForFile(fileId); + + Assertions.assertEquals(FRAGMENTED_ROWS, storedRanges); + Assertions.assertEquals(1L, markers); + closeAndRemoveIndex(); + return new BenchmarkResult("marker-hit retry flush", "matching marker already durable", + FRAGMENTED_ROWS, storedRanges, markers, 
putNs, NOT_APPLICABLE, + markerRetryNs, emptyRetryNs, getNs); + } + + private void openFreshIndex() throws Exception + { + closeAndRemoveIndex(); + this.tableId = nextTableId++; + this.sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); + try + { + FileUtils.forceMkdir(new File(sqlitePath)); + } + catch (IOException e) + { + throw new MainIndexException("Failed to create SQLite benchmark directory", e); + } + this.mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); + } + + private void closeAndRemoveIndex() throws Exception + { + if (this.mainIndex != null) + { + MainIndexFactory.Instance().closeIndex(this.tableId, true); + this.mainIndex = null; + } + if (this.sqlitePath != null) + { + FileUtils.deleteDirectory(new File(sqlitePath)); + } + } + + private void putContiguousEntries(int rowCount, long fileId, long rowIdBase) + { + IndexProto.RowLocation.Builder locationBuilder = IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(0); + for (int i = 0; i < rowCount; i++) + { + Assertions.assertTrue(mainIndex.putEntry(rowIdBase + i, locationBuilder.setRgRowOffset(i).build())); + } + } + + private void putFragmentedEntries(int rowCount, long fileId, long rowIdBase) + { + IndexProto.RowLocation.Builder locationBuilder = IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(0); + for (int i = 0; i < rowCount; i++) + { + Assertions.assertTrue(mainIndex.putEntry(rowIdBase + i * 2L, locationBuilder.setRgRowOffset(i).build())); + } + } + + private void getContiguousEntries(int rowCount, long rowIdBase) throws MainIndexException + { + for (int i = 0; i < rowCount; i++) + { + Assertions.assertNotNull(mainIndex.getLocation(rowIdBase + i)); + } + } + + private void getFragmentedEntries(int rowCount, long rowIdBase) throws MainIndexException + { + for (int i = 0; i < rowCount; i++) + { + Assertions.assertNotNull(mainIndex.getLocation(rowIdBase + i * 2L)); + } + } + + private List buildFragmentedRanges(int rowCount, long fileId, 
long rowIdBase) + { + List ranges = new ArrayList<>(rowCount); + for (int i = 0; i < rowCount; i++) + { + long rowId = rowIdBase + i * 2L; + ranges.add(new RowIdRange(rowId, rowId + 1, fileId, 0, i, i + 1)); + } + return ranges; + } + + private void insertRangesAndMarker(long fileId, long entryCount, List ranges) throws Exception + { + try (Connection connection = openMainIndexConnection()) + { + boolean originalAutoCommit = connection.getAutoCommit(); + connection.setAutoCommit(false); + try + { + try (PreparedStatement pst = connection.prepareStatement("INSERT INTO row_id_ranges VALUES(?, ?, ?, ?, ?, ?)")) + { + for (RowIdRange range : ranges) + { + pst.setLong(1, range.getRowIdStart()); + pst.setLong(2, range.getRowIdEnd()); + pst.setLong(3, range.getFileId()); + pst.setInt(4, range.getRgId()); + pst.setInt(5, range.getRgRowOffsetStart()); + pst.setInt(6, range.getRgRowOffsetEnd()); + pst.addBatch(); + } + pst.executeBatch(); + } + try (PreparedStatement pst = connection.prepareStatement( + "INSERT INTO row_id_range_flush_markers VALUES(?, ?, ?, ?, ?)")) + { + pst.setLong(1, fileId); + pst.setLong(2, entryCount); + pst.setLong(3, ranges.size()); + pst.setBytes(4, buildRangeHash(ranges)); + pst.setLong(5, System.currentTimeMillis()); + pst.executeUpdate(); + } + connection.commit(); + } + catch (Exception e) + { + connection.rollback(); + throw e; + } + finally + { + connection.setAutoCommit(originalAutoCommit); + } + } + } + + private Connection openMainIndexConnection() throws Exception + { + String path = sqlitePath.endsWith("/") ? 
sqlitePath : sqlitePath + "/"; + return DriverManager.getConnection("jdbc:sqlite:" + path + tableId + ".main.index.db"); + } + + private long countRangesForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement("SELECT COUNT(*) FROM row_id_ranges WHERE file_id = ?")) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getLong(1); + } + } + } + + private long countFlushMarkersForFile(long fileId) throws Exception + { + try (Connection connection = openMainIndexConnection(); + PreparedStatement pst = connection.prepareStatement( + "SELECT COUNT(*) FROM row_id_range_flush_markers WHERE file_id = ?")) + { + pst.setLong(1, fileId); + try (ResultSet rs = pst.executeQuery()) + { + Assertions.assertTrue(rs.next()); + return rs.getLong(1); + } + } + } + + private byte[] buildRangeHash(List ranges) throws Exception + { + MessageDigest digest = MessageDigest.getInstance("SHA-256"); + for (RowIdRange range : ranges) + { + updateLong(digest, range.getRowIdStart()); + updateLong(digest, range.getRowIdEnd()); + updateLong(digest, range.getFileId()); + updateInt(digest, range.getRgId()); + updateInt(digest, range.getRgRowOffsetStart()); + updateInt(digest, range.getRgRowOffsetEnd()); + } + return digest.digest(); + } + + private static void updateLong(MessageDigest digest, long value) + { + for (int shift = 56; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private static void updateInt(MessageDigest digest, int value) + { + for (int shift = 24; shift >= 0; shift -= 8) + { + digest.update((byte) (value >>> shift)); + } + } + + private long elapsedNanos(ThrowingRunnable runnable) throws Exception + { + long start = System.nanoTime(); + runnable.run(); + return System.nanoTime() - start; + } + + private void printBenchmarkParameters() + { + System.out.println("SQLite MainIndex 
benchmark parameters"); + System.out.println(" -D" + ENABLE_PROPERTY + "=" + Boolean.getBoolean(ENABLE_PROPERTY)); + System.out.println(" -Dpixels.sqlite.main.index.benchmark.contiguousRows=" + CONTIGUOUS_ROWS); + System.out.println(" -Dpixels.sqlite.main.index.benchmark.fragmentedRows=" + FRAGMENTED_ROWS); + System.out.println(" index.sqlite.path=" + ConfigFactory.Instance().getProperty("index.sqlite.path")); + System.out.println(" java.version=" + System.getProperty("java.version")); + System.out.println(" os.name=" + System.getProperty("os.name")); + System.out.println(" os.arch=" + System.getProperty("os.arch")); + } + + private void printBenchmarkSummary(List results) + { + System.out.println(); + System.out.println("SQLite MainIndex benchmark summary"); + System.out.println("rows = logical MainIndex entries; ranges = persisted row_id_ranges."); + System.out.println("markerRetry = retry when a matching per-file durable marker already exists."); + System.out.println("emptyRetry = immediate second flush after marker retry discarded the buffer."); + System.out.println(String.format("%-27s %-31s %12s %10s %7s %10s %13s %10s %16s %15s %13s %10s %13s", + "workload", "shape", "rows", "ranges", "markers", "put(ms)", "put rows/s", + "flush(ms)", "flush ranges/s", "markerRetry(ms)", "emptyRetry(ms)", "get(ms)", "get rows/s")); + for (BenchmarkResult result : results) + { + System.out.println(String.format("%-27s %-31s %12s %10s %7s %10s %13s %10s %16s %15s %13s %10s %13s", + result.name, + result.shape, + formatLong(result.rows), + formatLong(result.ranges), + formatLong(result.markers), + formatMillis(result.putNs), + formatRate(result.rows, result.putNs), + formatMillis(result.flushNs), + formatRate(result.ranges, result.flushNs), + formatMillis(result.markerRetryNs), + formatMillis(result.emptyRetryNs), + formatMillis(result.getNs), + formatRate(result.rows, result.getNs))); + } + } + + private String formatLong(long value) + { + return String.format(Locale.US, 
"%,d", value); + } + + private String formatMillis(long nanos) + { + if (nanos < 0) + { + return "-"; + } + return String.format(Locale.US, "%,.3f", nanos / 1_000_000.0D); + } + + private String formatRate(long count, long nanos) + { + if (nanos <= 0) + { + return "-"; + } + double rate = count * 1_000_000_000.0D / nanos; + return String.format(Locale.US, "%,.0f", rate); + } + + private static final class BenchmarkResult + { + private final String name; + private final String shape; + private final long rows; + private final long ranges; + private final long markers; + private final long putNs; + private final long flushNs; + private final long markerRetryNs; + private final long emptyRetryNs; + private final long getNs; + + private BenchmarkResult(String name, String shape, long rows, long ranges, long markers, + long putNs, long flushNs, long markerRetryNs, long emptyRetryNs, long getNs) + { + this.name = name; + this.shape = shape; + this.rows = rows; + this.ranges = ranges; + this.markers = markers; + this.putNs = putNs; + this.flushNs = flushNs; + this.markerRetryNs = markerRetryNs; + this.emptyRetryNs = emptyRetryNs; + this.getNs = getNs; + } + } + + private interface ThrowingRunnable + { + void run() throws Exception; + } +} diff --git a/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexQuery.java b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexQuery.java index 7847fcd34c..df5bbaaea0 100644 --- a/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexQuery.java +++ b/pixels-index/pixels-index-main-sqlite/src/test/java/io/pixelsdb/pixels/index/main/sqlite/TestSqliteMainIndexQuery.java @@ -19,8 +19,6 @@ */ package io.pixelsdb.pixels.index.main.sqlite; -import io.pixelsdb.pixels.common.exception.MainIndexException; -import io.pixelsdb.pixels.common.exception.RowIdException; import 
io.pixelsdb.pixels.common.index.MainIndex; import io.pixelsdb.pixels.common.index.MainIndexFactory; import io.pixelsdb.pixels.common.index.RowIdRange; @@ -39,63 +37,170 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; + public class TestSqliteMainIndexQuery { + private static long nextTableId = 3035L; + MainIndex mainIndex; - Long tableId =3035L; + long tableId; + String sqlitePath; Connection connection; + @BeforeEach public void setUp() throws Exception { - String sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); - if (!sqlitePath.endsWith("/")) + tableId = nextTableId++; + sqlitePath = ConfigFactory.Instance().getProperty("index.sqlite.path"); + try { - sqlitePath += "/"; + FileUtils.forceMkdir(new File(sqlitePath)); } + catch (IOException e) + { + System.err.println("Failed to create SQLite test directory: " + e.getMessage()); + } + mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); - connection = DriverManager.getConnection("jdbc:sqlite:" + sqlitePath + tableId + ".main.index.db"); + String path = sqlitePath.endsWith("/") ? 
sqlitePath : sqlitePath + "/"; + connection = DriverManager.getConnection("jdbc:sqlite:" + path + tableId + ".main.index.db"); + } + + @AfterEach + public void tearDown() throws Exception + { + if (connection != null) + { + connection.close(); + } + MainIndexFactory.Instance().closeIndex(tableId, true); + try + { + FileUtils.deleteDirectory(new File(sqlitePath)); + } + catch (IOException e) + { + System.err.println("Failed to clean up SQLite test directory: " + e.getMessage()); + } + } + + @Test + public void testQueryRowRangesFromCommittedFlush() throws Exception + { + putMainIndexEntry(11000L, 51L, 0, 0); + putMainIndexEntry(11001L, 51L, 0, 1); + putMainIndexEntry(11010L, 51L, 1, 0); + Assertions.assertTrue(mainIndex.flushCache(51L)); + + List rowIdRanges = queryRowRanges(); + Assertions.assertEquals(2, rowIdRanges.size()); + assertRange(rowIdRanges.get(0), 11000L, 11002L, 51L, 0, 0, 2); + assertRange(rowIdRanges.get(1), 11010L, 11011L, 51L, 1, 0, 1); + } + + @Test + public void testQueryRowRangesFromOutOfOrderBatchFlushesMultipleFiles() throws Exception + { + assertAllTrue(mainIndex.putEntries(Arrays.asList( + primaryEntry(11102L, 52L, 0, 2), + primaryEntry(11201L, 53L, 0, 1), + primaryEntry(11100L, 52L, 0, 0), + primaryEntry(11200L, 53L, 0, 0), + primaryEntry(11101L, 52L, 0, 1), + primaryEntry(11202L, 53L, 0, 2)))); + + Assertions.assertTrue(mainIndex.flushCache(53L)); + Assertions.assertTrue(mainIndex.flushCache(52L)); + + List rowIdRanges = queryRowRanges(); + Assertions.assertEquals(2, rowIdRanges.size()); + assertRange(rowIdRanges.get(0), 11100L, 11103L, 52L, 0, 0, 3); + assertRange(rowIdRanges.get(1), 11200L, 11203L, 53L, 0, 0, 3); } @Test - public void testQueryRowRanges() throws Exception + public void testQueryRowRangesReflectDeleteSplitForRecoveryCleanup() throws Exception { - String query = "SELECT * FROM row_id_ranges order by row_id_start"; - long fileid = 0; - try (PreparedStatement pst = this.connection.prepareStatement(query)) + 
putContiguousEntries(11300L, 11306L, 54L, 0, 0); + Assertions.assertTrue(mainIndex.flushCache(54L)); + + Assertions.assertTrue(mainIndex.deleteRowIdRange(new RowIdRange(11302L, 11305L, 54L, 0, 2, 5))); + + List rowIdRanges = queryRowRanges(); + Assertions.assertEquals(2, rowIdRanges.size()); + assertRange(rowIdRanges.get(0), 11300L, 11302L, 54L, 0, 0, 2); + assertRange(rowIdRanges.get(1), 11305L, 11306L, 54L, 0, 5, 6); + } + + private void putMainIndexEntry(long rowId, long fileId, int rgId, int rgRowOffset) + { + Assertions.assertTrue(mainIndex.putEntry(rowId, IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(rgId).setRgRowOffset(rgRowOffset).build())); + } + + private void putContiguousEntries(long rowIdStart, long rowIdEnd, long fileId, int rgId, int rgRowOffsetStart) + { + int offset = rgRowOffsetStart; + for (long rowId = rowIdStart; rowId < rowIdEnd; rowId++) { -// pst.setLong(1, fileid); - try (ResultSet rs = pst.executeQuery()) + putMainIndexEntry(rowId, fileId, rgId, offset++); + } + } + + private IndexProto.PrimaryIndexEntry primaryEntry(long rowId, long fileId, int rgId, int rgRowOffset) + { + return IndexProto.PrimaryIndexEntry.newBuilder() + .setRowId(rowId) + .setRowLocation(IndexProto.RowLocation.newBuilder() + .setFileId(fileId).setRgId(rgId).setRgRowOffset(rgRowOffset).build()) + .build(); + } + + private void assertAllTrue(List results) + { + for (Boolean result : results) + { + Assertions.assertTrue(result); + } + } + + private List queryRowRanges() throws Exception + { + String query = "SELECT * FROM row_id_ranges ORDER BY row_id_start"; + List ranges = new ArrayList<>(); + try (PreparedStatement pst = this.connection.prepareStatement(query); + ResultSet rs = pst.executeQuery()) + { + while (rs.next()) { - while (rs.next()) - { - long rowIdStart = rs.getLong("row_id_start"); - long rowIdEnd = rs.getLong("row_id_end"); - long fileId = rs.getLong("file_id"); - int rgId = rs.getInt("rg_id"); - int rgRowOffsetStart = 
rs.getInt("rg_row_offset_start"); - int rgRowOffsetEnd = rs.getInt("rg_row_offset_end"); - if (rowIdEnd - rowIdStart != rgRowOffsetEnd - rgRowOffsetStart) - { - throw new RowIdException("The width of row id range (" + rowIdStart + ", " + - rgRowOffsetEnd + ") does not match the width of row group row offset range (" + - rgRowOffsetStart + ", " + rgRowOffsetEnd + ")"); - } - System.out.println( - "rowIdStart=" + rowIdStart + - ", rowIdEnd=" + rowIdEnd + - ", fileId=" + fileId + - ", rgId=" + rgId + - ", rgRowOffsetStart=" + rgRowOffsetStart + - ", rgRowOffsetEnd=" + rgRowOffsetEnd - ); - } + long rowIdStart = rs.getLong("row_id_start"); + long rowIdEnd = rs.getLong("row_id_end"); + int rgRowOffsetStart = rs.getInt("rg_row_offset_start"); + int rgRowOffsetEnd = rs.getInt("rg_row_offset_end"); + Assertions.assertEquals(rowIdEnd - rowIdStart, rgRowOffsetEnd - rgRowOffsetStart); + + ranges.add(new RowIdRange( + rowIdStart, + rowIdEnd, + rs.getLong("file_id"), + rs.getInt("rg_id"), + rgRowOffsetStart, + rgRowOffsetEnd)); } } + return ranges; + } + private void assertRange(RowIdRange range, long rowIdStart, long rowIdEnd, long fileId, + int rgId, int rgRowOffsetStart, int rgRowOffsetEnd) + { + Assertions.assertEquals(rowIdStart, range.getRowIdStart()); + Assertions.assertEquals(rowIdEnd, range.getRowIdEnd()); + Assertions.assertEquals(fileId, range.getFileId()); + Assertions.assertEquals(rgId, range.getRgId()); + Assertions.assertEquals(rgRowOffsetStart, range.getRgRowOffsetStart()); + Assertions.assertEquals(rgRowOffsetEnd, range.getRgRowOffsetEnd()); } } diff --git a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java index aef36f4cfb..f4d9b68481 100644 --- a/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java +++ b/pixels-planner/src/main/java/io/pixelsdb/pixels/planner/PixelsPlanner.java @@ -1895,7 +1895,7 @@ public static List getFilePaths(List 
dirPaths, MetadataService met { base += "/"; } - for (File file : metadataService.getFiles(dirPath.getId())) + for (File file : metadataService.getRegularFiles(dirPath.getId())) { filePaths.add(base + file.getName()); } diff --git a/pixels-retina/pom.xml b/pixels-retina/pom.xml index f17e8b27af..b7a9357da4 100644 --- a/pixels-retina/pom.xml +++ b/pixels-retina/pom.xml @@ -88,7 +88,6 @@ io.etcd jetcd-core - test diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java index f470cb728e..3acd97283f 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/FileWriterManager.java @@ -31,17 +31,19 @@ import io.pixelsdb.pixels.core.TypeDescription; import io.pixelsdb.pixels.core.encoding.EncodingLevel; import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; -import java.io.IOException; import java.nio.ByteBuffer; import java.util.Collections; -import java.util.concurrent.CompletableFuture; /** * Responsible for managing several blocks of data and writing them to a file. */ public class FileWriterManager { + private static final Logger logger = LogManager.getLogger(FileWriterManager.class); + private final long tableId; private final PixelsWriter writer; private final File file; @@ -50,6 +52,17 @@ public class FileWriterManager private final long firstBlockId; private long lastBlockId = -1; private final int virtualNodeId; + + // [fileMinRowId, fileMaxRowId] is the range of row ids in the file. + private long fileMinRowId = Long.MAX_VALUE; + private long fileMaxRowId = Long.MIN_VALUE; + + private volatile boolean physicalClosed; + private volatile RetinaException physicalCloseFailure; + + // Signals that the index has been flushed. 
+ private volatile boolean indexFlushed; + /** * Creating pixelsWriter by passing in parameters avoids the need to read * the configuration file for each call. @@ -84,10 +97,13 @@ public FileWriterManager(long tableId, TypeDescription schema, MetadataService metadataService = MetadataService.Instance(); file = new File(); this.file.setName(targetFileName); - this.file.setType(File.Type.TEMPORARY); + this.file.setType(File.Type.TEMPORARY_INGEST); this.file.setNumRowGroup(1); this.file.setPathId(targetOrderedDirPath.getId()); - metadataService.addFiles(Collections.singletonList(file)); + if (!metadataService.addFiles(Collections.singletonList(file))) + { + throw new MetadataException("failed to add metadata for ingest file " + targetFilePath); + } this.file.setId(metadataService.getFileId(targetFilePath)); } catch (MetadataException e) { @@ -118,6 +134,20 @@ public FileWriterManager(long tableId, TypeDescription schema, .build(); } catch (Exception e) { + retinaResourceManager.removeVisibility(this.file.getId()); + try + { + if (!MetadataService.Instance().deleteFiles(Collections.singletonList(this.file.getId()))) + { + logger.warn("Failed to delete metadata for ingest file after writer creation failure, fileId={}", + this.file.getId()); + } + } + catch (MetadataException metadataException) + { + logger.warn("Failed to delete metadata for ingest file after writer creation failure, fileId={}", + this.file.getId(), metadataException); + } throw new RetinaException("Failed to create pixels writer", e); } } @@ -127,6 +157,11 @@ public long getFileId() return this.file.getId(); } + public String getFileName() + { + return this.file.getName(); + } + public void setLastBlockId(long lastBlockId) { this.lastBlockId = lastBlockId; @@ -142,29 +177,76 @@ public long getLastBlockId() return this.lastBlockId; } - public void addRowBatch(VectorizedRowBatch rowBatch) throws RetinaException + public int getVirtualNodeId() { - try - { - this.writer.addRowBatch(rowBatch); - } catch 
(IOException e) + return this.virtualNodeId; + } + + public synchronized void includeRowId(long rowId) + { + this.fileMinRowId = Math.min(this.fileMinRowId, rowId); + this.fileMaxRowId = Math.max(this.fileMaxRowId, rowId); + } + + public synchronized boolean hasRowIds() + { + return this.fileMinRowId != Long.MAX_VALUE && this.fileMaxRowId != Long.MIN_VALUE; + } + + public boolean isPhysicalClosed() + { + return this.physicalClosed; + } + + public boolean isIndexFlushed() + { + return this.indexFlushed; + } + + void markIndexFlushed() + { + this.indexFlushed = true; + } + + public synchronized File getFileSnapshot() throws RetinaException + { + if (!hasRowIds()) { - throw new RetinaException("Failed to add rowBatch to pixels writer", e); + throw new RetinaException("Cannot create file snapshot without row-id hull: fileId=" + getFileId()); } + File snapshot = new File(); + snapshot.setId(this.file.getId()); + snapshot.setName(this.file.getName()); + snapshot.setType(this.file.getType()); + snapshot.setNumRowGroup(this.file.getNumRowGroup()); + snapshot.setMinRowId(this.fileMinRowId); + snapshot.setMaxRowId(this.fileMaxRowId); + snapshot.setPathId(this.file.getPathId()); + return snapshot; } /** - * Create a background thread to write the block of data stored in shared storage to a file. + * Replay object blocks and physically close the writer. + * Idempotent after success; failed closes rethrow the cached failure. 
*/ - public CompletableFuture finish() + public synchronized void finish() throws RetinaException { - CompletableFuture future = new CompletableFuture<>(); + if (this.physicalCloseFailure != null) + { + throw this.physicalCloseFailure; + } + if (this.physicalClosed) + { + return; + } - new Thread(() -> { - try { + try + { + if (this.lastBlockId >= this.firstBlockId) + { + ObjectStorageManager objectStorageManager = ObjectStorageManager.Instance(); for (long blockId = firstBlockId; blockId <= lastBlockId; ++blockId) { - ObjectStorageManager objectStorageManager = ObjectStorageManager.Instance(); /* * Issue-1083: Since we obtain a read-only ByteBuffer from the S3 Reader, * we cannot read a byte[]. Instead, we should return the ByteBuffer directly. @@ -172,20 +254,47 @@ public CompletableFuture finish() ByteBuffer data = objectStorageManager.read(this.tableId, virtualNodeId, blockId); this.writer.addRowBatch(VectorizedRowBatch.deserialize(data)); } - this.writer.close(); - - // Update the file's type. - this.file.setType(File.Type.REGULAR); - MetadataService metadataService = MetadataService.Instance(); - metadataService.updateFile(this.file); - - future.complete(null); - } catch (Exception e) - { - future.completeExceptionally(e); } - }).start(); + this.writer.close(); + this.physicalClosed = true; + } catch (Exception e) + { + RetinaException wrapped = new RetinaException( + "Failed to physically close ingest file " + this.file.getId(), e); + this.physicalCloseFailure = wrapped; + throw wrapped; + } + } - return future; + /** + * Discard a zero-data ingest file by aborting the writer and removing metadata. + * The caller deletes any half-written physical bytes before calling this. + * Must not be called after {@link #finish()}. 
+ */ + public synchronized void discard() throws RetinaException + { + if (isPhysicalClosed()) + { + throw new RetinaException( + "Cannot discard a physically closed FileWriterManager, fileId=" + getFileId()); + } + try + { + this.writer.abort(); + } + catch (Exception e) + { + logger.warn("FileWriterManager.discard: writer abort failed, fileId={}", getFileId(), e); + } + try + { + MetadataService.Instance().deleteFiles(Collections.singletonList(this.file.getId())); + } + catch (MetadataException e) + { + throw new RetinaException( + "Failed to delete TEMPORARY_INGEST file metadata, fileId=" + getFileId(), e); + } + RetinaResourceManager.Instance().removeVisibility(this.file.getId()); } } diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/IngestFilePublisher.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/IngestFilePublisher.java new file mode 100644 index 0000000000..c9ec95be4e --- /dev/null +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/IngestFilePublisher.java @@ -0,0 +1,105 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.retina; + +import io.pixelsdb.pixels.common.exception.RetinaException; + +import java.util.ArrayList; +import java.util.List; +import java.util.NavigableMap; +import java.util.TreeMap; + +/** + * Publishes prepared ingest files in stream-append order. + *
<p>
+ * The scheduled fast-path inside {@link PixelsWriteBuffer} already drains + * {@code fileWriterManagers} in FIFO order on a single thread, so admission + * naturally arrives sorted by {@code firstBlockId}. This class is what keeps + * the ordering invariant intact on the {@code close()} path, where multiple + * drivers (the scheduler and the buffer's close thread) may race to admit + * the same manager: every publish action runs synchronously inside the + * monitor, and admissions whose predecessor has not yet been published are + * parked in {@link #readyFiles} until the head of the run is publishable. + */ +final class IngestFilePublisher +{ + interface PublishAction + { + void publish(FileWriterManager fileWriterManager) throws RetinaException; + } + + private final NavigableMap readyFiles = new TreeMap<>(); + private long nextCommitFirstBlockId; + + IngestFilePublisher(long nextCommitFirstBlockId) + { + this.nextCommitFirstBlockId = nextCommitFirstBlockId; + } + + /** + * The {@code firstBlockId} of the next FileWriterManager waiting to be + * published. Since block ids are assigned monotonically and commit + * timestamps are monotonic across blocks, this is the block whose + * minimum ts equals the buffer's earliest not-yet-published commit ts. + */ + synchronized long getNextCommitFirstBlockId() + { + return this.nextCommitFirstBlockId; + } + + synchronized List admitReady(FileWriterManager fileWriterManager, + PublishAction publishAction) throws RetinaException + { + long firstBlockId = fileWriterManager.getFirstBlockId(); + if (firstBlockId < this.nextCommitFirstBlockId) + { + // Already published in a previous admission. Re-admission is a + // benign no-op so that callers (the scheduler and the close() + // driver) can both attempt to publish without coordinating. 
+ return new ArrayList<>(); + } + + FileWriterManager existing = this.readyFiles.putIfAbsent(firstBlockId, fileWriterManager); + if (existing != null && existing != fileWriterManager) + { + throw new RetinaException("Conflicting ingest file publisher admission for firstBlockId=" + firstBlockId); + } + + return publishReadyPrefix(publishAction); + } + + private List publishReadyPrefix(PublishAction publishAction) throws RetinaException + { + List published = new ArrayList<>(); + while (true) + { + FileWriterManager next = this.readyFiles.get(this.nextCommitFirstBlockId); + if (next == null) + { + return published; + } + + publishAction.publish(next); + this.readyFiles.remove(this.nextCommitFirstBlockId); + this.nextCommitFirstBlockId = next.getLastBlockId() + 1; + published.add(next); + } + } +} diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/MemTable.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/MemTable.java index e3d3004296..cefa83c90f 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/MemTable.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/MemTable.java @@ -21,6 +21,7 @@ import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.vector.LongColumnVector; import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; /** @@ -93,6 +94,38 @@ public int getLength() return this.length; } + public synchronized int getSize() + { + return this.rowBatch.size; + } + + /** + * Minimum commit timestamp over the appended rows, derived from the hidden + * timestamp column. Same-stream input is monotonically increasing by + * contract, so the first appended row carries the minimum. 
+ */ + public synchronized long getMinCommitTs() + { + if (this.rowBatch.size == 0) + { + return Long.MAX_VALUE; + } + return ((LongColumnVector) this.rowBatch.cols[this.schema.getChildren().size()]).vector[0]; + } + + /** + * Maximum commit timestamp over the appended rows, derived from the hidden + * timestamp column rather than a separately maintained field. + */ + public synchronized long getMaxCommitTs() + { + if (this.rowBatch.size == 0) + { + return Long.MIN_VALUE; + } + return ((LongColumnVector) this.rowBatch.cols[this.schema.getChildren().size()]).vector[this.rowBatch.size - 1]; + } + public VectorizedRowBatch getRowBatch() { return this.rowBatch; diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/ObjectEntry.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/ObjectEntry.java index 1820b258ea..6df4a0fa61 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/ObjectEntry.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/ObjectEntry.java @@ -31,13 +31,19 @@ public class ObjectEntry implements Referenceable private final long fileId; private final int startIndex; private final int length; + /** + * Min commit timestamp captured from the source memtable at flush time. + * {@link Long#MAX_VALUE} indicates "no rows captured". 
+ */ + private final long minCommitTs; - public ObjectEntry(long id, long fileId, int startIndex, int length) + public ObjectEntry(long id, long fileId, int startIndex, int length, long minCommitTs) { this.id = id; this.fileId = fileId; this.startIndex = startIndex; this.length = length; + this.minCommitTs = minCommitTs; } public long getId() @@ -60,6 +66,11 @@ public int getLength() return this.length; } + public long getMinCommitTs() + { + return this.minCommitTs; + } + @Override public void ref() { diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/PixelsWriteBuffer.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/PixelsWriteBuffer.java index 0b9b47c80f..799e487cbf 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/PixelsWriteBuffer.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/PixelsWriteBuffer.java @@ -26,6 +26,7 @@ import io.pixelsdb.pixels.common.index.service.IndexServiceProvider; import io.pixelsdb.pixels.common.index.RowIdAllocator; import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; import io.pixelsdb.pixels.common.metadata.domain.Path; import io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex; import io.pixelsdb.pixels.common.physical.Storage; @@ -37,13 +38,13 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.io.IOException; import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.stream.Collectors; -import java.util.stream.LongStream; import static com.google.common.base.Preconditions.checkArgument; @@ -102,6 +103,7 @@ public class PixelsWriteBuffer // backend flush thread private final ExecutorService flushObjectExecutor; + // Single-threaded by design: it serializes file publishing and FileWriterManager 
physical close initialization. private final ScheduledExecutorService flushFileExecutor; private ScheduledFuture flushFileFuture; @@ -111,6 +113,7 @@ public class PixelsWriteBuffer private int currentMemTableCount; private final Queue fileWriterManagers; private FileWriterManager currentFileWriterManager; + private IngestFilePublisher ingestFilePublisher; /** * Issue #1254: Multi-threaded flush @@ -160,6 +163,7 @@ public PixelsWriteBuffer(long tableId, TypeDescription schema, Path targetOrdere this.objectEntries = new ArrayList<>(); this.flushObjectExecutor = Executors.newFixedThreadPool(Integer.parseInt(configFactory.getProperty("retina.buffer.object.flush.threads"))); + // Keep file publishing serialized: physical close, index flush, metadata publish, and cleanup are ordered per stream. this.flushFileExecutor = Executors.newSingleThreadScheduledExecutor(); this.fileWriterManagers = new ConcurrentLinkedQueue<>(); @@ -175,6 +179,7 @@ public PixelsWriteBuffer(long tableId, TypeDescription schema, Path targetOrdere this.targetOrderedStorage, this.memTableSize, this.blockSize, this.replication, this.encodingLevel, this.nullsPadding, idCounter, this.memTableSize * this.maxMemTableCount, retinaHostName, virtualNodeId); + this.ingestFilePublisher = new IngestFilePublisher(this.currentFileWriterManager.getFirstBlockId()); this.activeMemTable = new MemTable(this.idCounter, schema, memTableSize, TypeDescription.Mode.CREATE_INT_VECTOR_FOR_INT, this.currentFileWriterManager.getFileId(), @@ -190,12 +195,17 @@ public PixelsWriteBuffer(long tableId, TypeDescription schema, Path targetOrdere } /** - * Add all column values and timestamp into the buffer. + * Append a row to the active memTable atomically. On return the row is + * query-visible and {@code builder} is populated with its + * {@link IndexProto.RowLocation} for downstream MainIndex / primary index + * writes. 
If those writes fail, the caller MUST compensate by writing an + * RGVisibility delete on that RowLocation; do not try to rewind the append. * - * @param values - * @param timestamp - * @param builder - * @return the unique row identifier (rowId) allocated for the added row + * @param values the column values of the row. + * @param timestamp the commit timestamp of the row. + * @param builder the builder of the row location, populated on return. + * @return the allocated rowId. + * @throws RetinaException if the buffer is fail-closed or rowId allocation fails. */ public long addRow(byte[][] values, long timestamp, IndexProto.RowLocation.Builder builder) throws RetinaException { @@ -207,15 +217,19 @@ public long addRow(byte[][] values, long timestamp, IndexProto.RowLocation.Build long rowId = -1; while (rowOffset < 0) { - currentMemTable = this.activeMemTable; try { synchronized (rowLock) { - // Ensure rgRowOffset and rowId are allocated synchronously to minimize - // fragmentation after MainIndex flush. + currentMemTable = this.activeMemTable; + FileWriterManager appendFileWriterManager = this.currentFileWriterManager; + // Keep row offsets and row IDs aligned for index flush. 
rowOffset = currentMemTable.add(values, timestamp); - rowId = rowIdAllocator.getRowId(); + if (rowOffset >= 0) + { + rowId = rowIdAllocator.getRowId(); + appendFileWriterManager.includeRowId(rowId); + } } } catch (NullPointerException e) { @@ -232,11 +246,11 @@ public long addRow(byte[][] values, long timestamp, IndexProto.RowLocation.Build } } int rgRowOffset = currentMemTable.getStartIndex() + rowOffset; - if(rgRowOffset < 0) + if (rgRowOffset < 0) { throw new RetinaException("Expect rgRowOffset >= 0, get " + rgRowOffset); } - builder.setFileId(activeMemTable.getFileId()) + builder.setFileId(currentMemTable.getFileId()) .setRgId(0) .setRgRowOffset(rgRowOffset); return rowId; @@ -251,39 +265,7 @@ private void switchMemTable() throws RetinaException { return; } - - if (this.currentMemTableCount >= this.maxMemTableCount) - { - this.currentMemTableCount = 0; - this.currentFileWriterManager.setLastBlockId(this.activeMemTable.getId()); - this.fileWriterManagers.add(this.currentFileWriterManager); - this.currentFileWriterManager = new FileWriterManager( - this.tableId, this.schema, - this.targetOrderedDirPath, this.targetOrderedStorage, - this.memTableSize, this.blockSize, this.replication, - this.encodingLevel, this.nullsPadding, this.idCounter, - this.memTableSize * this.maxMemTableCount, this.retinaHostName, virtualNodeId); - } - - /* - * For activeMemTable, at initialization the reference count is 2 because of *this and superVersion - * Here only currentVersion is destroyed, *this is still in use, so only one call to unref() is needed. 
- */ - MemTable oldMemTable = this.activeMemTable; - SuperVersion oldVersion = this.currentVersion; - this.immutableMemTables.add(this.activeMemTable); - this.activeMemTable = new MemTable(this.idCounter, this.schema, - this.memTableSize, TypeDescription.Mode.CREATE_INT_VECTOR_FOR_INT, - this.currentFileWriterManager.getFileId(), - this.currentMemTableCount * this.memTableSize, - this.memTableSize); - this.currentMemTableCount += 1; - this.idCounter++; - - this.currentVersion = new SuperVersion(this.activeMemTable, this.immutableMemTables, this.objectEntries); - oldVersion.unref(); - - triggerFlushToObject(oldMemTable); + retireActiveMemTableLocked(); } catch (Exception e) { throw new RetinaException("Failed to switch memtable", e); @@ -293,8 +275,48 @@ private void switchMemTable() throws RetinaException } } + // Caller must hold versionLock.writeLock(). + private void retireActiveMemTableLocked() throws RetinaException + { + if (this.currentMemTableCount >= this.maxMemTableCount) + { + this.currentMemTableCount = 0; + this.currentFileWriterManager.setLastBlockId(this.activeMemTable.getId()); + this.fileWriterManagers.add(this.currentFileWriterManager); + this.currentFileWriterManager = new FileWriterManager( + this.tableId, this.schema, + this.targetOrderedDirPath, this.targetOrderedStorage, + this.memTableSize, this.blockSize, this.replication, + this.encodingLevel, this.nullsPadding, this.idCounter, + this.memTableSize * this.maxMemTableCount, this.retinaHostName, virtualNodeId); + } + + /* + * For activeMemTable, at initialization the reference count is 2 because of *this and currentVersion + * Here only currentVersion is destroyed, *this is still in use, so only one call to unref() is needed. 
+ */ + MemTable oldMemTable = this.activeMemTable; + SuperVersion oldVersion = this.currentVersion; + this.immutableMemTables.add(this.activeMemTable); + this.activeMemTable = new MemTable(this.idCounter, this.schema, + this.memTableSize, TypeDescription.Mode.CREATE_INT_VECTOR_FOR_INT, + this.currentFileWriterManager.getFileId(), + this.currentMemTableCount * this.memTableSize, + this.memTableSize); + this.currentMemTableCount += 1; + this.idCounter++; + + this.currentVersion = new SuperVersion(this.activeMemTable, this.immutableMemTables, this.objectEntries); + oldVersion.unref(); + + triggerFlushToObject(oldMemTable); + } + private void triggerFlushToObject(MemTable flushMemTable) { + // Capture ts before submitting: the memtable will be unref'd after + // flush, but checkpoint generation still needs its minCommitTs. + long capturedMinCommitTs = flushMemTable.getMinCommitTs(); flushObjectExecutor.submit(() -> { try { @@ -303,7 +325,7 @@ private void triggerFlushToObject(MemTable flushMemTable) this.objectStorageManager.write(this.tableId, virtualNodeId, id, flushMemTable.serialize()); ObjectEntry objectEntry = new ObjectEntry(id, flushMemTable.getFileId(), - flushMemTable.getStartIndex(), flushMemTable.getLength()); + flushMemTable.getStartIndex(), flushMemTable.getSize(), capturedMinCommitTs); objectEntry.ref(); // update watermark @@ -351,6 +373,52 @@ private void triggerFlushToObject(MemTable flushMemTable) }); } + public long getTableId() + { + return this.tableId; + } + + public int getVirtualNodeId() + { + return this.virtualNodeId; + } + + /** + * Earliest not-yet-published commit timestamp seen by this buffer. 
+ */ + public long getEarliestPendingMinTs() + { + long nextBlockId = this.ingestFilePublisher.getNextCommitFirstBlockId(); + SuperVersion sv = getCurrentVersion(); + try + { + for (ObjectEntry oe : sv.getObjectEntries()) + { + if (oe.getId() == nextBlockId) + { + return oe.getMinCommitTs(); + } + } + for (MemTable mt : sv.getImmutableMemTables()) + { + if (mt.getId() == nextBlockId) + { + return mt.getMinCommitTs(); + } + } + MemTable activeMt = sv.getActiveMemTable(); + if (activeMt != null && activeMt.getId() == nextBlockId) + { + return activeMt.getMinCommitTs(); + } + return Long.MAX_VALUE; + } + finally + { + sv.unref(); + } + } + /** * Get the current version. * Caller must call unref(). @@ -368,6 +436,77 @@ public SuperVersion getCurrentVersion() } } + private List publishFinishedFile(FileWriterManager fileWriterManager) throws RetinaException + { + try + { + fileWriterManager.finish(); + + if (!fileWriterManager.isIndexFlushed()) + { + if (this.index == null) + { + this.index = MetadataService.Instance().getPrimaryIndex(tableId); + if (this.index == null) + { + throw new RetinaException("Primary index not found for table " + tableId); + } + } + + boolean flushed = IndexServiceProvider.getService(IndexServiceProvider.ServiceMode.local) + .flushIndexEntriesOfFile( + tableId, index.getId(), fileWriterManager.getFileId(), true, indexOption); + if (!flushed) + { + throw new RetinaException("Failed to flush main index for ingest file " + + fileWriterManager.getFileId()); + } + fileWriterManager.markIndexFlushed(); + } + } catch (IndexException e) + { + throw new RetinaException("Failed to flush main index for ingest file " + + fileWriterManager.getFileId(), e); + } catch (MetadataException e) + { + throw new RetinaException("Failed to load primary index for table " + tableId, e); + } + return this.ingestFilePublisher.admitReady(fileWriterManager, this::publishPreparedFile); + } + + private void publishPreparedFile(FileWriterManager fileWriterManager) throws 
RetinaException + { + try + { + if (!fileWriterManager.isPhysicalClosed()) + { + throw new RetinaException("Cannot publish ingest file before physical close: fileId=" + + fileWriterManager.getFileId()); + } + if (!fileWriterManager.isIndexFlushed()) + { + throw new RetinaException("Cannot publish ingest file before main index flush: fileId=" + + fileWriterManager.getFileId()); + } + if (!fileWriterManager.hasRowIds()) + { + throw new RetinaException("Cannot publish ingest file without row-id hull: fileId=" + + fileWriterManager.getFileId()); + } + File regularFile = fileWriterManager.getFileSnapshot(); + regularFile.setType(File.Type.REGULAR); + if (!MetadataService.Instance().updateFile(regularFile)) + { + throw new RetinaException("Failed to publish ingest file " + + fileWriterManager.getFileId() + " as REGULAR"); + } + } catch (MetadataException e) + { + throw new RetinaException("Failed to publish ingest file " + + fileWriterManager.getFileId() + " as REGULAR", e); + } + } + /** * Determine whether the last data block managed by fileWriterManager has * been written to Object. 
If it has been written, execute the file write @@ -378,54 +517,19 @@ private void startFlushObjectToFileScheduler(long intervalSeconds) this.flushFileFuture = this.flushFileExecutor.scheduleWithFixedDelay(() -> { try { - if(index == null) - { - try - { - index = MetadataService.Instance().getPrimaryIndex(tableId); - } catch (MetadataException ignored) - { - logger.warn("There isn't primary index on table {}", tableId); - } - } - Iterator iterator = this.fileWriterManagers.iterator(); while (iterator.hasNext()) { FileWriterManager fileWriterManager = iterator.next(); - if (fileWriterManager.getLastBlockId() <= this.continuousFlushedId.get()) + if (fileWriterManager.getLastBlockId() > this.continuousFlushedId.get()) { - CompletableFuture finished = fileWriterManager.finish(); - iterator.remove(); - - // update super version - this.versionLock.writeLock().lock(); - Set idsToRemove = LongStream.rangeClosed(fileWriterManager.getFirstBlockId(), - fileWriterManager.getLastBlockId()).boxed().collect(Collectors.toSet()); - List toRemove = this.objectEntries.stream() - .filter(objectEntry -> idsToRemove.contains(objectEntry.getId())) - .collect(Collectors.toList()); - - this.objectEntries.removeAll(toRemove); - - SuperVersion oldVersion = this.currentVersion; - this.currentVersion = new SuperVersion(this.activeMemTable, this.immutableMemTables, this.objectEntries); - oldVersion.unref(); - this.versionLock.writeLock().unlock(); - - finished.get(); - if(index != null) - { - IndexServiceProvider.getService(IndexServiceProvider.ServiceMode.local) - .flushIndexEntriesOfFile(tableId, index.getId(), fileWriterManager.getFileId(), true, indexOption); - } - for (ObjectEntry objectEntry : toRemove) - { - if (objectEntry.unref()) - { - this.objectStorageManager.delete(this.tableId, virtualNodeId, objectEntry.getId()); - } - } + break; + } + List publishedFiles = publishFinishedFile(fileWriterManager); + for (FileWriterManager publishedFile : publishedFiles) + { + 
this.fileWriterManagers.remove(publishedFile); + cleanupPublishedObjects(publishedFile.getFirstBlockId(), publishedFile.getLastBlockId()); } } } catch (Exception e) @@ -435,25 +539,46 @@ private void startFlushObjectToFileScheduler(long intervalSeconds) }, 0, intervalSeconds, TimeUnit.SECONDS); } - /** - * Gracefully close the writer buffer, ensuring all in-memory data is persisted. - */ - public void close() throws RetinaException + private void cleanupPublishedObjects(long firstBlockId, long lastBlockId) throws RetinaException { - // First, shut down the flush process to prevent changes to the data view. - this.flushObjectExecutor.shutdown(); + if (lastBlockId < firstBlockId) + { + return; + } + + List toRemove; + this.versionLock.writeLock().lock(); try { - if (!this.flushObjectExecutor.awaitTermination(60, TimeUnit.SECONDS)) + toRemove = this.objectEntries.stream() + .filter(objectEntry -> objectEntry.getId() >= firstBlockId && objectEntry.getId() <= lastBlockId) + .collect(Collectors.toList()); + this.objectEntries.removeAll(toRemove); + + SuperVersion oldVersion = this.currentVersion; + this.currentVersion = new SuperVersion( + this.activeMemTable, this.immutableMemTables, this.objectEntries); + oldVersion.unref(); + } finally + { + this.versionLock.writeLock().unlock(); + } + + for (ObjectEntry objectEntry : toRemove) + { + if (objectEntry.unref()) { - this.flushObjectExecutor.shutdownNow(); + this.objectStorageManager.delete(this.tableId, virtualNodeId, objectEntry.getId()); } - } catch (InterruptedException e) - { - this.flushObjectExecutor.shutdownNow(); - Thread.currentThread().interrupt(); - throw new RetinaException("Close process was interrupted while waiting for flushObjectExecutor", e); } + } + + public void close() throws RetinaException + { + // The caller (RetinaServer / RetinaResourceManager shutdown path) is + // responsible for quiescing append traffic before invoking close(). 
+ // There is no buffer-internal "append-to-publish" window to drain. + // Stop scheduled publishing before the driver thread publishes leftovers. if (this.flushFileFuture != null) { this.flushFileFuture.cancel(false); @@ -463,94 +588,102 @@ public void close() throws RetinaException { if (!this.flushFileExecutor.awaitTermination(60, TimeUnit.SECONDS)) { - this.flushFileExecutor.shutdownNow(); + logger.warn("Close timed out waiting for flushFileExecutor to drain; proceeding"); } - } catch (InterruptedException e) + } + catch (InterruptedException e) { - this.flushFileExecutor.shutdownNow(); Thread.currentThread().interrupt(); - throw new RetinaException("Close process was interrupted while waiting for flushDiskExecutor", e); + throw new RetinaException("Close process was interrupted while waiting for flushFileExecutor", e); } - SuperVersion sv = getCurrentVersion(); - List> futures = new ArrayList<>(); + // Retire non-empty active data so file close only replays ObjectEntry bytes. + this.versionLock.writeLock().lock(); try { - long maxObjectKey = this.continuousFlushedId.get(); - - // process current fileWriterManager - this.currentFileWriterManager.setLastBlockId(maxObjectKey); - this.currentFileWriterManager.addRowBatch(sv.getActiveMemTable().getRowBatch()); - long firstBlockId = this.currentFileWriterManager.getFirstBlockId(); - Iterator iterator = sv.getImmutableMemTables().iterator(); - while (iterator.hasNext()) + if (!this.activeMemTable.isEmpty()) { - MemTable immutableMemtable = iterator.next(); - if (immutableMemtable.getId() >= firstBlockId) - { - this.currentFileWriterManager.addRowBatch(immutableMemtable.getRowBatch()); - iterator.remove(); - } + retireActiveMemTableLocked(); } - this.currentFileWriterManager.finish().get(); + } + finally + { + this.versionLock.writeLock().unlock(); + } - // process the remaining fileWriterManager - for (FileWriterManager fileWriterManager : this.fileWriterManagers) + // Let submitted object flushes finish; never 
interrupt in-flight uploads. + this.flushObjectExecutor.shutdown(); + try + { + if (!this.flushObjectExecutor.awaitTermination(60, TimeUnit.SECONDS)) { - firstBlockId = fileWriterManager.getFirstBlockId(); - long lastBlockId = fileWriterManager.getLastBlockId(); + logger.warn("Close timed out waiting for flushObjectExecutor to drain; proceeding"); + } + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + throw new RetinaException("Close process was interrupted while waiting for flushObjectExecutor", e); + } - // all written to object - if (lastBlockId <= maxObjectKey) - { - futures.add(fileWriterManager.finish()); - } else + // Publish files with rows; discard an empty current ingest file. + if (this.currentFileWriterManager != null) + { + if (this.currentFileWriterManager.hasRowIds()) + { + this.currentFileWriterManager.setLastBlockId(this.continuousFlushedId.get()); + this.fileWriterManagers.add(this.currentFileWriterManager); + } + else + { + FileWriterManager zeroDataFwm = this.currentFileWriterManager; + String filePath = this.targetOrderedDirPath.getUri() + "/" + + zeroDataFwm.getFileName(); + try { - // process elements in immutable memTable - iterator = sv.getImmutableMemTables().iterator(); - while (iterator.hasNext()) + if (this.targetOrderedStorage.exists(filePath)) { - MemTable immutableMemtable = iterator.next(); - long id = immutableMemtable.getId(); - if (id >= firstBlockId && id <= lastBlockId) - { - fileWriterManager.addRowBatch(immutableMemtable.getRowBatch()); - iterator.remove(); - } + this.targetOrderedStorage.delete(filePath, false); } - - // elements in object will be processed in finish() later - fileWriterManager.setLastBlockId(maxObjectKey); - futures.add(fileWriterManager.finish()); + } + catch (IOException e) + { + logger.warn("Close failed to delete half-written bytes of empty FileWriterManager fileId={}, path={}; continuing", + zeroDataFwm.getFileId(), filePath, e); + } + try + { + zeroDataFwm.discard(); + } 
+ catch (RetinaException e) + { + logger.warn("Close failed to discard empty current FileWriterManager fileId={}; continuing", + zeroDataFwm.getFileId(), e); } } + this.currentFileWriterManager = null; + } - CompletableFuture all = CompletableFuture.allOf( - futures.toArray(new CompletableFuture[0]) - ); - all.get(15, TimeUnit.SECONDS); - } catch (InterruptedException e) + SuperVersion sv = getCurrentVersion(); + try { - Thread.currentThread().interrupt(); - throw new RetinaException("Data persistence was interrupted during close", e); - } catch (Exception e) + for (FileWriterManager fwm : new ArrayList<>(this.fileWriterManagers)) + { + List published = publishFinishedFile(fwm); + for (FileWriterManager publishedFile : published) + { + this.fileWriterManagers.remove(publishedFile); + cleanupPublishedObjects(publishedFile.getFirstBlockId(), publishedFile.getLastBlockId()); + } + } + } + catch (Exception e) { - throw new RetinaException("Failed to persist data during close operation. Data may be lost", e); - } finally + throw new RetinaException("Failed to publish ingest files during close", e); + } + finally { sv.unref(); - currentVersion.unref(); - activeMemTable.unref(); - for (MemTable immutableMemTable: sv.getImmutableMemTables()) - { - immutableMemTable.unref(); - } - - for (ObjectEntry objectEntry : sv.getObjectEntries()) - { - objectEntry.unref(); - this.objectStorageManager.delete(this.tableId, virtualNodeId, objectEntry.getId()); - } } } } diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java index 1816f262d5..6b4696e7d1 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RGVisibility.java @@ -34,6 +34,54 @@ public class RGVisibility implements AutoCloseable { private static final Logger logger = LogManager.getLogger(RGVisibility.class); + + /** + * Selects how a 
visibility DELETE should be applied. + * + *

The modes separate the timestamp semantics from the lifecycle concurrency + * guarantees. NORMAL is the live fast path and only appends to the delete chain. + * VERSIONED is for replay while READY readers may be active, so historical + * deletes fold into baseBitmap through copy-on-write. EXCLUSIVE is for the + * RECOVERING replay window where readers and GC are blocked; historical deletes + * may fold into baseBitmap in place, with native writer synchronization.

+ */ + public enum ReplayMode + { + /** + * Normal live apply. The caller is expected to provide delete timestamps + * newer than the current baseTimestamp, so native code appends the delete + * record to the timestamped chain and does not inspect baseBitmap first. + */ + NORMAL(0), + + /** + * Replay while concurrent readers may exist, for example READY backlog + * catchup. Deletes with timestamp <= baseTimestamp are folded into + * baseBitmap by publishing a new version; newer deletes append to the chain. + */ + VERSIONED(1), + + /** + * Replay in an exclusive recovery window. Query and GC readers must be + * blocked, but multiple recovery writers may still run; native code uses a + * tile-level writer lock and folds historical deletes into baseBitmap in + * place. + */ + EXCLUSIVE(2); + + private final int code; + + ReplayMode(int code) + { + this.code = code; + } + + int code() + { + return code; + } + } + static { String pixelsHome = System.getenv("PIXELS_HOME"); @@ -93,7 +141,7 @@ public void close() // native methods private native long createNativeObject(long rgRecordNum, long timestamp, long[] bitmap); private native void destroyNativeObject(long nativeHandle); - private native void deleteRecord(int rgRowOffset, long timestamp, long nativeHandle); + private native void deleteRecord(int rgRowOffset, long timestamp, long nativeHandle, int replayMode); private native long[] getVisibilityBitmap(long timestamp, long nativeHandle); private native long[] garbageCollect(long timestamp, long nativeHandle); private native long[] exportChainItemsAfter(long safeGcTs, long nativeHandle); @@ -103,10 +151,16 @@ public void close() private static native long getRetinaObjectCount(); public void deleteRecord(int rgRowOffset, long timestamp) + { + deleteRecord(rgRowOffset, timestamp, ReplayMode.NORMAL); + } + + public void deleteRecord(int rgRowOffset, long timestamp, ReplayMode replayMode) { long handle = nativeHandle.get(); if (handle == 0) throw new 
IllegalStateException("RGVisibility is closed"); - deleteRecord(rgRowOffset, timestamp, handle); + if (replayMode == null) throw new IllegalArgumentException("replayMode is null"); + deleteRecord(rgRowOffset, timestamp, handle, replayMode.code()); } public long[] getVisibilityBitmap(long timestamp) diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RecoveryCheckpoint.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RecoveryCheckpoint.java new file mode 100644 index 0000000000..05d0619ade --- /dev/null +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RecoveryCheckpoint.java @@ -0,0 +1,632 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.retina; + +import io.etcd.jetcd.KeyValue; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.common.utils.Constants; +import io.pixelsdb.pixels.common.utils.EtcdUtil; +import io.pixelsdb.pixels.common.utils.NetUtils; +import io.pixelsdb.pixels.common.utils.RetinaUtils; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +/** + * Single owner of the recovery-checkpoint contract for a Retina host: + * binary format, value object ({@link Body} with its entry POJOs), and + * the etcd-pointer + Storage IO protocol that publishes and loads + * bodies. Catalog reconciliation, replay-start computation, and orphan + * retirement are not this class's concern; see {@code RetinaServerImpl}. + *

+ * High-level surface: + *

    + *
  • {@link #generate(long, List, List)} — given pre-collected + * {@code rgEntries} and {@code segments} captured by the caller at + * {@code checkpointAppliedTs}, sort canonically, serialise a body, + * write it through {@link Storage}, and publish the per-host etcd + * pointer via CAS. Idempotent across rounds: a no-op when + * {@code checkpointAppliedTs} has not advanced since the last + * successful round. Pure transform + IO; never reads back into + * {@code RetinaResourceManager}.
  • + *
  • {@link #load()} — read the etcd pointer, fetch the body it + * references, and run minimal header-level acceptability checks + * (matching {@code retinaNodeId}, sane {@code checkpointAppliedTs}, + * and a {@code virtualNodesPerNode} match). Returns {@code null} + * only when the pointer is absent (the sole legitimate + * fresh-deployment signal); once the pointer exists every failure + * fails closed by throwing, so a transient read error or corrupted + * body is never mistaken for "no checkpoint".
  • + *
  • {@link Body#writeTo(DataOutputStream)} / {@link Body#readFrom(byte[])} — the + * on-disk format codec; integrity is delegated to the underlying storage layer.
  • + *
+ */ +public final class RecoveryCheckpoint +{ + private static final Logger logger = LogManager.getLogger(RecoveryCheckpoint.class); + + private static final int MAGIC = 0x5052434B; + + // ============================================================ + // Section 1 — Configuration / IO state + // ============================================================ + + private final Storage storage; + private final String checkpointDir; + private final EtcdUtil etcd; + private final int virtualNodesPerNode; + private final String retinaNodeId; + private final String pointerKey; + /** Last checkpointAppliedTs that was successfully persisted; -1 before the first round. */ + private long lastCheckpointAppliedTs = -1L; + + RecoveryCheckpoint(Storage storage, + String checkpointDir, + EtcdUtil etcd, + int virtualNodesPerNode, + String retinaNodeId) + { + this.storage = storage; + this.checkpointDir = checkpointDir; + this.etcd = etcd; + this.virtualNodesPerNode = virtualNodesPerNode; + this.retinaNodeId = retinaNodeId; + this.pointerKey = "/pixels/retina/recovery/checkpoint/" + retinaNodeId + "/current"; + } + + /** + * Build a recovery checkpoint from the running configuration (shared + * {@link EtcdUtil#Instance()}, storage resolved from + * {@code retina.recovery.checkpoint.dir}, vnode count from + * {@code node.virtual.num}). The local hostname is used as the + * per-host retinaNodeId. 
+     */
+    public static RecoveryCheckpoint createFromConfig() throws RetinaException
+    {
+        ConfigFactory config = ConfigFactory.Instance();
+
+        // The node id is embedded in the etcd pointer key and Body.Builder#build()
+        // rejects a null/empty value; fail here instead of on the first generate().
+        String retinaNodeId = NetUtils.getLocalHostName();
+        if (retinaNodeId == null || retinaNodeId.isEmpty())
+        {
+            throw new RetinaException("Cannot determine local host name for the recovery checkpoint node id");
+        }
+
+        String checkpointDir = config.getProperty("retina.recovery.checkpoint.dir");
+        if (checkpointDir == null || checkpointDir.trim().isEmpty())
+        {
+            throw new RetinaException("retina.recovery.checkpoint.dir is not configured");
+        }
+        Storage storage;
+        try
+        {
+            storage = StorageFactory.Instance().getStorage(checkpointDir);
+        }
+        catch (IOException e)
+        {
+            throw new RetinaException("Failed to resolve storage for " + checkpointDir, e);
+        }
+
+        // Integer.parseInt throws the unchecked NumberFormatException for a
+        // missing (null) or non-numeric value. Convert it to the declared
+        // RetinaException so callers that fail closed on RetinaException
+        // see configuration errors too.
+        String virtualNodeNum = config.getProperty("node.virtual.num");
+        int virtualNodesPerNode;
+        try
+        {
+            virtualNodesPerNode = Integer.parseInt(virtualNodeNum);
+        }
+        catch (NumberFormatException e)
+        {
+            throw new RetinaException("Invalid node.virtual.num configuration: " + virtualNodeNum, e);
+        }
+
+        return new RecoveryCheckpoint(
+                storage,
+                checkpointDir,
+                EtcdUtil.Instance(),
+                virtualNodesPerNode,
+                retinaNodeId);
+    }
+
+    /** @return the vnode count this checkpoint instance was constructed with. */
+    public int getVirtualNodesPerNode()
+    {
+        return virtualNodesPerNode;
+    }
+
+    // ============================================================
+    // Section 2 — Entry POJOs serialised inside a body
+    // ============================================================
+
+    /**
+     * Per-scope earliest unsafe-insert commit timestamp captured at
+     * checkpoint time: the smallest commit ts across the scope's
+     * pending/open {@link io.pixelsdb.pixels.retina.FileWriterManager}s.
+     * Already-published REGULAR files are not tracked separately in the
+     * body; their {@code fileId} appears in {@link VisibilityEntry} and
+     * that is the only ingest-path identity recovery needs.
+     */
+    public static final class PendingSegmentEntry
+    {
+        private final int virtualNodeId;
+        private final long minCommitTs;
+
+        public PendingSegmentEntry(int virtualNodeId, long minCommitTs)
+        {
+            this.virtualNodeId = virtualNodeId;
+            this.minCommitTs = minCommitTs;
+        }
+
+        public int getVirtualNodeId() { return virtualNodeId; }
+        public long getMinCommitTs() { return minCommitTs; }
+    }
+
+    /**
+     * One {@code (fileId, rgId, bitmap)} entry captured by the recovery
+     * checkpoint.
The bitmap folds every delete with + * {@code delete_ts <= baseTimestamp} into the base, so the loader can + * rebuild RGVisibility with an empty deletion chain. + */ + public static final class VisibilityEntry + { + private final long fileId; + private final int rgId; + private final int recordNum; + private final long baseTimestamp; + private final long[] bitmap; + + public VisibilityEntry(long fileId, int rgId, int recordNum, + long baseTimestamp, long[] bitmap) + { + this.fileId = fileId; + this.rgId = rgId; + this.recordNum = recordNum; + this.baseTimestamp = baseTimestamp; + this.bitmap = bitmap; + } + + public long getFileId() { return fileId; } + public int getRgId() { return rgId; } + public int getRecordNum() { return recordNum; } + public long getBaseTimestamp() { return baseTimestamp; } + public long[] getBitmap() { return bitmap; } + } + + // ============================================================ + // Section 3 — Body value object + format codec + // ============================================================ + + /** + * Immutable in-memory representation of one checkpoint body. + * Use {@link Body#builder()} to construct, {@link #writeTo(DataOutputStream)} to + * write, and {@link #readFrom(byte[])} to parse. + */ + public static final class Body + { + private final long writeTimeMs; + private final long checkpointAppliedTs; + /** Value of {@code node.virtual.num} at checkpoint time; mismatch aborts recovery. */ + private final int virtualNodesPerNode; + /** Original retinaNodeId string, stored for diagnostics. 
*/ + private final String retinaNodeId; + + private final List segmentEntries; + private final List rgEntries; + + private Body(Builder builder) + { + this.writeTimeMs = builder.writeTimeMs; + this.checkpointAppliedTs = builder.checkpointAppliedTs; + this.virtualNodesPerNode = builder.virtualNodesPerNode; + this.retinaNodeId = builder.retinaNodeId; + this.segmentEntries = Collections.unmodifiableList(new ArrayList<>( + builder.segmentEntries == null ? Collections.emptyList() : builder.segmentEntries)); + this.rgEntries = Collections.unmodifiableList(new ArrayList<>( + builder.rgEntries == null ? Collections.emptyList() : builder.rgEntries)); + } + + public long getWriteTimeMs() { return writeTimeMs; } + public long getCheckpointAppliedTs() { return checkpointAppliedTs; } + public int getVirtualNodesPerNode() { return virtualNodesPerNode; } + public String getRetinaNodeId() { return retinaNodeId; } + public List getSegmentEntries() { return segmentEntries; } + public List getRgEntries() { return rgEntries; } + + public void writeTo(DataOutputStream out) throws IOException + { + out.writeInt(MAGIC); + out.writeLong(writeTimeMs); + out.writeLong(checkpointAppliedTs); + out.writeInt(virtualNodesPerNode); + out.writeInt(segmentEntries.size()); + out.writeInt(rgEntries.size()); + + byte[] nodeIdBytes = retinaNodeId.getBytes(StandardCharsets.UTF_8); + out.writeInt(nodeIdBytes.length); + out.write(nodeIdBytes); + + for (PendingSegmentEntry se : segmentEntries) + { + out.writeInt(se.virtualNodeId); + out.writeLong(se.minCommitTs); + } + + for (VisibilityEntry ve : rgEntries) + { + out.writeLong(ve.fileId); + out.writeInt(ve.rgId); + out.writeInt(ve.recordNum); + out.writeLong(ve.baseTimestamp); + long[] bitmap = ve.bitmap; + int bitmapLen = bitmap == null ? 0 : bitmap.length; + out.writeInt(bitmapLen); + for (int i = 0; i < bitmapLen; i++) + { + out.writeLong(bitmap[i]); + } + } + } + + /** + * Parse the supplied bytes. 
Throws {@link RetinaException} on + * magic mismatch or malformed content. + */ + public static Body readFrom(byte[] bytes) throws RetinaException + { + if (bytes == null || bytes.length == 0) + { + throw new RetinaException("body is empty"); + } + + try (DataInputStream dis = new DataInputStream(new ByteArrayInputStream(bytes))) + { + int magic = dis.readInt(); + if (magic != MAGIC) + { + throw new RetinaException("bad magic: " + Integer.toHexString(magic)); + } + long writeTimeMs = dis.readLong(); + long checkpointAppliedTs = dis.readLong(); + int virtualNodesPerNode = dis.readInt(); + int segmentEntryCount = dis.readInt(); + int rgEntryCount = dis.readInt(); + if (segmentEntryCount < 0 || rgEntryCount < 0) + { + throw new RetinaException("negative entry counts"); + } + + int nodeIdLen = dis.readInt(); + if (nodeIdLen < 0 || nodeIdLen > 1024 || nodeIdLen > dis.available()) + { + throw new RetinaException("invalid retinaNodeId length: " + nodeIdLen); + } + byte[] nodeIdBytes = new byte[nodeIdLen]; + dis.readFully(nodeIdBytes); + String retinaNodeId = new String(nodeIdBytes, StandardCharsets.UTF_8); + + List segments = new ArrayList<>(); + for (int i = 0; i < segmentEntryCount; i++) + { + int virtualNodeId = dis.readInt(); + long minCommitTs = dis.readLong(); + segments.add(new PendingSegmentEntry(virtualNodeId, minCommitTs)); + } + List rgs = new ArrayList<>(); + for (int i = 0; i < rgEntryCount; i++) + { + long fileId = dis.readLong(); + int rgId = dis.readInt(); + int recordNum = dis.readInt(); + long baseTimestamp = dis.readLong(); + int bitmapLen = dis.readInt(); + if (rgId < 0 || recordNum < 0 || bitmapLen < 0 || (long) bitmapLen * Long.BYTES > dis.available()) + { + throw new RetinaException("invalid visibility entry for fileId=" + fileId + + ", rgId=" + rgId + ", recordNum=" + recordNum + + ", bitmapLen=" + bitmapLen); + } + long[] bitmap = new long[bitmapLen]; + for (int j = 0; j < bitmapLen; j++) + { + bitmap[j] = dis.readLong(); + } + rgs.add(new 
VisibilityEntry(fileId, rgId, recordNum, baseTimestamp, bitmap)); + } + if (dis.available() != 0) + { + throw new RetinaException("trailing bytes after checkpoint payload: " + dis.available()); + } + + return Body.builder() + .retinaNodeId(retinaNodeId) + .writeTimeMs(writeTimeMs) + .checkpointAppliedTs(checkpointAppliedTs) + .virtualNodesPerNode(virtualNodesPerNode) + .segmentEntries(segments) + .rgEntries(rgs) + .build(); + } + catch (IOException e) + { + throw new RetinaException("failed to parse body", e); + } + } + + public static Builder builder() + { + return new Builder(); + } + + public static final class Builder + { + private long writeTimeMs; + private long checkpointAppliedTs; + private int virtualNodesPerNode; + private String retinaNodeId; + private List segmentEntries = Collections.emptyList(); + private List rgEntries = Collections.emptyList(); + + public Builder writeTimeMs(long writeTimeMs) { this.writeTimeMs = writeTimeMs; return this; } + public Builder checkpointAppliedTs(long ts) { this.checkpointAppliedTs = ts; return this; } + public Builder virtualNodesPerNode(int n) { this.virtualNodesPerNode = n; return this; } + public Builder retinaNodeId(String id) { this.retinaNodeId = id; return this; } + public Builder segmentEntries(List entries) { this.segmentEntries = entries; return this; } + public Builder rgEntries(List entries) { this.rgEntries = entries; return this; } + + public Body build() + { + if (retinaNodeId == null || retinaNodeId.isEmpty()) + { + throw new IllegalArgumentException("retinaNodeId is required"); + } + return new Body(this); + } + } + } + + // ============================================================ + // Section 4 — Load result + // ============================================================ + + /** Body loaded from the etcd pointer. 
*/
+    public static final class LoadedCheckpoint
+    {
+        public final String bodyPath;
+        public final Body body;
+
+        LoadedCheckpoint(String bodyPath, Body body)
+        {
+            this.bodyPath = bodyPath;
+            this.body = body;
+        }
+    }
+
+    // ============================================================
+    // Section 5 — Write path: generate()
+    // ============================================================
+
+    /**
+     * Serialise one checkpoint body, write it through {@link Storage}, and
+     * publish the per-host etcd pointer to it.
+     *
+     * <p>A no-op when {@code checkpointAppliedTs} has not advanced since
+     * the last successful round (no new committed transactions, nothing
+     * to flush). The published body and pointer are logged at INFO.
+     *
+     * <p>The body that the previous pointer referenced is deleted after a
+     * successful publish, unless it is the very path that was just written.
+     *
+     * @param checkpointAppliedTs the safe visibility folding timestamp at which
+     *        the body should be snapshotted; supplied by the caller (typically
+     *        the same value the surrounding GC cycle has just folded against)
+     *        so the body reflects exactly that fold and TransService is not
+     *        re-read here.
+     * @param rgEntries per-RG visibility entries already snapshotted by the
+     *        caller against {@code checkpointAppliedTs} (typically collected
+     *        in-line during Memory GC's single pass over RGVisibility, so the
+     *        post-fold bitmap is reused without a second native traversal).
+     *        Sorted in-place to the canonical on-disk order.
+     * @param segments per-scope earliest pending commit timestamps already
+     *        snapshotted by the caller. Sorted in-place.
+     * @throws RetinaException if the body cannot be written or the pointer
+     *         cannot be published
+     */
+    public void generate(long checkpointAppliedTs,
+                         List<VisibilityEntry> rgEntries,
+                         List<PendingSegmentEntry> segments) throws RetinaException
+    {
+        if (checkpointAppliedTs == lastCheckpointAppliedTs)
+        {
+            logger.debug("Recovery checkpoint: checkpointAppliedTs={} unchanged since last round; skipping",
+                    checkpointAppliedTs);
+            return;
+        }
+        long now = System.currentTimeMillis();
+
+        // Canonical on-disk order: (fileId, rgId) for visibility entries,
+        // virtualNodeId for pending segments.
+        rgEntries.sort((a, b) -> {
+            int byFile = Long.compare(a.getFileId(), b.getFileId());
+            if (byFile != 0) return byFile;
+            return Integer.compare(a.getRgId(), b.getRgId());
+        });
+        segments.sort((a, b) -> Integer.compare(a.getVirtualNodeId(), b.getVirtualNodeId()));
+
+        Body body = Body.builder()
+                .retinaNodeId(retinaNodeId)
+                .writeTimeMs(now)
+                .checkpointAppliedTs(checkpointAppliedTs)
+                .virtualNodesPerNode(virtualNodesPerNode)
+                .segmentEntries(segments)
+                .rgEntries(rgEntries)
+                .build();
+
+        String bodyPath = RetinaUtils.buildCheckpointPath(
+                checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_RECOVERY, retinaNodeId, checkpointAppliedTs);
+        try (DataOutputStream out = storage.create(bodyPath, true, Constants.CHECKPOINT_BUFFER_SIZE))
+        {
+            body.writeTo(out);
+            out.flush();
+        }
+        catch (IOException e)
+        {
+            throw new RetinaException("Failed to write recovery checkpoint body " + bodyPath, e);
+        }
+
+        // Body is durable; publish the pointer atomically. If publish fails the
+        // body becomes a one-round orphan and is overwritten/cleaned next round.
+        String displacedPath = publishPointer(bodyPath);
+
+        // lastCheckpointAppliedTs starts at -1 on every new instance, so the
+        // first round after a restart can publish the same checkpointAppliedTs
+        // the pointer already references. The path is built from the same
+        // arguments (presumably deterministically — confirm against
+        // RetinaUtils.buildCheckpointPath), so the "displaced" path can be the
+        // body just written. Deleting it would leave the pointer dangling and
+        // make the next load() fail closed.
+        boolean displacedIsStale = displacedPath != null
+                && !displacedPath.isEmpty()
+                && !displacedPath.equals(bodyPath);
+        if (displacedIsStale)
+        {
+            try
+            {
+                storage.delete(displacedPath, false);
+            }
+            catch (IOException e)
+            {
+                logger.warn("Failed to delete orphan checkpoint body {}; will retry next round",
+                        displacedPath, e);
+            }
+        }
+
+        logger.info("Recovery checkpoint published: body={}, checkpointAppliedTs={}, segments={}, rgs={}",
+                bodyPath, checkpointAppliedTs,
+                segments.size(), rgEntries.size());
+        lastCheckpointAppliedTs = checkpointAppliedTs;
+    }
+
+    /**
+     * Atomically replace the published checkpoint pointer: read the current
+     * pointer value, then compare-and-put {@code newBodyPath} against it.
+     *
+     * @param newBodyPath path of the body that was just written
+     * @return the displaced old body path (null on first publish).
+     * @throws RetinaException if the etcd call throws, or the compare-and-put
+     *         does not commit because the pointer changed after it was read
+     */
+    private String publishPointer(String newBodyPath) throws RetinaException
+    {
+        // NOTE(review): old is null when no pointer exists yet; confirm that
+        // EtcdUtil.compareAndPut treats a null expected value as "key absent".
+        String old = readPointer();
+        boolean committed;
+        try
+        {
+            committed = etcd.compareAndPut(pointerKey, old, newBodyPath);
+        }
+        catch (Exception e)
+        {
+            throw new RetinaException("etcd CAS failed for recovery checkpoint pointer " + pointerKey, e);
+        }
+        if (!committed)
+        {
+            throw new RetinaException("concurrent writer or stale snapshot on recovery checkpoint pointer "
+                    + pointerKey);
+        }
+        return old;
+    }
+
+    // ============================================================
+    // Section 6 — Read path: load()
+    // ============================================================
+
+    /**
+     * Read the etcd pointer and load the body it references. Returns
+     * {@code null} only when the pointer is absent, which is the
+     * sole legitimate fresh-deployment signal (this node has never
+     * checkpointed). Once the pointer exists a checkpoint was definitely
+     * taken, so every subsequent failure — a transient read error, a
+     * corrupted body, a stale vnode mapping, or any other unusable body —
+     * fails closed by throwing. Recovery must abort and retry (or wait for
+     * operator intervention) rather than silently rebuild from scratch,
+     * which would wipe real visibility state and serve dirty reads while
+     * CDC replay catches up.
+     */
+    public LoadedCheckpoint load() throws RetinaException
+    {
+        // An absent (or empty) pointer is the only case that returns null.
+        // NOTE(review): this relies on readPointer() surfacing etcd failures
+        // as exceptions. If EtcdUtil.getKeyValue returns null on an etcd error,
+        // an outage would be indistinguishable from "no checkpoint" — confirm.
+        String bodyPath = readPointer();
+        if (bodyPath == null || bodyPath.isEmpty())
+        {
+            return null;
+        }
+        byte[] bytes;
+        try
+        {
+            bytes = readBody(bodyPath);
+        }
+        catch (IOException e)
+        {
+            // Fail-closed: the pointer exists, so a checkpoint was taken; a
+            // transient read failure is not the same as "no checkpoint".
+            // Abort and let recovery retry rather than fresh-deploy.
+            throw new RetinaException(String.format(
+                    "Recovery aborted: pointer references %s but reading the checkpoint body failed. "
+                            + "Refusing to treat a transient read error as a fresh deployment.", bodyPath), e);
+        }
+        Body body;
+        try
+        {
+            body = Body.readFrom(bytes);
+        }
+        catch (RetinaException e)
+        {
+            // Fail-closed: a corrupted/truncated body (e.g. a half-written
+            // body from a power loss) must not be misread as "no checkpoint".
+            throw new RetinaException(String.format(
+                    "Recovery aborted: checkpoint body %s is corrupted/unreadable. "
+                            + "Operator intervention required; refusing to fresh-deploy over a damaged checkpoint.",
+                    bodyPath), e);
+        }
+        // Fail-closed: configuration changed since last checkpoint. Abort
+        // recovery and let the operator intervene rather than rebuild with
+        // a stale vnode mapping.
+        if (body.getVirtualNodesPerNode() != virtualNodesPerNode)
+        {
+            throw new RetinaException(String.format(
+                    "Recovery aborted: body %s was written with node.virtual.num=%d, current=%d. "
+                            + "Configuration changed since last checkpoint; refusing to recover with stale vnode mapping.",
+                    bodyPath, body.getVirtualNodesPerNode(), virtualNodesPerNode));
+        }
+        // Fail-closed: the pointer named this body, so it must be usable.
+        // A mismatched node id or illegal timestamp is corruption, not a
+        // fresh-deployment signal.
+        ensureAcceptable(body, bodyPath);
+        return new LoadedCheckpoint(bodyPath, body);
+    }
+
+    /**
+     * Read the current pointer value from etcd.
+     *
+     * @return the body path stored under {@code pointerKey}, or {@code null}
+     *         when etcd returns no entry for the key or the value is empty
+     */
+    private String readPointer()
+    {
+        KeyValue kv = etcd.getKeyValue(pointerKey);
+        if (kv == null)
+        {
+            return null;
+        }
+        String value = kv.getValue().toString(StandardCharsets.UTF_8);
+        // Normalise an empty value to null so callers have one "absent" signal.
+        return value.isEmpty() ? null : value;
+    }
+
+    /**
+     * Read the whole body file at {@code path} into memory.
+     *
+     * @throws IOException if the file cannot be opened or read, or if it
+     *         contains zero bytes
+     */
+    private byte[] readBody(String path) throws IOException
+    {
+        ByteArrayOutputStream buf = new ByteArrayOutputStream();
+        try (DataInputStream in = storage.open(path))
+        {
+            // Drain the stream in 8 KiB chunks until end of stream (-1).
+            byte[] chunk = new byte[8192];
+            int n;
+            while ((n = in.read(chunk)) != -1)
+            {
+                buf.write(chunk, 0, n);
+            }
+        }
+        byte[] result = buf.toByteArray();
+        if (result.length == 0)
+        {
+            throw new IOException("empty body file at " + path);
+        }
+        return result;
+    }
+
+    /**
+     * Header-level sanity checks on a parsed body: its {@code retinaNodeId}
+     * must equal this instance's, and {@code checkpointAppliedTs} must not be
+     * negative.
+     *
+     * @throws RetinaException if either check fails
+     */
+    private void ensureAcceptable(Body body, String bodyPath) throws RetinaException
+    {
+        if (!retinaNodeId.equals(body.getRetinaNodeId()))
+        {
+            throw new RetinaException(String.format(
+                    "Recovery aborted: body %s retinaNodeId='%s' does not match expected '%s'. "
+                            + "Pointer references a body for a different node; refusing to fresh-deploy.",
+                    bodyPath, body.getRetinaNodeId(), retinaNodeId));
+        }
+        if (body.getCheckpointAppliedTs() < 0)
+        {
+            throw new RetinaException(String.format(
+                    "Recovery aborted: body %s has illegal checkpointAppliedTs=%d. "
+                            + "Corrupted checkpoint; refusing to fresh-deploy.",
+                    bodyPath, body.getCheckpointAppliedTs()));
+        }
+    }
+}
diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java
index 2eeb97f015..d22563b0e0 100644
--- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java
+++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/RetinaResourceManager.java
@@ -22,9 +22,15 @@
 import com.google.protobuf.ByteString;
 import io.pixelsdb.pixels.common.exception.RetinaException;
 import io.pixelsdb.pixels.common.exception.TransException;
+import io.pixelsdb.pixels.common.index.service.IndexService;
+import io.pixelsdb.pixels.common.index.service.IndexServiceProvider;
 import io.pixelsdb.pixels.common.metadata.MetadataService;
 import io.pixelsdb.pixels.common.metadata.domain.Column;
+import io.pixelsdb.pixels.common.metadata.domain.File;
 import io.pixelsdb.pixels.common.metadata.domain.Layout;
+import io.pixelsdb.pixels.common.metadata.domain.Path;
+import io.pixelsdb.pixels.common.metadata.domain.Schema;
+import io.pixelsdb.pixels.common.metadata.domain.Table;
 import io.pixelsdb.pixels.common.physical.PhysicalReader;
 import io.pixelsdb.pixels.common.physical.PhysicalReaderUtil;
 import io.pixelsdb.pixels.common.physical.Storage;
@@ -38,20 +44,23 @@
 import io.pixelsdb.pixels.core.TypeDescription;
 import io.pixelsdb.pixels.core.encoding.EncodingLevel;
 import io.pixelsdb.pixels.index.IndexProto;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-
+import io.pixelsdb.pixels.retina.RecoveryCheckpoint.PendingSegmentEntry;
+import io.pixelsdb.pixels.retina.RecoveryCheckpoint.VisibilityEntry;
 import java.io.*;
-
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
 import
java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.stream.Collectors; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + /** * Use the singleton pattern to manage data resources in the retina service. @@ -59,25 +68,34 @@ public class RetinaResourceManager { private static final Logger logger = LogManager.getLogger(RetinaResourceManager.class); + private final MetadataService metadataService; + private final IndexService indexService; private final Map rgVisibilityMap; private final Map> pixelsWriteBufferMap; private String retinaHostName; // GC related fields private final ScheduledExecutorService gcExecutor; - private final boolean storageGcEnabled; + private final AtomicBoolean gcScheduled; + private final StorageGcWal storageGcWal; private final StorageGarbageCollector storageGarbageCollector; + // Initialised by startBackgroundGc(); recovery checkpoint publication + // is part of every GC cycle once the scheduler is running. Null until + // then so unit/integration tests that never start the scheduler are + // unaffected. + private RecoveryCheckpoint recoveryCheckpoint; - // Checkpoint related fields - private final ExecutorService checkpointExecutor; - private final Map offloadedCheckpoints; - private final Map> checkpointFutures; - private final String checkpointDir; private volatile long latestGcTimestamp = -1; private final int totalVirtualNodeNum; - private final Map checkpointRefCounts; + // whether in the RECOVERING lifecycle state + private volatile boolean recovering = false; + + // Offload checkpoint state (see "Offload Checkpoint Section" at the bottom of this file). 
+ private final String offloadCheckpointDir; + private final ExecutorService offloadCheckpointExecutor; + private final Map offloadCheckpoints = new ConcurrentHashMap<>(); // Dual-write: oldFileId → result AND newFileId → result in a single map. // Direction is distinguished by checking fileId == result.newFileId. @@ -124,63 +142,56 @@ static final class RetiredFile } } - private enum CheckpointType + /** + * Holds the {@link Path} for a retired file and the owning tableId resolved + * from its enclosing {@link Layout}. + */ + private static final class RetiredPathInfo { - GC, - OFFLOAD + final Path path; + final long tableId; + + private RetiredPathInfo(Path path, long tableId) + { + this.path = path; + this.tableId = tableId; + } } private RetinaResourceManager() { this.metadataService = MetadataService.Instance(); + this.indexService = IndexServiceProvider.getService(IndexServiceProvider.ServiceMode.local); this.rgVisibilityMap = new ConcurrentHashMap<>(); this.pixelsWriteBufferMap = new ConcurrentHashMap<>(); - this.offloadedCheckpoints = new ConcurrentHashMap<>(); - this.checkpointFutures = new ConcurrentHashMap<>(); ConfigFactory config = ConfigFactory.Instance(); - this.checkpointRefCounts = new ConcurrentHashMap<>(); - this.checkpointDir = config.getProperty("retina.checkpoint.dir"); - - int cpThreads = Integer.parseInt(config.getProperty("retina.checkpoint.threads")); - this.checkpointExecutor = Executors.newFixedThreadPool(cpThreads, r -> { - Thread t = new Thread(r, "retina-checkpoint-thread"); - t.setDaemon(true); - return t; - }); - - ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(r -> { + this.gcExecutor = Executors.newSingleThreadScheduledExecutor(r -> { Thread t = new Thread(r, "retina-gc-thread"); t.setDaemon(true); return t; }); - try - { - long interval = Long.parseLong(config.getProperty("retina.gc.interval")); - if (interval > 0) - { - executor.scheduleAtFixedRate( - this::runGC, - interval, - interval, - 
TimeUnit.SECONDS - ); - } - } catch (Exception e) - { - logger.error("Failed to start retina background gc", e); - } - this.gcExecutor = executor; + this.gcScheduled = new AtomicBoolean(false); totalVirtualNodeNum = Integer.parseInt(ConfigFactory.Instance().getProperty("node.virtual.num")); this.retinaHostName = NetUtils.getLocalHostName(); - boolean gcEnabled = false; + this.offloadCheckpointDir = config.getProperty("retina.offload.checkpoint.dir"); + this.offloadCheckpointExecutor = Executors.newFixedThreadPool( + Integer.parseInt(config.getProperty("retina.offload.checkpoint.threads")), + r -> { + Thread t = new Thread(r, "retina-checkpoint-thread"); + t.setDaemon(true); + return t; + }); + + this.storageGcWal = new StorageGcWal(); + StorageGarbageCollector gc = null; try { - gcEnabled = Boolean.parseBoolean(config.getProperty("retina.storage.gc.enabled")); - if (gcEnabled) + boolean storageGcEnabled = Boolean.parseBoolean(config.getProperty("retina.storage.gc.enabled")); + if (storageGcEnabled) { double threshold = Double.parseDouble(config.getProperty("retina.storage.gc.threshold")); long targetFileSize = Long.parseLong(config.getProperty("retina.storage.gc.target.file.size")); @@ -190,9 +201,9 @@ private RetinaResourceManager() EncodingLevel encodingLevel = EncodingLevel.from( Integer.parseInt(config.getProperty("retina.storage.gc.encoding.level"))); long retireDelayMs = (long) (Double.parseDouble(config.getProperty("retina.storage.gc.file.retire.delay.hours")) * 3_600_000L); - gc = new StorageGarbageCollector(this, this.metadataService, + gc = new StorageGarbageCollector(this, this.metadataService, this.indexService, threshold, targetFileSize, maxFilesPerGroup, maxGroups, - rowGroupSize, encodingLevel, retireDelayMs); + rowGroupSize, encodingLevel, retireDelayMs, storageGcWal); logger.info("Storage GC enabled (threshold={}, targetFileSize={}, maxFilesPerGroup={}, maxGroups={})", threshold, targetFileSize, maxFilesPerGroup, maxGroups); } @@ -200,13 +211,16 @@ 
private RetinaResourceManager() catch (Exception e) { logger.error("Failed to initialise StorageGarbageCollector, Storage GC will be disabled", e); - gcEnabled = false; gc = null; } - this.storageGcEnabled = gcEnabled; this.storageGarbageCollector = gc; } + public StorageGcWal getStorageGcWal() + { + return storageGcWal; + } + private static final class InstanceHolder { private static final RetinaResourceManager instance = new RetinaResourceManager(); @@ -217,13 +231,88 @@ public static RetinaResourceManager Instance() return InstanceHolder.instance; } + /** + * Starts the periodic Retina GC scheduler after the service has reached + * the lifecycle point where background cleanup is safe to run. + * + *

The constructor intentionally does not schedule GC: startup must + * stay fail-closed until initialization succeeds, otherwise a background + * GC tick could observe partially constructed state. This method is + * idempotent so callers that wire it into a service-ready hook can + * invoke it more than once safely.

+ * + * @throws RetinaException if GC configuration is invalid or the scheduler cannot be started. + */ + public void startBackgroundGc() throws RetinaException + { + long interval; + try + { + interval = Long.parseLong(ConfigFactory.Instance().getProperty("retina.gc.interval")); + } + catch (Exception e) + { + throw new RetinaException("Invalid retina GC interval configuration", e); + } + + if (interval <= 0) + { + logger.info("Retina background GC is disabled"); + return; + } + + if (!this.gcScheduled.compareAndSet(false, true)) + { + logger.debug("Retina background GC scheduler has already been started"); + return; + } + + // Fail-closed: recovery checkpoint is a durability primitive. If we + // cannot construct it (missing/unreadable config, unreachable etcd + // or storage backend), refuse to start the GC scheduler rather than + // silently run without crash recovery. + this.recoveryCheckpoint = RecoveryCheckpoint.createFromConfig(); + + try + { + this.gcExecutor.scheduleAtFixedRate( + this::runGC, + interval, + interval, + TimeUnit.SECONDS + ); + logger.info("Retina background GC scheduler started with interval {} seconds", interval); + } + catch (RuntimeException e) + { + this.gcScheduled.set(false); + throw new RetinaException("Failed to start retina background GC", e); + } + } + + public boolean isBackgroundGcStarted() + { + return this.gcScheduled.get(); + } + + public void setRecovering(boolean recovering) + { + this.recovering = recovering; + } + public void addVisibility(long fileId, int rgId, int recordNum, long timestamp, long[] bitmap, boolean overwrite) { String rgKey = RetinaUtils.buildRgKey(fileId, rgId); if (overwrite) { - rgVisibilityMap.put(rgKey, new RGVisibility(recordNum, timestamp, bitmap)); + RGVisibility prev = rgVisibilityMap.put(rgKey, new RGVisibility(recordNum, timestamp, bitmap)); + if (prev != null) + { + // RGVisibility holds a native handle; the replaced instance must be + // closed explicitly or its off-heap memory leaks. 
+ prev.close(); + } } else { @@ -260,10 +349,28 @@ public void addVisibility(String filePath) throws RetinaException } } + public void removeVisibility(long fileId) + { + String prefix = fileId + "_"; + this.rgVisibilityMap.entrySet().removeIf(entry -> + { + if (!entry.getKey().startsWith(prefix)) + { + return false; + } + RGVisibility rgVisibility = entry.getValue(); + if (rgVisibility != null) + { + rgVisibility.close(); + } + return true; + }); + } + public long[] queryVisibility(long fileId, int rgId, long timestamp, long transId) throws RetinaException { // read from memory - RGVisibility rgVisibility = checkRGVisibility(fileId, rgId); + RGVisibility rgVisibility = checkRGVisibility(fileId, rgId, false); long[] visibilityBitmap = rgVisibility.getVisibilityBitmap(timestamp); if (visibilityBitmap == null) { @@ -278,325 +385,171 @@ public long[] queryVisibility(long fileId, int rgId, long timestamp) throws Reti return queryVisibility(fileId, rgId, timestamp, -1); } + public void reclaimVisibility(long fileId, int rgId, long timestamp) throws RetinaException + { + String retinaKey = RetinaUtils.buildRgKey(fileId, rgId); + RGVisibility rgVisibility = this.rgVisibilityMap.remove(retinaKey); + if (rgVisibility != null) + { + rgVisibility.close(); + } + } + /** - * Long-running queries register an "Offload" status and ensure that - * the required visibility checkpoint is correctly created and manages. - * For long-running transactions, newly written data is not required. - * Therefore, even if checkpoints are created under the same timestamp - * and only one copy is retained, this has virtually no impact on queries. - * - * @param timestamp - * @throws RetinaException + * Enqueues an old file for delayed cleanup after a configurable wall-clock + * grace period has elapsed. 
*/ - public void registerOffload(long timestamp) throws RetinaException + public void scheduleRetiredFile(RetiredFile retiredFile) { - AtomicInteger refCount = checkpointRefCounts.computeIfAbsent(timestamp, k -> new AtomicInteger(0)); - CompletableFuture future; + retiredFiles.add(retiredFile); + } - synchronized (refCount) + /** + * Processes retired files from both the in-memory queue and durable catalog. + * The catalog scan makes delayed cleanup retryable after process restart. + */ + public void processRetiredFiles() + { + // In-memory queue for files retired in this process. + long now = System.currentTimeMillis(); + retiredFiles.removeIf(rf -> { - refCount.incrementAndGet(); - - // If checkpoint already exists and is fully committed, just return - if (offloadedCheckpoints.containsKey(timestamp)) - { - logger.info("Registered offload for Timestamp: {} (already exists)", timestamp); - return; - } - - // Check if there is an existing future - future = checkpointFutures.get(timestamp); - if (future != null && future.isCompletedExceptionally()) - { - // If previous attempt failed, remove it so we can retry - checkpointFutures.remove(timestamp, future); - future = null; - } - - if (future == null) + if (now <= rf.retireTimestamp) { - future = checkpointFutures.computeIfAbsent(timestamp, k -> { - try - { - return createCheckpoint(timestamp, CheckpointType.OFFLOAD); - } catch (RetinaException e) - { - throw new CompletionException(e); - } - }); + return false; } - } + boolean success = cleanupRetiredFile(rf.fileId, rf.rgCount, rf.filePath, + -1L, -1L, 0); + return success; + }); + // Durable catalog retry for files left RETIRED across restart. 
+ List dueFiles = Collections.emptyList(); try { - future.join(); - logger.info("Registered offload for Timestamp: {}", timestamp); - } catch (Exception e) + dueFiles = metadataService.listRetiredFilesDue(); + } + catch (Exception e) { - synchronized (refCount) - { - refCount.decrementAndGet(); - // We don't remove from checkpointFutures here anymore, - // because it's handled above in the synchronized block for retries - // or let the next caller handle it. - } - throw new RetinaException("Failed to create checkpoint for timestamp: " + timestamp, e); + logger.warn("processRetiredFiles: failed to list due RETIRED files", e); } - } - - public void unregisterOffload(long timestamp) - { - AtomicInteger refCount = checkpointRefCounts.get(timestamp); - if (refCount != null) + if (!dueFiles.isEmpty()) { - synchronized (refCount) + Map pathsById = new HashMap<>(); + try { - int remaining = refCount.decrementAndGet(); - if (remaining <= 0) + for (Schema schema : metadataService.getSchemas()) { - offloadedCheckpoints.remove(timestamp); - checkpointFutures.remove(timestamp); - if (refCount.get() > 0) + for (Table table : metadataService.getTables(schema.getName())) { - logger.info("Checkpoint resurrection detected, skipping deletion. 
TS: {}", timestamp); - return; + for (Layout layout : metadataService.getLayouts(schema.getName(), table.getName())) + { + long tableId = layout.getTableId(); + for (Path path : layout.getOrderedPaths()) + { + pathsById.put(path.getId(), new RetiredPathInfo(path, tableId)); + } + } } - removeCheckpointFile(timestamp, CheckpointType.OFFLOAD); - checkpointRefCounts.remove(timestamp); - logger.info("Offload checkpoint for timestamp {} removed.", timestamp); } } - } - } - - private CompletableFuture createCheckpoint(long timestamp, CheckpointType type) throws RetinaException - { - return createCheckpoint(timestamp, type, null); - } - - private CompletableFuture createCheckpoint( - long timestamp, CheckpointType type, Map precomputedBitmaps) throws RetinaException - { - String prefix = (type == CheckpointType.GC) ? RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; - String filePath = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); - - // 1. Capture current entries to ensure we process a consistent set of RGs - List> entries = new ArrayList<>(this.rgVisibilityMap.entrySet()); - int totalRgs = entries.size(); - logger.info("Starting {} checkpoint for {} RGs at timestamp {}", type, totalRgs, timestamp); - - // 2. Use a BlockingQueue for producer-consumer pattern - BlockingQueue queue = new LinkedBlockingQueue<>(1024); - - // 3. 
Start producer tasks to fetch bitmaps - for (Map.Entry entry : entries) - { - checkpointExecutor.submit(() -> { - try - { - String key = entry.getKey(); - long fileId = RetinaUtils.parseFileIdFromRgKey(key); - int rgId = RetinaUtils.parseRgIdFromRgKey(key); - RGVisibility rgVisibility = entry.getValue(); - long[] bitmap; - if (precomputedBitmaps != null && precomputedBitmaps.containsKey(key)) - { - bitmap = precomputedBitmaps.get(key); - } else - { - bitmap = rgVisibility.getVisibilityBitmap(timestamp); - } - queue.put(new CheckpointFileIO.CheckpointEntry(fileId, rgId, (int) rgVisibility.getRecordNum(), bitmap)); - } catch (Exception e) + catch (Exception e) + { + logger.warn("processRetiredFiles: failed to load path map for retired cleanup", e); + } + for (File file : dueFiles) + { + RetiredPathInfo pathInfo = pathsById.get(file.getPathId()); + if (pathInfo == null) { - logger.error("Failed to fetch visibility bitmap for checkpoint", e); + logger.warn("processRetiredFiles: pathId={} for retired fileId={} is not available; will retry later", + file.getPathId(), file.getId()); + continue; } - }); - } - - // 4. Async Write: perform IO in background thread (Consumer). - // Use commonPool to avoid deadlocks with checkpointExecutor. - // Concurrency safety: for OFFLOAD type, registerOffload() guarantees at most - // one future per timestamp via synchronized(refCount) + checkpointFutures.computeIfAbsent. - // For GC type, runGC() is single-threaded. No file-level locking is needed here. 
- return CompletableFuture.runAsync(() -> { - long startWrite = System.currentTimeMillis(); - try - { - CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); - long endWrite = System.currentTimeMillis(); - logger.info("Writing {} checkpoint file to {} took {} ms", type, filePath, (endWrite - startWrite)); - - if (type == CheckpointType.OFFLOAD) + String filePath = File.getFilePath(pathInfo.path, file); + long rowCount = file.getMaxRowId() - file.getMinRowId() + 1; + if (!cleanupRetiredFile(file.getId(), file.getNumRowGroup(), filePath, + pathInfo.tableId, file.getMinRowId(), rowCount)) { - offloadedCheckpoints.put(timestamp, filePath); + continue; } - } catch (Exception e) - { - logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); try { - StorageFactory.Instance().getStorage(filePath).delete(filePath, false); - } catch (IOException ignored) + metadataService.deleteFiles(Collections.singletonList(file.getId())); + } + catch (Exception e) { + logger.warn("processRetiredFiles: failed to delete retired catalog fileId={}", file.getId(), e); } - throw new CompletionException(e); } - }); + } } - /** - * Writes a checkpoint from pre-built {@link CheckpointFileIO.CheckpointEntry} objects, - * bypassing the {@code rgVisibilityMap} traversal and per-entry thread-pool submission - * that the other {@code createCheckpoint} overload performs. - * - *

This is used by {@link #runGC()} when the entries have already been constructed - * during the Memory GC single-pass, avoiding a redundant second traversal of - * {@code rgVisibilityMap}. - */ - private CompletableFuture createCheckpointDirect( - long timestamp, CheckpointType type, - List preBuiltEntries) throws RetinaException + private boolean cleanupRetiredFile(long fileId, int rgCount, String filePath, + long tableId, long rowIdStart, long rowCount) { - String prefix = (type == CheckpointType.GC) ? RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; - String filePath = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); - - int totalRgs = preBuiltEntries.size(); - logger.info("Starting {} checkpoint (direct) for {} RGs at timestamp {}", type, totalRgs, timestamp); - - BlockingQueue queue = new LinkedBlockingQueue<>(1024); - - // Feed pre-built entries into the queue via the checkpoint executor so that the - // producer-consumer pattern with the writer thread is preserved (the queue has a - // bounded capacity of 1024, so this may block and must not run on the caller thread). 
- checkpointExecutor.submit(() -> { + boolean success = true; + for (int rgId = 0; rgId < rgCount; rgId++) + { try { - for (CheckpointFileIO.CheckpointEntry entry : preBuiltEntries) - { - queue.put(entry); - } + reclaimVisibility(fileId, rgId, 0); } - catch (InterruptedException e) + catch (Exception e) { - Thread.currentThread().interrupt(); - logger.error("Interrupted while feeding pre-built checkpoint entries", e); + success = false; + logger.warn("processRetiredFiles: failed to reclaim Visibility for fileId={}, rgId={}", + fileId, rgId, e); } - }); - - return CompletableFuture.runAsync(() -> { + } + if (tableId > 0 && rowIdStart >= 0 && rowCount > 0 && rowCount <= Integer.MAX_VALUE) + { try { - CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); - - if (type == CheckpointType.OFFLOAD) - { - offloadedCheckpoints.put(timestamp, filePath); - } + indexService.deleteMainIndexRange(tableId, fileId, rowIdStart, (int) rowCount); } catch (Exception e) { - logger.error("Failed to commit {} checkpoint file for timestamp: {}", type, timestamp, e); - try - { - StorageFactory.Instance().getStorage(filePath).delete(filePath, false); - } - catch (IOException ignored) - { - } - throw new CompletionException(e); + success = false; + logger.warn("processRetiredFiles: failed to delete MainIndex range for tableId={}, fileId={}", + tableId, fileId, e); } - }); - } - - private void removeCheckpointFile(long timestamp, CheckpointType type) - { - String prefix = (type == CheckpointType.GC) ? 
RetinaUtils.CHECKPOINT_PREFIX_GC : RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD; - String path = RetinaUtils.buildCheckpointPath(checkpointDir, prefix, retinaHostName, timestamp); - - try - { - StorageFactory.Instance().getStorage(path).delete(path, false); - } catch (IOException e) - { - logger.warn("Failed to delete checkpoint file", e); } - } - - public void reclaimVisibility(long fileId, int rgId, long timestamp) throws RetinaException - { - String retinaKey = RetinaUtils.buildRgKey(fileId, rgId); - RGVisibility rgVisibility = this.rgVisibilityMap.remove(retinaKey); - if (rgVisibility != null) + if (filePath != null) { - rgVisibility.close(); - } - } - - /** - * Enqueues an old file for delayed cleanup after a configurable wall-clock - * grace period has elapsed. - */ - public void scheduleRetiredFile(RetiredFile retiredFile) - { - retiredFiles.add(retiredFile); - } - - /** - * Processes the retired files queue: for each file whose wall-clock - * {@code retireTimestamp} deadline has passed, removes its Visibility - * entries and deletes the physical file. - */ - public void processRetiredFiles() - { - long now = System.currentTimeMillis(); - retiredFiles.removeIf(rf -> - { - if (now <= rf.retireTimestamp) - { - return false; - } - for (int rgId = 0; rgId < rf.rgCount; rgId++) + try { - try - { - reclaimVisibility(rf.fileId, rgId, 0); - } - catch (Exception e) + Storage storage = StorageFactory.Instance().getStorage(filePath); + if (storage.exists(filePath)) { - logger.warn("processRetiredFiles: failed to reclaim Visibility for fileId={}, rgId={}", - rf.fileId, rgId, e); + storage.delete(filePath, false); } } - // Old MainIndex entries for retired files are purged lazily by the - // MainIndex implementation; no explicit cleanup is needed here. 
- if (rf.filePath != null) + catch (IOException e) { - try - { - Storage storage = StorageFactory.Instance().getStorage(rf.filePath); - if (storage.exists(rf.filePath)) - { - storage.delete(rf.filePath, false); - } - } - catch (IOException e) - { - logger.warn("processRetiredFiles: failed to delete physical file {}", rf.filePath, e); - } + success = false; + logger.warn("processRetiredFiles: failed to delete physical file {}", filePath, e); } - return true; - }); + } + return success; } - public String getCheckpointPath(long timestamp) + public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp) throws RetinaException { - return offloadedCheckpoints.get(timestamp); + deleteRecord(fileId, rgId, rgRowOffset, timestamp, RGVisibility.ReplayMode.NORMAL); } - public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp) throws RetinaException + public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp, + RGVisibility.ReplayMode replayMode) throws RetinaException { - checkRGVisibility(fileId, rgId).deleteRecord(rgRowOffset, timestamp); + RGVisibility rgVisibility = checkRGVisibility(fileId, rgId, true); + if (rgVisibility == null) + { + // Recovery-replay best-effort no-op: nothing to delete and, since + // dual-write is never active during recovery, no propagation either. 
+ return; + } + rgVisibility.deleteRecord(rgRowOffset, timestamp, replayMode); if (!isDualWriteActive) { @@ -623,7 +576,7 @@ public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp) int oldGlobal = bwdMapping[rgRowOffset]; int oldRgId = rgIdForGlobalRowOffset(oldGlobal, bwd.oldFileRgRowStart); int oldRgOff = oldGlobal - bwd.oldFileRgRowStart[oldRgId]; - checkRGVisibility(bwd.oldFileId, oldRgId).deleteRecord(oldRgOff, timestamp); + checkRGVisibility(bwd.oldFileId, oldRgId, false).deleteRecord(oldRgOff, timestamp, replayMode); } } } @@ -637,7 +590,7 @@ public void deleteRecord(long fileId, int rgId, int rgRowOffset, long timestamp) int newGlobal = fwdMapping[rgRowOffset]; int newRgId = rgIdForGlobalRowOffset(newGlobal, result.newFileRgRowStart); int newRgOff = newGlobal - result.newFileRgRowStart[newRgId]; - checkRGVisibility(result.newFileId, newRgId).deleteRecord(newRgOff, timestamp); + checkRGVisibility(result.newFileId, newRgId, false).deleteRecord(newRgOff, timestamp, replayMode); } } } @@ -652,6 +605,14 @@ public void deleteRecord(IndexProto.RowLocation rowLocation, long timestamp) thr deleteRecord(rowLocation.getFileId(), rowLocation.getRgId(), rowLocation.getRgRowOffset(), timestamp); } + public void deleteRecord(IndexProto.RowLocation rowLocation, long timestamp, + RGVisibility.ReplayMode replayMode) + throws RetinaException + { + deleteRecord(rowLocation.getFileId(), rowLocation.getRgId(), rowLocation.getRgRowOffset(), + timestamp, replayMode); + } + /** * Registers dual-write redirection so that {@link #deleteRecord} propagates * deletes between old and new files. 
The write lock acts as a barrier: all @@ -699,12 +660,12 @@ void unregisterDualWrite(StorageGarbageCollector.RewriteResult result) long[] exportChainItemsAfter(long fileId, int rgId, long safeGcTs) throws RetinaException { - return checkRGVisibility(fileId, rgId).exportChainItemsAfter(safeGcTs); + return checkRGVisibility(fileId, rgId, false).exportChainItemsAfter(safeGcTs); } void importDeletionChain(long fileId, int rgId, long[] items) throws RetinaException { - checkRGVisibility(fileId, rgId).importDeletionChain(items); + checkRGVisibility(fileId, rgId, false).importDeletionChain(items); } public void addWriteBuffer(String schemaName, String tableName) throws RetinaException @@ -741,7 +702,9 @@ public void addWriteBuffer(String schemaName, String tableName) throws RetinaExc } } - public IndexProto.PrimaryIndexEntry.Builder insertRecord(String schemaName, String tableName, byte[][] colValues, long timestamp, int vNodeId) throws RetinaException + public IndexProto.PrimaryIndexEntry.Builder insertRecord(String schemaName, String tableName, + byte[][] colValues, long timestamp, + int vNodeId) throws RetinaException { IndexProto.PrimaryIndexEntry.Builder builder = IndexProto.PrimaryIndexEntry.newBuilder(); PixelsWriteBuffer writeBuffer = checkPixelsWriteBuffer(schemaName, tableName, vNodeId); @@ -751,17 +714,18 @@ public IndexProto.PrimaryIndexEntry.Builder insertRecord(String schemaName, Stri private RetinaProto.VisibilityBitmap getVisibilityBitmapSlice(long[] visibilityBitmap, long startIndex, int length) throws RetinaException { - if (startIndex % 64 != 0 || length % 64 != 0) + if (startIndex % 64 != 0) { - throw new RetinaException("StartIndex and length must be multiple of 64"); + throw new RetinaException("StartIndex must be multiple of 64"); } - if (length == 0) + if (length <= 0) { return RetinaProto.VisibilityBitmap.newBuilder().build(); } + int alignedLength = ((length + 63) / 64) * 64; int startLongIndex = (int) (startIndex / 64); - int endLongIndex = 
startLongIndex + (length / 64); + int endLongIndex = startLongIndex + (alignedLength / 64); if (visibilityBitmap == null || endLongIndex > visibilityBitmap.length) { @@ -787,13 +751,13 @@ public RetinaProto.GetWriteBufferResponse.Builder getWriteBuffer(String schemaNa Set fileIds = new HashSet<>(); - // active memTable returns directly - if (!activeMemtable.getRowBatch().isEmpty()) + // Active memTable returns its full appended rows; visibility is masked + // downstream by the RGVisibility bitmap slice below. + int activeSize = activeMemtable.getSize(); + if (activeSize > 0) { - ByteString data = ByteString.copyFrom(activeMemtable.getRowBatch().serialize()); + ByteString data = ByteString.copyFrom(activeMemtable.serialize()); responseBuilder.setData(data); - - fileIds.add(activeMemtable.getFileId()); } else { responseBuilder.setData(ByteString.EMPTY); @@ -804,8 +768,11 @@ public RetinaProto.GetWriteBufferResponse.Builder getWriteBuffer(String schemaNa fileIds.add(activeMemtable.getFileId()); for (MemTable immutableMemtable : immutableMemTables) { - fileIds.add(immutableMemtable.getFileId()); - ids.add(immutableMemtable.getId()); + if (!immutableMemtable.isEmpty()) + { + fileIds.add(immutableMemtable.getFileId()); + ids.add(immutableMemtable.getId()); + } } for (ObjectEntry objectEntry : objectEntries) { @@ -822,21 +789,25 @@ public RetinaProto.GetWriteBufferResponse.Builder getWriteBuffer(String schemaNa fileIdToVisibility.put(fileId, visibility); } - // only return the corresponding part of bitmap - if (!activeMemtable.getRowBatch().isEmpty()) + // only return the corresponding visible part of bitmap + if (activeSize > 0) { responseBuilder.addBitmaps(getVisibilityBitmapSlice( fileIdToVisibility.get(activeMemtable.getFileId()), - activeMemtable.getStartIndex(), activeMemtable.getLength())); + activeMemtable.getStartIndex(), activeSize)); } else { responseBuilder.addBitmaps(RetinaProto.VisibilityBitmap.newBuilder()); } for (MemTable immutableMemtable : 
immutableMemTables) { - responseBuilder.addBitmaps(getVisibilityBitmapSlice( - fileIdToVisibility.get(immutableMemtable.getFileId()), - immutableMemtable.getStartIndex(), immutableMemtable.getLength())); + int immutableSize = immutableMemtable.getSize(); + if (immutableSize > 0) + { + responseBuilder.addBitmaps(getVisibilityBitmapSlice( + fileIdToVisibility.get(immutableMemtable.getFileId()), + immutableMemtable.getStartIndex(), immutableSize)); + } } for (ObjectEntry objectEntry : objectEntries) { @@ -856,14 +827,21 @@ public RetinaProto.GetWriteBufferResponse.Builder getWriteBuffer(String schemaNa * * @param fileId the file id. * @param rgId the row group id. + * @param missingTolerant if true, returns null instead of throwing if the RGVisibility is not found * @throws RetinaException if the retina does not exist. */ - private RGVisibility checkRGVisibility(long fileId, int rgId) throws RetinaException + private RGVisibility checkRGVisibility(long fileId, int rgId, boolean missingTolerant) throws RetinaException { String retinaKey = RetinaUtils.buildRgKey(fileId, rgId); RGVisibility rgVisibility = this.rgVisibilityMap.get(retinaKey); if (rgVisibility == null) { + if (missingTolerant && recovering) + { + logger.debug("Recovery delete no-op: RGVisibility not loaded for fileId={}, rgId={} " + + "(durable index resolved to a non-baseline file)", fileId, rgId); + return null; + } throw new RetinaException(String.format("RGVisibility not found for fileId: %s, rgId: %s", fileId, rgId)); } return rgVisibility; @@ -908,31 +886,29 @@ private PixelsWriteBuffer checkPixelsWriteBuffer(String schema, String table, in } /** - * Run a full GC cycle: Memory GC → checkpoint → Storage GC. + * Run a full GC cycle: Memory GC → Storage GC → Recovery Checkpoint. * *

Ordering rationale: *

    *
  1. Memory GC first: {@code collectTileGarbage} compacts Deletion Chain blocks - * whose last item ts ≤ lwm into {@code baseBitmap}. After compaction, the remaining - * chain starts at the first block that straddles the lwm boundary, so the subsequent - * {@code getVisibilityBitmap(lwm)} call traverses at most one partial block - * (≤ {@code BLOCK_CAPACITY} items) instead of the entire pre-GC chain. This makes - * checkpoint bitmap serialisation significantly cheaper.
  2. - *
  3. Checkpoint second, unconditional and blocking: written regardless of whether - * Storage GC finds any candidate files. The {@code .join()} ensures the checkpoint - * file is fully on disk before Storage GC begins rewriting any files, so crash - * recovery can always restore the post-Memory-GC visibility state independently of - * any in-progress Storage GC rewrite. {@code gcExecutor} is single-threaded, so the - * blocking join is also the simplest way to guarantee no two GC cycles overlap.
  4. - *
  5. Storage GC third: requires an up-to-date {@code baseBitmap} (hence after - * Memory GC) and its own WAL for crash recovery. Placing it after the checkpoint - * keeps the two recovery paths independent: on restart, the GC checkpoint restores - * the post-Memory-GC visibility state, and the GcWal resumes any in-progress Storage - * GC task separately. Once scan completes, bitmaps for non-candidate files are - * immediately released from memory (they are no longer needed by subsequent phases).
  6. - *
  7. Advance {@code latestGcTimestamp} last: updated only after the entire cycle - * succeeds (Memory GC + checkpoint + Storage GC). If any step throws, the timestamp - * is not advanced and the next scheduled invocation will retry the full cycle.
  8. + * whose last item ts ≤ the safe folding timestamp into {@code baseBitmap}. After compaction, + * the remaining chain starts at the first block that straddles that boundary, so the subsequent + * {@code getVisibilityBitmap(timestamp)} call traverses at most one partial block + * (≤ {@code BLOCK_CAPACITY} items) instead of the entire pre-GC chain. The same pass + * also captures one {@link VisibilityEntry} per RG by reusing the post-fold bitmap, + * so Recovery Checkpoint does not have to traverse RGVisibility a second time. + *
  9. Storage GC second: requires an up-to-date {@code baseBitmap} (hence after + * Memory GC) and its own WAL to resume in-progress tasks after a crash. Once scan + * completes, bitmaps for non-candidate files are immediately released from memory + * (they are no longer needed by subsequent phases).
  10. + *
  11. Recovery Checkpoint third: receives the {@code rgEntries} collected in + * Step 1 plus per-scope earliest pending commit timestamps, then publishes the + * body + etcd pointer. Unlike Storage GC, a publish failure here aborts the cycle: + * the outer catch skips the {@code latestGcTimestamp} advancement, and the next + * cycle retries the full sequence so crash recovery never silently lags.
  12. + *
  13. Advance {@code latestGcTimestamp} last: updated only after Memory GC and + * Recovery Checkpoint both succeed. Storage GC failures do not block advancement + * because compaction is opportunistic.
  14. *
*/ private void runGC() @@ -942,10 +918,10 @@ private void runGC() long timestamp = 0; try { - timestamp = TransService.Instance().getSafeGcTimestamp(); + timestamp = TransService.Instance().getSafeVisibilityFoldingTimestamp(true); } catch (TransException e) { - logger.error("Error while getting safe garbage collection timestamp", e); + logger.error("Error while getting safe visibility folding timestamp", e); return; } @@ -957,22 +933,24 @@ private void runGC() try { // Step 1: Single pass over rgVisibilityMap — Memory GC + file-level stats - // aggregation + CheckpointEntry pre-building. Produces everything needed by - // checkpoint and Storage GC without any additional traversal. + // aggregation + Recovery Checkpoint entries. Produces everything needed by + // Storage GC and Recovery Checkpoint without any additional traversal of + // rgVisibilityMap or extra native-side bitmap reads. Map gcSnapshotBitmaps = new HashMap<>(); Map fileStats = new HashMap<>(); // fileId → {totalRows, totalInvalid} - List checkpointEntries = new ArrayList<>(); + List rgEntries = new ArrayList<>(this.rgVisibilityMap.size()); for (Map.Entry entry : this.rgVisibilityMap.entrySet()) { String rgKey = entry.getKey(); long fileId = RetinaUtils.parseFileIdFromRgKey(rgKey); int rgId = RetinaUtils.parseRgIdFromRgKey(rgKey); + RGVisibility rgVisibility = entry.getValue(); - long[] bitmap = entry.getValue().garbageCollect(timestamp); + long[] bitmap = rgVisibility.garbageCollect(timestamp); gcSnapshotBitmaps.put(rgKey, bitmap); - long recordNum = entry.getValue().getRecordNum(); + long recordNum = rgVisibility.getRecordNum(); long rgInvalidCount = 0; for (long word : bitmap) { @@ -990,17 +968,16 @@ private void runGC() return existing; }); - checkpointEntries.add( - new CheckpointFileIO.CheckpointEntry(fileId, rgId, (int) recordNum, bitmap)); + // Reuse the post-fold bitmap as the checkpoint entry's bitmap: it + // already reflects every delete with delete_ts <= timestamp folded + // into base, 
which is exactly what the loader needs to rebuild + // RGVisibility with an empty deletion chain. + rgEntries.add(new VisibilityEntry(fileId, rgId, (int) recordNum, timestamp, bitmap)); } - // Step 2: Checkpoint — write pre-built entries directly to disk, skipping - // the second rgVisibilityMap traversal and per-entry thread-pool submission. - createCheckpointDirect(timestamp, CheckpointType.GC, checkpointEntries).join(); - - // Step 3: Storage GC — pass file-level stats so that candidate selection + // Step 2: Storage GC — pass file-level stats so that candidate selection // uses O(1) lookups instead of per-RG aggregation loops. - if (storageGcEnabled && storageGarbageCollector != null) + if (storageGarbageCollector != null) { try { @@ -1012,105 +989,312 @@ private void runGC() } } - // Step 4: Advance the timestamp only after the full cycle succeeds. - // latestGcTimestamp is no longer updated inside createCheckpoint's async - // callback for GC type; this is the single authoritative update point. - long oldGcTs = this.latestGcTimestamp; - this.latestGcTimestamp = timestamp; - if (oldGcTs != -1 && oldGcTs != timestamp) + // Step 3: Publish a recovery checkpoint at the same timestamp the + // Memory GC just folded against, reusing the rgEntries already + // collected in Step 1. Unlike Storage GC failures (which we swallow + // because compaction is opportunistic), checkpoint publication + // failures must propagate: the outer catch will skip the + // latestGcTimestamp advancement so the next cycle retries. + if (recoveryCheckpoint != null) { - removeCheckpointFile(oldGcTs, CheckpointType.GC); + // Project per-scope earliest pending commit ts. Buffers with + // ts == Long.MAX_VALUE have no committed pending data and are + // omitted: the scope contributes nothing to recovery replay. 
+ List segments = new ArrayList<>(); + for (Map perTable : this.pixelsWriteBufferMap.values()) + { + for (PixelsWriteBuffer buffer : perTable.values()) + { + long ts = buffer.getEarliestPendingMinTs(); + if (ts != Long.MAX_VALUE) + { + segments.add(new PendingSegmentEntry(buffer.getVirtualNodeId(), ts)); + } + } + } + recoveryCheckpoint.generate(timestamp, rgEntries, segments); + + if (!rgEntries.isEmpty()) + { + // A checkpoint containing the new file makes the GC WAL task durable. + Set checkpointFileIds = rgEntries.stream() + .map(VisibilityEntry::getFileId) + .collect(Collectors.toSet()); + for (StorageGcWal.Task task : storageGcWal.listAllTasks()) + { + if (task.getState() != StorageGcWal.State.SWAPPED_NOT_CHECKPOINTED + || !checkpointFileIds.contains(task.getNewFileId())) + { + continue; + } + try + { + storageGcWal.markCheckpointed(task.getTaskId()); + logger.info("Storage GC WAL reconciled: taskId={}, newFileId={} advanced to CHECKPOINTED", + task.getTaskId(), task.getNewFileId()); + } + catch (Exception e) + { + logger.warn("Storage GC WAL reconciler failed for taskId={}, newFileId={}", + task.getTaskId(), task.getNewFileId(), e); + } + } + } } + + // Step 4: Advance the timestamp only after the full cycle succeeds. + this.latestGcTimestamp = timestamp; } catch (Exception e) { logger.error("Error while running GC", e); } } - public void recoverCheckpoints() + // ───────────────────────────────────────────────────────────────────── + // Offload Checkpoint Section + // + // Long-running queries register an "offload" status with a logical + // timestamp; this section materialises one visibility checkpoint file per + // registered timestamp and reference-counts concurrent registrations so + // that the file is created exactly once and deleted only after the last + // unregistration. + // + // State lives in three RRM fields declared at the top of the class: + // offloadCheckpointDir, offloadCheckpointExecutor, offloadCheckpoints. 
+ // ───────────────────────────────────────────────────────────────────── + + /** + * Per-timestamp state aggregating reference count, in-flight creation + * future, and the resulting file path. Doubles as the synchronization + * monitor for all transitions on this timestamp's lifecycle. + */ + private static final class OffloadCheckpoint { - try + final AtomicInteger refCount = new AtomicInteger(0); + /** Set once createOffloadCheckpoint successfully commits the file; null otherwise. */ + volatile String filePath; + /** Tracks the in-flight creation task; cleared lazily on retry after failure. */ + volatile CompletableFuture future; + } + + /** + * Long-running queries register an "Offload" status to ensure that the + * required visibility checkpoint is created. Concurrent registrations of + * the same timestamp are reference-counted and share a single checkpoint + * file, which has virtually no impact on queries since long-running + * transactions do not need newly written data. + */ + public void registerOffload(long timestamp) throws RetinaException + { + OffloadCheckpoint cp = offloadCheckpoints.computeIfAbsent(timestamp, k -> new OffloadCheckpoint()); + CompletableFuture future; + + synchronized (cp) { - Storage storage = StorageFactory.Instance().getStorage(checkpointDir); - if (!storage.exists(checkpointDir)) + cp.refCount.incrementAndGet(); + + if (cp.filePath != null) { - storage.mkdirs(checkpointDir); + logger.info("Registered offload for Timestamp: {} (already exists)", timestamp); return; } - List allFiles = storage.listPaths(checkpointDir); - // filter only .bin files - allFiles = allFiles.stream().filter(p -> p.endsWith(".bin")).collect(Collectors.toList()); + future = cp.future; + if (future != null && future.isCompletedExceptionally()) + { + // Previous attempt failed; drop the stale future so this caller retries. 
+ cp.future = null; + future = null; + } - List gcTimestamps = new ArrayList<>(); - String offloadPrefix = RetinaUtils.getCheckpointPrefix(RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD, retinaHostName); - String gcPrefix = RetinaUtils.getCheckpointPrefix(RetinaUtils.CHECKPOINT_PREFIX_GC, retinaHostName); + if (future == null) + { + future = createOffloadCheckpoint(timestamp, cp); + cp.future = future; + } + } - for (String path : allFiles) + try + { + future.join(); + logger.info("Registered offload for Timestamp: {}", timestamp); + } + catch (Exception e) + { + synchronized (cp) { - // use Paths.get().getFileName() to extract filename from path string - String filename = Paths.get(path).getFileName().toString(); - if (filename.startsWith(offloadPrefix)) - { - // delete offload checkpoint files when restarting - try - { - storage.delete(path, false); - } catch (IOException e) - { - logger.error("Failed to delete checkpoint file {}", path, e); - } - } else if (filename.startsWith(gcPrefix)) + // Clear failed creation so later registration retries; drop unreferenced state. 
+ cp.future = null; + if (cp.refCount.decrementAndGet() <= 0) { - try - { - gcTimestamps.add(Long.parseLong(filename.replace(gcPrefix, "").replace(".bin", ""))); - } catch (Exception e) - { - logger.error("Failed to parse checkpoint timestamp from file {}", path, e); - } + offloadCheckpoints.remove(timestamp, cp); } } + throw new RetinaException("Failed to create checkpoint for timestamp: " + timestamp, e); + } + } - if (gcTimestamps.isEmpty()) + public void unregisterOffload(long timestamp) + { + OffloadCheckpoint cp = offloadCheckpoints.get(timestamp); + if (cp == null) + { + return; + } + synchronized (cp) + { + if (cp.refCount.decrementAndGet() > 0) { return; } + offloadCheckpoints.remove(timestamp); + deleteOffloadCheckpoint(timestamp); + logger.info("Offload checkpoint for timestamp {} removed.", timestamp); + } + } - Collections.sort(gcTimestamps); - long latestTs = gcTimestamps.get(gcTimestamps.size() - 1); - this.latestGcTimestamp = latestTs; - logger.info("Loading system state from GC checkpoint: {}", latestTs); + public String getOffloadCheckpointPath(long timestamp) + { + OffloadCheckpoint cp = offloadCheckpoints.get(timestamp); + return cp == null ? null : cp.filePath; + } - // load to rgVisibilityMap - String latestPath = RetinaUtils.buildCheckpointPath( - checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, retinaHostName, latestTs); + /** + * Cleans up stale offload checkpoint files left over by previous runs of + * this node before the service opens for queries. Long-running queries + * that owned those checkpoints are no longer active after a restart, so + * the files are safe to drop. + * + *

Cross-restart RG visibility recovery is handled separately during + * startup checkpoint load; this method does not rebuild {@code rgVisibilityMap}. + */ + public void recoverOffloadCheckpoints() + { + try + { + Storage storage = StorageFactory.Instance().getStorage(offloadCheckpointDir); + if (!storage.exists(offloadCheckpointDir)) + { + storage.mkdirs(offloadCheckpointDir); + return; + } - try + String offloadPrefix = RetinaUtils.getCheckpointPrefix( + RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD, retinaHostName); + for (String path : storage.listPaths(offloadCheckpointDir)) { - Storage latestStorage = StorageFactory.Instance().getStorage(latestPath); - if (latestStorage.exists(latestPath)) + if (!path.endsWith(".bin")) { - final long ts = latestTs; - int rgCount = CheckpointFileIO.readCheckpointParallel(latestPath, entry -> { - addVisibility(entry.fileId, entry.rgId, entry.recordNum, ts, entry.bitmap, true); - }, checkpointExecutor); - - logger.info("Recovered {} RG entries from GC checkpoint", rgCount); + continue; + } + String filename = Paths.get(path).getFileName().toString(); + if (!filename.startsWith(offloadPrefix)) + { + continue; + } + try + { + storage.delete(path, false); + } + catch (IOException e) + { + logger.error("Failed to delete stale offload checkpoint file {}", path, e); } - } catch (IOException e) - { - logger.error("Failed to read checkpoint file", e); } + } + catch (IOException e) + { + logger.error("Failed to recover offload checkpoints", e); + } + } + + /** + * Two-phase checkpoint creation: + *

    + *
  1. Fold each RG's deletion chain at {@code timestamp} in parallel. + * A failure in any fold task surfaces through the returned future + * (no swallowed errors, no waiting on the writer's 60s timeout).
  2. + *
  3. Once all bitmaps are ready, drain them into the queue and write + * the file. On any failure the partial file is removed via the + * {@code whenComplete} side effect.
  4. + *
+ * + *

Concurrency safety: {@link #registerOffload} guarantees at most one + * in-flight future per OffloadCheckpoint via {@code synchronized(cp)} + + * single-writer of {@code cp.future}, so no file-level locking is needed. + */ + private CompletableFuture createOffloadCheckpoint(long timestamp, OffloadCheckpoint cp) + { + String filePath = RetinaUtils.buildCheckpointPath( + offloadCheckpointDir, RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD, retinaHostName, timestamp); + + List> entries = new ArrayList<>(rgVisibilityMap.entrySet()); + int totalRgs = entries.size(); + logger.info("Starting offload checkpoint for {} RGs at timestamp {}", totalRgs, timestamp); + + List> bitmapFutures = new ArrayList<>(totalRgs); + for (Map.Entry entry : entries) + { + bitmapFutures.add(CompletableFuture.supplyAsync(() -> { + String key = entry.getKey(); + long fileId = RetinaUtils.parseFileIdFromRgKey(key); + int rgId = RetinaUtils.parseRgIdFromRgKey(key); + RGVisibility rgVisibility = entry.getValue(); + long[] bitmap = rgVisibility.getVisibilityBitmap(timestamp); + return new CheckpointFileIO.CheckpointEntry( + fileId, rgId, (int) rgVisibility.getRecordNum(), bitmap); + }, offloadCheckpointExecutor)); + } + + return CompletableFuture + .allOf(bitmapFutures.toArray(new CompletableFuture[0])) + .thenRunAsync(() -> { + long startWrite = System.currentTimeMillis(); + BlockingQueue queue = + new ArrayBlockingQueue<>(Math.max(1, totalRgs)); + try + { + for (CompletableFuture f : bitmapFutures) + { + queue.put(f.join()); + } + CheckpointFileIO.writeCheckpoint(filePath, totalRgs, queue); + long endWrite = System.currentTimeMillis(); + logger.info("Writing offload checkpoint file to {} took {} ms", + filePath, (endWrite - startWrite)); + cp.filePath = filePath; + } + catch (Exception e) + { + throw new CompletionException(e); + } + }, offloadCheckpointExecutor) + .whenComplete((unused, throwable) -> { + if (throwable != null) + { + logger.error("Failed to create offload checkpoint for timestamp: {}", + 
timestamp, throwable); + deleteOffloadCheckpoint(timestamp); + } + }); + } + + private void deleteOffloadCheckpoint(long timestamp) + { + String path = RetinaUtils.buildCheckpointPath( + offloadCheckpointDir, RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD, retinaHostName, timestamp); - // delete old GC checkpoint files - for (int i = 0; i < gcTimestamps.size() - 1; i++) + try + { + Storage storage = StorageFactory.Instance().getStorage(path); + if (storage.exists(path)) { - removeCheckpointFile(gcTimestamps.get(i), CheckpointType.GC); + storage.delete(path, false); } - } catch (IOException e) + } + catch (IOException e) { - logger.error("Failed to recover checkpoints", e); + logger.warn("Failed to delete offload checkpoint file {}", path, e); } } } diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java index fbc6da0e22..6669ceaf78 100644 --- a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGarbageCollector.java @@ -23,10 +23,9 @@ import io.pixelsdb.pixels.common.exception.MetadataException; import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.index.IndexOption; -import io.pixelsdb.pixels.common.index.MainIndex; -import io.pixelsdb.pixels.common.index.MainIndexFactory; -import io.pixelsdb.pixels.common.index.RowIdRange; -import io.pixelsdb.pixels.common.index.SinglePointIndexFactory; +import io.pixelsdb.pixels.common.index.ResolvedPrimary; +import io.pixelsdb.pixels.common.index.RollbackEntry; +import io.pixelsdb.pixels.common.index.service.IndexService; import io.pixelsdb.pixels.common.metadata.MetadataService; import io.pixelsdb.pixels.common.metadata.domain.File; import io.pixelsdb.pixels.common.metadata.domain.KeyColumns; @@ -36,7 +35,6 @@ import io.pixelsdb.pixels.common.metadata.domain.Table; import 
io.pixelsdb.pixels.common.physical.Storage; import io.pixelsdb.pixels.common.physical.StorageFactory; - import io.pixelsdb.pixels.common.utils.NetUtils; import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; import io.pixelsdb.pixels.common.utils.RetinaUtils; @@ -53,9 +51,6 @@ import io.pixelsdb.pixels.core.vector.LongColumnVector; import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; import io.pixelsdb.pixels.index.IndexProto; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -67,8 +62,13 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + /** * Storage GC: identifies high-deletion-ratio files and rewrites them @@ -84,6 +84,7 @@ public class StorageGarbageCollector private final RetinaResourceManager resourceManager; private final MetadataService metadataService; + private final IndexService indexService; private final double gcThreshold; private final long targetFileSize; private final int maxFilesPerGroup; @@ -91,6 +92,7 @@ public class StorageGarbageCollector private final int rowGroupSize; private final EncodingLevel encodingLevel; private final long retireDelayMs; + private final StorageGcWal wal; // ------------------------------------------------------------------------- // Value types @@ -208,14 +210,16 @@ static final class RewriteResult /** Set by {@link #syncIndex} after allocating new rowIds. */ long newRowIdStart = -1; + /** Set by {@link #syncIndex}: open WAL writer; closed by commitFileGroup or rollback. */ + StorageGcWal.Writer walWriter; /** * Set by {@link #syncIndex} after updating SinglePointIndex; old rowIds that were replaced. *
* Alignment invariant: {@code oldRowIds.size() == pendingIndexEntries.size()}; each * slot corresponds 1:1 to the same-position entry in {@link #pendingIndexEntries}. Slots - * where {@link io.pixelsdb.pixels.common.index.SinglePointIndex#updatePrimaryEntry} returned - * a negative value (i.e. no prior entry to replace) are stored as {@code -1L} placeholders, - * so that rollback can pair each {@code PendingIndexEntry} with its own old rowId. + * where {@link IndexService#resolvePrimary} returned an empty optional (i.e. no prior entry + * to replace) are stored as {@code -1L} placeholders, so that rollback can pair each + * {@code PendingIndexEntry} with its own old rowId. */ List oldRowIds; @@ -243,16 +247,19 @@ static final class RewriteResult StorageGarbageCollector(RetinaResourceManager resourceManager, MetadataService metadataService, + IndexService indexService, double gcThreshold, long targetFileSize, int maxFilesPerGroup, int maxFileGroupsPerRun, int rowGroupSize, EncodingLevel encodingLevel, - long retireDelayMs) + long retireDelayMs, + StorageGcWal wal) { this.resourceManager = resourceManager; this.metadataService = metadataService; + this.indexService = indexService; this.gcThreshold = gcThreshold; this.targetFileSize = targetFileSize; this.maxFilesPerGroup = maxFilesPerGroup; @@ -260,6 +267,7 @@ static final class RewriteResult this.rowGroupSize = rowGroupSize; this.encodingLevel = encodingLevel; this.retireDelayMs = retireDelayMs; + this.wal = wal; } // ------------------------------------------------------------------------- @@ -381,7 +389,7 @@ List scanAndGroupFiles(Set candidateFileIds, List files; try { - files = metadataService.getFiles(path.getId()); + files = metadataService.getRegularFiles(path.getId()); } catch (MetadataException e) { @@ -405,11 +413,9 @@ List scanAndGroupFiles(Set candidateFileIds, continue; } + // stats is guaranteed non-null with stats[0] > 0: candidateFileIds was + // built in runStorageGC from these same fileStats 
entries. long[] stats = fileStats.get(file.getId()); - if (stats == null || stats[0] == 0) - { - continue; - } double invalidRatio = (double) stats[1] / stats[0]; long sizeBytes; @@ -607,7 +613,7 @@ void processFileGroups(List fileGroups, long safeGcTs, * Rewrites all files in one {@link FileGroup} into a single new file, filtering out * rows marked as deleted in {@code gcSnapshotBitmaps}. * - *

The new file is registered as {@code TEMPORARY} in the catalog and its + *

The new file is registered as {@code TEMPORARY_GC} in the catalog and its * {@link RGVisibility} objects are initialised with {@code baseTimestamp = safeGcTs}. * *

After rewriting completes the {@code gcSnapshotBitmaps} entries for this group @@ -877,7 +883,7 @@ RewriteResult rewriteFileGroup(FileGroup group, long safeGcTs, backwardInfos.add(new BackwardInfo(fc.fileId, bwdMappings, oldFileRgRowStart)); } - // Register the new file as TEMPORARY in the catalog and initialise Visibility. + // Register the new file as TEMPORARY_GC in the catalog and initialise Visibility. // Track registration progress so that partial state can be cleaned up on failure. long newFileId = -1; int registeredRgCount = 0; @@ -891,12 +897,15 @@ RewriteResult rewriteFileGroup(FileGroup group, long safeGcTs, } File newFile = new File(); newFile.setName(newFileName); - newFile.setType(File.Type.TEMPORARY); + newFile.setType(File.Type.TEMPORARY_GC); newFile.setNumRowGroup(newFileRgCount); newFile.setMinRowId(minRowId); newFile.setMaxRowId(maxRowId); newFile.setPathId(group.files.get(0).file.getPathId()); - metadataService.addFiles(Collections.singletonList(newFile)); + if (!metadataService.addFiles(Collections.singletonList(newFile))) + { + throw new MetadataException("failed to add metadata for GC rewrite file " + newFilePath); + } newFileId = metadataService.getFileId(newFilePath); for (int rgId = 0; rgId < newFileRgCount; rgId++) @@ -917,7 +926,7 @@ RewriteResult rewriteFileGroup(FileGroup group, long safeGcTs, } /** - * Best-effort cleanup of a partially-created TEMPORARY file. Removes the + * Best-effort cleanup of a partially-created TEMPORARY_GC file. Removes the * catalog record, the physical file, and any RGVisibility keys that were * registered before the failure. 
*/ @@ -939,11 +948,14 @@ private void cleanupTemporaryFile(Storage storage, String newFilePath, } try { - metadataService.deleteFiles(Collections.singletonList(newFileId)); + if (!metadataService.deleteFiles(Collections.singletonList(newFileId))) + { + logger.warn("StorageGC cleanup: catalog delete returned false for fileId={}", newFileId); + } } - catch (Exception ex) + catch (MetadataException ex) { - logger.warn("StorageGC cleanup: failed to delete catalog entry for fileId={}", newFileId, ex); + logger.warn("StorageGC cleanup: metadata service failed to delete catalog entry for fileId={}", newFileId, ex); } } try @@ -980,10 +992,6 @@ void syncVisibility(RewriteResult result, long safeGcTs) throws RetinaException for (FileCandidate fc : result.group.files) { Map fileMapping = result.forwardRgMappings.get(fc.fileId); - if (fileMapping == null) - { - continue; - } for (int rgId = 0; rgId < fc.rgCount; rgId++) { long[] items = resourceManager.exportChainItemsAfter(fc.fileId, rgId, safeGcTs); @@ -992,10 +1000,6 @@ void syncVisibility(RewriteResult result, long safeGcTs) throws RetinaException continue; } int[] fwdMapping = fileMapping.get(rgId); - if (fwdMapping == null) - { - continue; - } for (int i = 0; i < items.length; i += 2) { @@ -1093,21 +1097,31 @@ void syncIndex(RewriteResult result, long tableId) throws Exception return; } - MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(tableId); - IndexProto.RowIdBatch rowIdBatch = mainIndex.allocateRowIdBatch(tableId, totalRows); + long primaryIndexId = metadataService.getPrimaryIndex(tableId).getId(); + IndexOption indexOption = IndexOption.builder().vNodeId(result.group.virtualNodeId).build(); + + IndexProto.RowIdBatch rowIdBatch = indexService.allocateRowIdBatch(tableId, totalRows); long newRowIdStart = rowIdBatch.getRowIdStart(); result.newRowIdStart = newRowIdStart; - insertMainIndexEntries(result, mainIndex, newRowIdStart); + insertMainIndexEntries(result, tableId, primaryIndexId, indexOption, 
newRowIdStart); + + String journalTaskId = RetinaUtils.buildStorageGcJournalTaskId( + tableId, result.group.virtualNodeId, result.newFileId); + List oldFileIds = result.group.files.stream() + .map(fc -> fc.fileId).collect(Collectors.toList()); + result.walWriter = wal.createTask(journalTaskId, tableId, + result.group.virtualNodeId, oldFileIds, result.newFileId, result.newFilePath, + newRowIdStart, totalRows); if (!result.pendingIndexEntries.isEmpty()) { - result.oldRowIds = updateSinglePointIndex(result, tableId, newRowIdStart); + result.oldRowIds = updateSinglePointIndex(result, tableId, primaryIndexId, indexOption, newRowIdStart); } } - private void insertMainIndexEntries(RewriteResult result, MainIndex mainIndex, - long newRowIdStart) throws Exception + private void insertMainIndexEntries(RewriteResult result, long tableId, long primaryIndexId, + IndexOption indexOption, long newRowIdStart) throws Exception { int totalRows = result.newFileRgRowStart[result.newFileRgCount]; List entries = new ArrayList<>(totalRows); @@ -1126,39 +1140,66 @@ private void insertMainIndexEntries(RewriteResult result, MainIndex mainIndex, .setFileId(result.newFileId).setRgId(curRgId).setRgRowOffset(rgOff)) .build()); } - mainIndex.putEntries(entries); - mainIndex.flushCache(result.newFileId); + indexService.putMainIndexEntriesOnly(tableId, entries); + indexService.flushIndexEntriesOfFile(tableId, primaryIndexId, result.newFileId, true, indexOption); } - private List updateSinglePointIndex(RewriteResult result, long tableId, - long newRowIdStart) throws Exception + /** + * Mirrors Retina's write-path "resolve + Only" pattern: one batch resolve to capture + * pre-update rowIds (recorded for rollback), then one batch updatePrimaryIndexEntriesOnly + * to swing the primary pointers onto the freshly allocated rowIds. + * + *

TODO(concurrency): This pair of calls is not atomic, unlike the previous single-shot + * {@code SinglePointIndex#updatePrimaryEntry} (per-key atomic getAndSet). If a concurrent + * writer mutates the same primary key between {@code resolvePrimary} and + * {@code updatePrimaryIndexEntriesOnly}, the {@code oldRowIds} we record can be stale w.r.t. + * the value actually clobbered by our update. Rollback is still safe — {@code restorePrimaryIndexEntries} + * only writes back when the current pointer still equals our {@code newRowId}, so concurrent + * writes that ran after our update are never overwritten — but a rollback in the narrow + * resolve→update window can restore a stale {@code oldRowId} instead of the concurrent + * writer's value. This matches the rest of Retina's write path and is acceptable here because + * Storage GC by design targets files dominated by deleted rows. Revisit if/when + * {@code IndexService} grows a batch API that returns the rowIds atomically replaced. + */ + private List updateSinglePointIndex(RewriteResult result, long tableId, long primaryIndexId, + IndexOption indexOption, long newRowIdStart) throws Exception { - io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex primaryIndex = - metadataService.getPrimaryIndex(tableId); - IndexOption indexOption = IndexOption.builder().vNodeId(result.group.virtualNodeId).build(); - io.pixelsdb.pixels.common.index.SinglePointIndex spIndex = - SinglePointIndexFactory.Instance().getSinglePointIndex( - tableId, primaryIndex.getId(), indexOption); - - // Keep oldRowIds aligned 1:1 with pendingIndexEntries: slots where - // updatePrimaryEntry returned a negative value are stored as -1L placeholders. - // rollbackSinglePointIndex relies on this alignment to pair each PendingIndexEntry - // with its own old rowId. 
- List oldRowIds = new ArrayList<>(result.pendingIndexEntries.size()); + int size = result.pendingIndexEntries.size(); + List keys = new ArrayList<>(size); + List entries = new ArrayList<>(size); for (PendingIndexEntry pe : result.pendingIndexEntries) { - long newRowId = newRowIdStart + pe.newGlobalRowOffset; IndexProto.IndexKey key = IndexProto.IndexKey.newBuilder() - .setTableId(tableId).setIndexId(primaryIndex.getId()) + .setTableId(tableId).setIndexId(primaryIndexId) .setKey(pe.pkBytes).setTimestamp(pe.createTs).build(); - long oldRowId = spIndex.updatePrimaryEntry(key, newRowId); + keys.add(key); + entries.add(IndexProto.PrimaryIndexEntry.newBuilder() + .setIndexKey(key) + .setRowId(newRowIdStart + pe.newGlobalRowOffset) + .build()); + } + + List> resolved = + indexService.resolvePrimary(tableId, primaryIndexId, keys, indexOption); + List oldRowIds = new ArrayList<>(size); + for (int i = 0; i < size; i++) + { + long oldRowId = resolved.get(i).map(ResolvedPrimary::getRowId).orElse(-1L); oldRowIds.add(oldRowId); if (oldRowId < 0) { - logger.warn("StorageGC syncIndex: updatePrimaryEntry returned {} for tableId={}, " + - "newGlobalRowOffset={} — index may be inconsistent", oldRowId, tableId, pe.newGlobalRowOffset); + logger.warn("StorageGC syncIndex: no resolvable primary for tableId={}, " + + "newGlobalRowOffset={} — index may be inconsistent", + tableId, result.pendingIndexEntries.get(i).newGlobalRowOffset); } + result.walWriter.appendRollbackEntry( + keys.get(i), oldRowId, entries.get(i).getRowId()); } + // Flush all rollback entries to durable storage before the batch index update so + // that recovery can restore the old rowIds if we crash between this flush and commit. 
+ result.walWriter.flush(); + + indexService.updatePrimaryIndexEntriesOnly(tableId, primaryIndexId, entries, indexOption); return oldRowIds; } @@ -1167,17 +1208,18 @@ private List updateSinglePointIndex(RewriteResult result, long tableId, // ------------------------------------------------------------------------- /** - * Atomically promotes the new TEMPORARY file to REGULAR, deletes old files from + * Atomically promotes the new TEMPORARY_GC file to REGULAR, retires old files in * the catalog, unregisters dual-write, and enqueues the old files for delayed cleanup. */ void commitFileGroup(RewriteResult result) throws Exception { List oldFileIds = result.group.files.stream() .map(fc -> fc.fileId).collect(Collectors.toList()); + long retireDeadline = System.currentTimeMillis() + retireDelayMs; try { - metadataService.atomicSwapFiles(result.newFileId, oldFileIds); + metadataService.atomicSwapFiles(result.newFileId, oldFileIds, retireDeadline); } catch (Exception e) { @@ -1192,9 +1234,14 @@ void commitFileGroup(RewriteResult result) throws Exception } } + if (result.walWriter != null) + { + result.walWriter.markSwapped(); + result.walWriter = null; + } + unregisterDualWrite(result); - long retireDeadline = System.currentTimeMillis() + retireDelayMs; for (FileCandidate fc : result.group.files) { resourceManager.scheduleRetiredFile( @@ -1231,18 +1278,22 @@ void rollback(RewriteResult result) rollbackSinglePointIndex(result); } - if (result.newRowIdStart > 0) + // Only delete the MainIndex range if rowIds were actually allocated. newRowIdStart + // stays -1 when rollback runs before syncIndex's allocateRowIdBatch (e.g. syncVisibility + // threw); deleting [newRowIdStart, newRowIdStart+totalRows) with newRowIdStart=-1 would + // wipe an unrelated global rowId band (deleteRowIdRange is not scoped by fileId). + int totalRows = result.newFileRgRowStart == null ? 
0 : result.newFileRgRowStart[result.newFileRgCount]; + if (result.newRowIdStart >= 0) { try { - int totalRows = result.newFileRgRowStart[result.newFileRgCount]; - MainIndex mainIndex = MainIndexFactory.Instance().getMainIndex(result.group.tableId); - mainIndex.deleteRowIdRange(new RowIdRange(result.newRowIdStart, - result.newRowIdStart + totalRows, result.newFileId, 0, 0, totalRows)); + indexService.deleteMainIndexRange(result.group.tableId, result.newFileId, + result.newRowIdStart, totalRows); } catch (Exception ex) { - logger.warn("Rollback: failed to clean MainIndex for fileId={}", result.newFileId, ex); + logger.warn("Rollback: failed to delete MainIndex range for fileId={}, rowIdStart={}, rowCount={}", + result.newFileId, result.newRowIdStart, totalRows, ex); } } @@ -1263,11 +1314,14 @@ void rollback(RewriteResult result) try { - metadataService.deleteFiles(Collections.singletonList(result.newFileId)); + if (!metadataService.deleteFiles(Collections.singletonList(result.newFileId))) + { + logger.warn("Rollback: catalog delete returned false for fileId={}", result.newFileId); + } } - catch (Exception ex) + catch (MetadataException ex) { - logger.warn("Rollback: failed to delete catalog entry for fileId={}", result.newFileId, ex); + logger.warn("Rollback: metadata service failed to delete catalog entry for fileId={}", result.newFileId, ex); } try @@ -1282,11 +1336,29 @@ void rollback(RewriteResult result) { logger.warn("Rollback: failed to delete physical file {}", result.newFilePath, ex); } + } catch (Exception e) { logger.error("Rollback failed for FileGroup tableId={}", result.group.tableId, e); } + finally + { + if (result.walWriter != null) + { + try + { + result.walWriter.markAborted(); + } + catch (IOException e) + { + logger.warn("Rollback: failed to write ABORTED to WAL for taskId={}", + result.walWriter.getTaskId(), e); + try { result.walWriter.close(); } catch (IOException ignored) {} + } + result.walWriter = null; + } + } } private void 
rollbackSinglePointIndex(RewriteResult result) @@ -1299,10 +1371,8 @@ private void rollbackSinglePointIndex(RewriteResult result) { return; } + long primaryIndexId = primaryIndex.getId(); IndexOption indexOption = IndexOption.builder().vNodeId(result.group.virtualNodeId).build(); - io.pixelsdb.pixels.common.index.SinglePointIndex spIndex = - SinglePointIndexFactory.Instance().getSinglePointIndex( - result.group.tableId, primaryIndex.getId(), indexOption); // Alignment invariant: oldRowIds.size() == pendingIndexEntries.size() // (established in updateSinglePointIndex). Walk them in lockstep by @@ -1315,6 +1385,7 @@ private void rollbackSinglePointIndex(RewriteResult result) "rolling back the common prefix only — index may remain inconsistent", result.pendingIndexEntries.size(), result.oldRowIds.size()); } + List rollbackEntries = new ArrayList<>(n); for (int i = 0; i < n; i++) { long oldRowId = result.oldRowIds.get(i); @@ -1324,9 +1395,15 @@ private void rollbackSinglePointIndex(RewriteResult result) } PendingIndexEntry pe = result.pendingIndexEntries.get(i); IndexProto.IndexKey key = IndexProto.IndexKey.newBuilder() - .setTableId(result.group.tableId).setIndexId(primaryIndex.getId()) + .setTableId(result.group.tableId).setIndexId(primaryIndexId) .setKey(pe.pkBytes).setTimestamp(pe.createTs).build(); - spIndex.updatePrimaryEntry(key, oldRowId); + long newRowId = result.newRowIdStart + pe.newGlobalRowOffset; + rollbackEntries.add(new RollbackEntry(key, oldRowId, newRowId)); + } + if (!rollbackEntries.isEmpty()) + { + indexService.restorePrimaryIndexEntries( + result.group.tableId, primaryIndexId, rollbackEntries, indexOption); } } catch (Exception e) @@ -1373,19 +1450,14 @@ void processFileGroup(FileGroup group, long safeGcTs, { logger.error("StorageGC failed for FileGroup tableId={}, vNodeId={}", group.tableId, group.virtualNodeId, e); - releaseGroupBitmaps(group, gcSnapshotBitmaps); - rollback(result); - } - } - - private void releaseGroupBitmaps(FileGroup group, 
Map gcSnapshotBitmaps) - { - for (FileCandidate fc : group.files) - { - for (int rgId = 0; rgId < fc.rgCount; rgId++) + for (FileCandidate fc : group.files) { - gcSnapshotBitmaps.remove(RetinaUtils.buildRgKey(fc.fileId, rgId)); + for (int rgId = 0; rgId < fc.rgCount; rgId++) + { + gcSnapshotBitmaps.remove(RetinaUtils.buildRgKey(fc.fileId, rgId)); + } } + rollback(result); } } } diff --git a/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGcWal.java b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGcWal.java new file mode 100644 index 0000000000..4d0423497d --- /dev/null +++ b/pixels-retina/src/main/java/io/pixelsdb/pixels/retina/StorageGcWal.java @@ -0,0 +1,909 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.retina; + +import io.pixelsdb.pixels.common.exception.IndexException; +import io.pixelsdb.pixels.common.exception.MetadataException; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.index.IndexOption; +import io.pixelsdb.pixels.common.index.RollbackEntry; +import io.pixelsdb.pixels.common.index.service.IndexService; +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.common.utils.Constants; +import io.pixelsdb.pixels.common.utils.RetinaUtils; +import io.pixelsdb.pixels.index.IndexProto; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +/** + * Append-only write-ahead log for Storage GC rewrite operations. + * + *

Each GC task is stored as a single WAL file under {@code retina.storage.gc.journal.dir}. + * The file is a stream of length-prefixed records: + *

+ *   [type: 1 byte][payloadLen: 4 bytes][payload: payloadLen bytes] ...
+ * 
+ * Record types: + * <ul>
+ * <li>{@code CREATE (1)}: written once when the task is opened; carries all task metadata.</li>
+ * <li>{@code ROLLBACK_ENTRY (2)}: appended once per kept row's primary-key switch, + * before the batch index update, so that recovery can restore the old rowId.</li>
+ * <li>{@code STATE_TRANSITION (3)}: appended when the task reaches a terminal or + * intermediate state.</li>
+ * </ul>
+ * + *

State machine: {@code INDEX_SWITCHING → SWAPPED_NOT_CHECKPOINTED → CHECKPOINTED}; + * either pending state ({@code INDEX_SWITCHING} or {@code SWAPPED_NOT_CHECKPOINTED}) may instead end in {@code ABORTED}. + * + *

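A truncated record after {@code CREATE} at the end of a file (crash mid-write) is dropped with a logged warning + * during replay; the task state is reconstructed from the last complete record. A truncated {@code CREATE} record is instead reported as a corrupted WAL file. + * + *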

This class is not thread-safe for concurrent read/write access. Callers must + * serialize WAL mutation, replay, and deletion for the WAL directory. Current + * production use relies on startup recovery finishing before background GC starts, + * and on the background GC cycle running WAL writes and reconciliation in one + * single-threaded executor. + */ +public final class StorageGcWal +{ + private static final Logger logger = LogManager.getLogger(StorageGcWal.class); + + // ─── Record type codes ────────────────────────────────────────────────── + static final byte RT_CREATE = 1; + static final byte RT_ROLLBACK_ENTRY = 2; + static final byte RT_STATE_TRANSITION = 3; + + // ─── State ────────────────────────────────────────────────────────────── + + public enum State + { + /** Primary index entries are being switched from old to new rowIds. */ + INDEX_SWITCHING((byte) 1), + /** All index switches done, but the new file is not yet in a recovery checkpoint. */ + SWAPPED_NOT_CHECKPOINTED((byte) 2), + /** New file confirmed in the checkpoint baseline; the rewrite is permanently committed. */ + CHECKPOINTED((byte) 3), + /** Rolled back; old files and index entries have been restored. 
*/ + ABORTED((byte) 4); + + final byte code; + + State(byte code) + { + this.code = code; + } + + static State fromCode(byte code) + { + for (State s : values()) + { + if (s.code == code) + { + return s; + } + } + throw new IllegalArgumentException("Unknown WAL state code: " + code); + } + + public boolean isPending() + { + return this == INDEX_SWITCHING || this == SWAPPED_NOT_CHECKPOINTED; + } + + public boolean isTerminal() + { + return this == CHECKPOINTED || this == ABORTED; + } + } + + // ─── Task (read-only view, reconstructed from WAL replay) ─────────────── + + public static final class Task + { + private final String taskId; + private final long tableId; + private final int virtualNodeId; + private final List oldFileIds; + private final long newFileId; + private final String newFilePath; + private final long newRowIdStart; + private final int newRowCount; + private final List rollbackEntries; + private final State state; + + Task(String taskId, long tableId, int virtualNodeId, + List oldFileIds, long newFileId, String newFilePath, + long newRowIdStart, int newRowCount, + List rollbackEntries, State state) + { + this.taskId = taskId; + this.tableId = tableId; + this.virtualNodeId = virtualNodeId; + this.oldFileIds = Collections.unmodifiableList(new ArrayList<>(oldFileIds)); + this.newFileId = newFileId; + this.newFilePath = newFilePath == null ? 
"" : newFilePath; + this.newRowIdStart = newRowIdStart; + this.newRowCount = newRowCount; + this.rollbackEntries = Collections.unmodifiableList(new ArrayList<>(rollbackEntries)); + this.state = state; + } + + public String getTaskId() { return taskId; } + public long getTableId() { return tableId; } + public int getVirtualNodeId() { return virtualNodeId; } + public List getOldFileIds() { return oldFileIds; } + public long getNewFileId() { return newFileId; } + public String getNewFilePath() { return newFilePath; } + public long getNewRowIdStart() { return newRowIdStart; } + public int getNewRowCount() { return newRowCount; } + public List getRollbackEntries() { return rollbackEntries; } + public State getState() { return state; } + } + + // ─── Writer (per-task write handle) ───────────────────────────────────── + + /** + * Write handle for one GC task's WAL file. Holds an open append stream for the + * duration of the {@code syncIndex} → {@code commitFileGroup/rollback} window. + * Calling {@link #markSwapped()} or {@link #markAborted()} writes the final state + * record and closes the stream automatically. + */ + public static final class Writer implements Closeable + { + private final String taskId; + private DataOutputStream out; + + Writer(String taskId, DataOutputStream out) + { + this.taskId = taskId; + this.out = out; + } + + public String getTaskId() + { + return taskId; + } + + /** + * Appends one rollback entry. Must be called before the corresponding index + * pointer is updated so that recovery can restore the old rowId. 
+ */ + public void appendRollbackEntry(IndexProto.IndexKey indexKey, + long oldRowId, long newRowId) throws IOException + { + byte[] keyBytes = indexKey.toByteArray(); + int payloadLen = 4 + keyBytes.length + 16; + out.writeByte(RT_ROLLBACK_ENTRY); + out.writeInt(payloadLen); + out.writeInt(keyBytes.length); + out.write(keyBytes); + out.writeLong(oldRowId); + out.writeLong(newRowId); + } + + /** + * Flushes buffered data to the underlying storage. Call this after all + * rollback entries have been appended and before the batch index update, + * to ensure crash-safety of the undo log. + */ + public void flush() throws IOException + { + out.flush(); + } + + /** Writes {@code SWAPPED_NOT_CHECKPOINTED} record and closes the stream. */ + public void markSwapped() throws IOException + { + writeStateTransition(out, State.SWAPPED_NOT_CHECKPOINTED); + close(); + } + + /** Writes {@code ABORTED} record and closes the stream. */ + public void markAborted() throws IOException + { + writeStateTransition(out, State.ABORTED); + close(); + } + + @Override + public void close() throws IOException + { + if (out != null) + { + out.flush(); + out.close(); + out = null; + } + } + } + + // ─── Fields & constructor ──────────────────────────────────────────────── + + private final Storage storage; + private final String walDir; + + public StorageGcWal() + { + this.walDir = ConfigFactory.Instance().getProperty("retina.storage.gc.journal.dir"); + try + { + this.storage = StorageFactory.Instance().getStorage(walDir); + } + catch (IOException e) + { + throw new IllegalStateException( + "Failed to initialize Storage GC WAL at " + walDir, e); + } + } + + // ─── Write path (GC hot path) ──────────────────────────────────────────── + + /** + * Creates a new WAL file for a task, writes the {@code CREATE} record, and + * returns an open {@link Writer} that the caller uses to append rollback entries + * and mark the final task state. 
+ */ + public Writer createTask(String taskId, long tableId, int virtualNodeId, + List oldFileIds, long newFileId, String newFilePath, + long newRowIdStart, int newRowCount) throws IOException + { + String path = walPath(taskId); + DataOutputStream out = storage.create(path, false, + Constants.STORAGE_GC_JOURNAL_BUFFER_SIZE); + try + { + writeCreateRecord(out, taskId, tableId, virtualNodeId, oldFileIds, + newFileId, newFilePath, newRowIdStart, newRowCount); + out.flush(); + } + catch (IOException e) + { + try { out.close(); } catch (IOException ignored) {} + throw e; + } + return new Writer(taskId, out); + } + + // ─── Cold-path state transitions (reconciliation / recovery) ──────────── + + /** Appends a {@code CHECKPOINTED} record to an existing task's WAL file. */ + public void markCheckpointed(String taskId) throws IOException + { + appendStateTransition(taskId, State.CHECKPOINTED); + } + + /** Appends an {@code ABORTED} record to an existing task's WAL file. */ + void markAborted(String taskId) throws IOException + { + appendStateTransition(taskId, State.ABORTED); + } + + private void appendStateTransition(String taskId, State state) throws IOException + { + try (DataOutputStream out = storage.append(walPath(taskId), + Constants.STORAGE_GC_JOURNAL_BUFFER_SIZE)) + { + writeStateTransition(out, state); + out.flush(); + } + } + + // ─── Query path ────────────────────────────────────────────────────────── + + /** Returns all tasks by replaying every WAL file in the WAL directory. 
*/ + public List listAllTasks() + { + try + { + if (!storage.exists(walDir)) + { + return Collections.emptyList(); + } + List tasks = new ArrayList<>(); + for (String path : storage.listPaths(walDir)) + { + if (path == null || !path.endsWith(RetinaUtils.STORAGE_GC_JOURNAL_SUFFIX)) + { + continue; + } + try + { + Task task = replayWal(path); + if (task != null) + { + tasks.add(task); + } + } + catch (IOException | RuntimeException e) + { + throw new IllegalStateException( + "Corrupted Storage GC WAL file at " + path + + ". Delete this file to allow recovery to proceed.", e); + } + } + return Collections.unmodifiableList(tasks); + } + catch (IOException e) + { + throw new IllegalStateException( + "Failed to list Storage GC WAL files under " + walDir, e); + } + } + + /** Returns the task for the given ID, or empty if the WAL file does not exist. */ + public Optional getTask(String taskId) + { + String path = walPath(taskId); + try + { + if (!storage.exists(path)) + { + return Optional.empty(); + } + Task task = replayWal(path); + return Optional.ofNullable(task); + } + catch (IOException e) + { + throw new IllegalStateException( + "Failed to read Storage GC WAL for taskId=" + taskId, e); + } + } + + /** Returns all file IDs referenced by tasks in a pending (non-terminal) state. */ + public Set collectPendingFileIds() + { + Set fileIds = new HashSet<>(); + for (Task task : listAllTasks()) + { + if (task.getState().isPending()) + { + fileIds.add(task.getNewFileId()); + fileIds.addAll(task.getOldFileIds()); + } + } + return Collections.unmodifiableSet(fileIds); + } + + /** Returns only tasks in a terminal state (CHECKPOINTED or ABORTED). */ + public List listTerminalTasks() + { + return listAllTasks().stream() + .filter(t -> t.getState().isTerminal()) + .collect(Collectors.toList()); + } + + // ─── Management ────────────────────────────────────────────────────────── + + /** + * Deletes the WAL files for the given task IDs. 
Only terminal tasks may be deleted; + * calling this with a non-terminal task ID throws {@link IllegalArgumentException}. + */ + public void deleteTerminalTasks(Collection taskIds) + { + if (taskIds == null || taskIds.isEmpty()) + { + return; + } + for (String taskId : taskIds) + { + Optional task = getTask(taskId); + if (!task.isPresent()) + { + continue; + } + if (!task.get().getState().isTerminal()) + { + throw new IllegalArgumentException( + "Cannot delete non-terminal WAL task " + taskId + + " in state " + task.get().getState()); + } + String path = walPath(taskId); + try + { + if (storage.exists(path)) + { + storage.delete(path, false); + } + } + catch (IOException e) + { + throw new IllegalStateException( + "Failed to delete Storage GC WAL file for taskId=" + taskId, e); + } + } + } + + // ─── Internal record I/O ───────────────────────────────────────────────── + + private static void writeCreateRecord(DataOutputStream out, + String taskId, long tableId, int virtualNodeId, + List oldFileIds, long newFileId, + String newFilePath, long newRowIdStart, + int newRowCount) throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(256); + DataOutputStream payload = new DataOutputStream(baos); + payload.writeUTF(taskId); + payload.writeLong(tableId); + payload.writeInt(virtualNodeId); + payload.writeInt(oldFileIds.size()); + for (Long id : oldFileIds) + { + payload.writeLong(id); + } + payload.writeLong(newFileId); + payload.writeUTF(newFilePath == null ? 
"" : newFilePath); + payload.writeLong(newRowIdStart); + payload.writeInt(newRowCount); + byte[] bytes = baos.toByteArray(); + out.writeByte(RT_CREATE); + out.writeInt(bytes.length); + out.write(bytes); + } + + private static void writeStateTransition(DataOutputStream out, State state) throws IOException + { + out.writeByte(RT_STATE_TRANSITION); + out.writeInt(1); + out.writeByte(state.code); + } + + /** + * Replays a WAL file and returns the reconstructed {@link Task}; this method never returns + * {@code null}. Throws {@link IllegalStateException} if the file is empty, or if the CREATE record is + * missing or truncated ({@code listAllTasks} wraps this in the operator-friendly error message). + */ + private Task replayWal(String path) throws IOException + { + try (DataInputStream in = storage.open(path)) + { + // ── CREATE record (mandatory, must be first) ────────────────── + int typeByte = in.read(); + if (typeByte == -1) + { + throw new IllegalStateException("Empty WAL file: no records"); + } + if ((byte) typeByte != RT_CREATE) + { + throw new IllegalStateException( + "First record must be CREATE, got type " + typeByte); + } + int createLen; + byte[] createPayload; + try + { + createLen = in.readInt(); + createPayload = new byte[createLen]; + in.readFully(createPayload); + } + catch (EOFException e) + { + throw new IllegalStateException("Truncated CREATE record", e); + } + + DataInputStream cp = new DataInputStream( + new ByteArrayInputStream(createPayload)); + String taskId = cp.readUTF(); + long tableId = cp.readLong(); + int virtualNodeId = cp.readInt(); + int oldFileCount = cp.readInt(); + List oldFileIds = new ArrayList<>(oldFileCount); + for (int i = 0; i < oldFileCount; i++) + { + oldFileIds.add(cp.readLong()); + } + long newFileId = cp.readLong(); + String newFilePath = cp.readUTF(); + long newRowIdStart = cp.readLong(); + int newRowCount = cp.readInt(); + + // ── Subsequent records (rollback entries + state transitions) ── + List rollbackEntries = new ArrayList<>(); + State state = 
State.INDEX_SWITCHING; + + while (true) + { + int rt = in.read(); + if (rt == -1) + { + break; // normal EOF + } + int rLen; + byte[] rPayload; + try + { + rLen = in.readInt(); + rPayload = new byte[rLen]; + in.readFully(rPayload); + } + catch (EOFException e) + { + logger.warn("WAL file {} has a truncated record of type {} at the end " + + "(possible crash mid-write); ignoring partial record", path, rt); + break; + } + + if ((byte) rt == RT_ROLLBACK_ENTRY) + { + DataInputStream re = new DataInputStream( + new ByteArrayInputStream(rPayload)); + int keyLen = re.readInt(); + byte[] keyBytes = new byte[keyLen]; + re.readFully(keyBytes); + long oldRowId = re.readLong(); + long newRowId = re.readLong(); + rollbackEntries.add(new RollbackEntry( + IndexProto.IndexKey.parseFrom(keyBytes), oldRowId, newRowId)); + } + else if ((byte) rt == RT_STATE_TRANSITION) + { + state = State.fromCode(rPayload[0]); + } + else + { + logger.warn("WAL file {} has unknown record type {}, skipping", path, rt); + } + } + + return new Task(taskId, tableId, virtualNodeId, oldFileIds, + newFileId, newFilePath, newRowIdStart, newRowCount, + rollbackEntries, state); + } + } + + private String walPath(String taskId) + { + return RetinaUtils.buildStorageGcJournalPath(walDir, taskId); + } + + // ─── RecoveryHandler ───────────────────────────────────────────────────── + + public static final class RecoveryHandler + { + private final StorageGcWal wal; + private final MetadataService metadataService; + private final IndexService indexService; + + public RecoveryHandler(StorageGcWal wal, + MetadataService metadataService, + IndexService indexService) + { + this.wal = Objects.requireNonNull(wal, "wal"); + this.metadataService = Objects.requireNonNull(metadataService, "metadataService"); + this.indexService = Objects.requireNonNull(indexService, "indexService"); + } + + public void recover(Set baselineVisibleFileIds) throws RetinaException + { + Set baseline = baselineVisibleFileIds == null + ? 
Collections.emptySet() : baselineVisibleFileIds; + + // A single pass over all tasks. The durable truth is the checkpoint baseline + // (which newFileIds are visible) plus the file catalog (REGULAR/RETIRED); the + // WAL state only records how far the non-atomic rewrite progressed. For a + // pending task the decision is uniform: if its newFile made it into the + // baseline the rewrite is committed → advance to CHECKPOINTED; otherwise roll + // back, restoring old files only when the catalog confirms the swap committed. + for (Task task : wal.listAllTasks()) + { + switch (task.getState()) + { + case CHECKPOINTED: + if (!baseline.contains(task.getNewFileId())) + { + throw new RetinaException( + "Storage GC WAL recovery failed: CHECKPOINTED task " + + task.getTaskId() + " newFileId=" + task.getNewFileId() + + " is absent from selected checkpoint baseline"); + } + break; + case ABORTED: + // Re-run unconditionally: the live rollback swallows per-step + // failures before writing ABORTED, so cleanup may be partial. + // cleanupNewFile is idempotent, so repeating it until + // deleteTerminalTasks removes the WAL is safe. + cleanupNewFile(task); + break; + case SWAPPED_NOT_CHECKPOINTED: + case INDEX_SWITCHING: + if (baseline.contains(task.getNewFileId())) + { + // Invariant: a checkpoint baseline can only contain a newFile + // whose task already reached SWAPPED_NOT_CHECKPOINTED. The GC + // cycle publishes the checkpoint (RRM.runGC Step 3) strictly + // after runStorageGC returns (Step 2), by which point every task + // has been marked SWAPPED or ABORTED; a crash inside the + // swap→markSwapped window aborts that whole cycle before any + // checkpoint is published. An INDEX_SWITCHING task in the + // baseline therefore means that ordering invariant was broken + // (e.g. storage GC made asynchronous, or checkpoint generation + // reordered ahead of the swap). Fail fast rather than committing + // a half-switched primary index. 
+ if (task.getState() == State.INDEX_SWITCHING) + { + throw new RetinaException( + "Storage GC WAL recovery failed: INDEX_SWITCHING task " + + task.getTaskId() + " newFileId=" + task.getNewFileId() + + " is present in the checkpoint baseline before its " + + "index switch was marked complete (invariant violation)"); + } + try { wal.markCheckpointed(task.getTaskId()); } + catch (IOException e) + { + throw new RetinaException( + "WAL recovery failed to mark CHECKPOINTED for taskId=" + + task.getTaskId(), e); + } + } + else + { + // restoreOldFiles is derived uniformly from the catalog. For a + // SWAPPED task the swap definitely committed, so isSwapCommitted + // returns true (its newFile stays REGULAR because + // collectPendingFileIds protects it from orphan-retirement); for + // an INDEX_SWITCHING task the swap may or may not have committed, + // so the catalog is the source of truth. + rollbackTask(task, isSwapCommitted(task)); + } + break; + default: + break; + } + } + } + + private void rollbackTask(Task task, boolean restoreOldFiles) throws RetinaException + { + restorePrimaryIndex(task); + if (restoreOldFiles) + { + restoreOldFileCatalog(task); + } + cleanupNewFile(task); + try { wal.markAborted(task.getTaskId()); } + catch (IOException e) + { + throw new RetinaException( + "WAL recovery failed to mark ABORTED for taskId=" + + task.getTaskId(), e); + } + } + + private void restorePrimaryIndex(Task task) throws RetinaException + { + List entries = new ArrayList<>(); + for (RollbackEntry entry : task.getRollbackEntries()) + { + if (entry.getOldRowId() >= 0) + { + entries.add(entry); + } + } + if (entries.isEmpty()) + { + return; + } + long primaryIndexId = getPrimaryIndexId(task.getTableId()); + IndexOption indexOption = IndexOption.builder() + .vNodeId(task.getVirtualNodeId()).build(); + try + { + indexService.restorePrimaryIndexEntries( + task.getTableId(), primaryIndexId, entries, indexOption); + } + catch (IndexException | UnsupportedOperationException e) + 
{ + throw new RetinaException( + "WAL recovery failed to restore primary index for taskId=" + + task.getTaskId(), e); + } + } + + private long getPrimaryIndexId(long tableId) throws RetinaException + { + try + { + SinglePointIndex primaryIndex = metadataService.getPrimaryIndex(tableId); + if (primaryIndex == null) + { + throw new RetinaException( + "WAL recovery failed: primary index not found for tableId=" + tableId); + } + return primaryIndex.getId(); + } + catch (MetadataException e) + { + throw new RetinaException( + "WAL recovery failed to load primary index for tableId=" + tableId, e); + } + } + + private void restoreOldFileCatalog(Task task) throws RetinaException + { + for (Long oldFileId : task.getOldFileIds()) + { + File file = loadRequiredFile(task, oldFileId, "old file"); + if (file.getType() == File.Type.REGULAR) + { + file.setCleanupAt(null); + updateFile(task, file, "clear cleanupAt for old file"); + continue; + } + if (file.getType() != File.Type.RETIRED) + { + throw new RetinaException( + "WAL recovery failed: old fileId=" + oldFileId + + " for taskId=" + task.getTaskId() + + " is " + file.getType() + ", expected RETIRED or REGULAR"); + } + file.setType(File.Type.REGULAR); + file.setCleanupAt(null); + updateFile(task, file, "restore old file catalog"); + } + } + + private void cleanupNewFile(Task task) throws RetinaException + { + if (task.getNewRowCount() > 0) + { + try + { + indexService.deleteMainIndexRange(task.getTableId(), task.getNewFileId(), + task.getNewRowIdStart(), task.getNewRowCount()); + } + catch (IndexException | UnsupportedOperationException e) + { + throw new RetinaException( + "WAL recovery failed to delete new MainIndex range for taskId=" + + task.getTaskId(), e); + } + } + + File file = loadOptionalFile(task.getNewFileId()); + if (file != null) + { + if (file.getType() == File.Type.RETIRED) + { + throw new RetinaException( + "WAL recovery failed: new fileId=" + task.getNewFileId() + + " for taskId=" + task.getTaskId() + " is 
RETIRED"); + } + try + { + if (!metadataService.deleteFiles( + Collections.singletonList(task.getNewFileId()))) + { + throw new RetinaException( + "deleteFiles returned false for newFileId=" + task.getNewFileId()); + } + } + catch (MetadataException e) + { + throw new RetinaException( + "WAL recovery failed to delete new file catalog for taskId=" + + task.getTaskId(), e); + } + } + + String path = task.getNewFilePath(); + if (path != null && !path.trim().isEmpty()) + { + try + { + Storage fs = StorageFactory.Instance().getStorage(path); + if (fs.exists(path)) + { + fs.delete(path, false); + } + } + catch (IOException e) + { + throw new RetinaException( + "WAL recovery failed to delete new physical file for taskId=" + + task.getTaskId() + ", path=" + path, e); + } + } + } + + private boolean isSwapCommitted(Task task) throws RetinaException + { + File newFile = loadOptionalFile(task.getNewFileId()); + if (newFile != null && newFile.getType() == File.Type.REGULAR) + { + return true; + } + for (Long oldFileId : task.getOldFileIds()) + { + File oldFile = loadOptionalFile(oldFileId); + if (oldFile != null && oldFile.getType() == File.Type.RETIRED) + { + return true; + } + } + return false; + } + + private File loadRequiredFile(Task task, long fileId, String role) throws RetinaException + { + File file = loadOptionalFile(fileId); + if (file == null) + { + throw new RetinaException( + "WAL recovery failed: missing " + role + + " fileId=" + fileId + " for taskId=" + task.getTaskId()); + } + return file; + } + + private File loadOptionalFile(long fileId) throws RetinaException + { + try + { + return metadataService.getFileById(fileId); + } + catch (MetadataException e) + { + throw new RetinaException( + "WAL recovery failed to load fileId=" + fileId, e); + } + } + + private void updateFile(Task task, File file, String action) throws RetinaException + { + try + { + if (!metadataService.updateFile(file)) + { + throw new RetinaException( + action + " returned false for 
fileId=" + file.getId()); + } + } + catch (MetadataException e) + { + throw new RetinaException( + "WAL recovery failed to " + action + + " for taskId=" + task.getTaskId(), e); + } + } + } +} diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestPixelsWriteBuffer.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestPixelsWriteBuffer.java index 3e6b4d9b6c..4eb9a0dd08 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestPixelsWriteBuffer.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestPixelsWriteBuffer.java @@ -21,18 +21,22 @@ import io.pixelsdb.pixels.common.metadata.domain.Path; import io.pixelsdb.pixels.core.TypeDescription; +import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; import io.pixelsdb.pixels.index.IndexProto; import org.junit.Before; import org.junit.Test; -import java.lang.management.ManagementFactory; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.Assert.assertEquals; + public class TestPixelsWriteBuffer { private List columnNames = new ArrayList<>(); @@ -44,29 +48,32 @@ public class TestPixelsWriteBuffer @Before public void setup() { + columnNames.clear(); + columnTypes.clear(); + columnNames.add("id"); + columnNames.add("name"); + columnTypes.add("int"); + columnTypes.add("int"); + schema = TypeDescription.createSchemaFromStrings(columnNames, columnTypes); + targetOrderDirPath = new Path(); targetOrderDirPath.setUri("file:///home/gengdy/data/tpch/1g/customer/v-0-ordered"); targetOrderDirPath.setId(1); // path id get from mysql `PATHS` table targetCompactDirPath = new Path(); targetCompactDirPath.setUri("file:///home/gengdy/data/tpch/1g/customer/v-0-compact"); targetCompactDirPath.setId(2); // get from mysql `PATHS` 
table + } + + @Test + public void testConcurrentWriteOperations() + { try { - columnNames.add("id"); - columnNames.add("name"); - columnTypes.add("int"); - columnTypes.add("int"); - - schema = TypeDescription.createSchemaFromStrings(columnNames, columnTypes); buffer = new PixelsWriteBuffer(0L, schema, targetOrderDirPath, targetCompactDirPath, "localhost", 0); // table id get from mysql `TBLS` table } catch (Exception e) { System.out.println("setup error: " + e); } - } - @Test - public void testConcurrentWriteOperations() - { // // print pid if you want to attach a profiler like async-profiler or YourKit // try @@ -114,10 +121,45 @@ public void testConcurrentWriteOperations() { completionLatch.await(); Thread.sleep(10000); // wait for async flush to complete - buffer.close(); - } catch (Exception e) - { - System.out.println("error: " + e); - } + buffer.close(); + } catch (Exception e) + { + System.out.println("error: " + e); } } + + @Test + public void appendedRowsAreImmediatelyVisibleAndAdvanceCommitTsBounds() throws Exception + { + // After removing the two-phase publish, append is the only step and a + // row is query-visible as soon as it returns. The hidden ts column + // bounds therefore cover all appended rows immediately, and serialize() + // returns the full row batch with no truncation. 
+ MemTable memTable = newMemTable(4); + + memTable.add(row(1), 10L); + assertEquals(1, memTable.getSize()); + assertEquals(1, VectorizedRowBatch.deserialize(memTable.serialize()).size); + assertEquals(10L, memTable.getMinCommitTs()); + assertEquals(10L, memTable.getMaxCommitTs()); + + memTable.add(row(2), 20L); + assertEquals(2, memTable.getSize()); + assertEquals(2, VectorizedRowBatch.deserialize(memTable.serialize()).size); + assertEquals(10L, memTable.getMinCommitTs()); + assertEquals(20L, memTable.getMaxCommitTs()); + } + + private static MemTable newMemTable(int size) + { + TypeDescription schema = TypeDescription.createSchemaFromStrings( + Arrays.asList("id"), Arrays.asList("int")); + return new MemTable(0L, schema, size, + TypeDescription.Mode.CREATE_INT_VECTOR_FOR_INT, 100L, 0, size); + } + + private static byte[][] row(int value) + { + return new byte[][] {ByteBuffer.allocate(Integer.BYTES).putInt(value).array()}; + } +} diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRecoveryCheckpoint.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRecoveryCheckpoint.java new file mode 100644 index 0000000000..cc37f7f694 --- /dev/null +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRecoveryCheckpoint.java @@ -0,0 +1,609 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. + * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Affero GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . 
+ */ +package io.pixelsdb.pixels.retina; + +import io.etcd.jetcd.ByteSequence; +import io.etcd.jetcd.KeyValue; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.utils.EtcdUtil; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.junit.Before; +import org.junit.Test; +import org.mockito.stubbing.Answer; + +import static org.junit.Assert.*; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link RecoveryCheckpoint}: the {@link RecoveryCheckpoint.Body} + * serialization codec, {@link RecoveryCheckpoint#generate(long, List, List)}'s + * write/publish contract, and {@link RecoveryCheckpoint#load()}'s fail-closed + * contract. + *

+ * The load() contract under test: only an absent pointer is a legitimate + * fresh-deployment signal ({@code null}); once the pointer exists, any unusable + * body — transient read error, empty/corrupted/truncated body, stale vnode + * mapping, mismatched node id, illegal timestamp — aborts recovery by throwing, + * never a silent fresh deployment. + */ +public class TestRecoveryCheckpoint +{ + private static final String NODE_ID = "host1:8080"; + private static final int VNODES = 4; + private static final String BODY_PATH = "checkpoint/recovery/host1/body-1"; + private static final String POINTER_KEY = "/pixels/retina/recovery/checkpoint/" + NODE_ID + "/current"; + + private Storage storage; + private EtcdUtil etcd; + private RecoveryCheckpoint checkpoint; + private List createdPaths; + private List createdBodies; + + @Before + public void setUp() + { + storage = mock(Storage.class); + etcd = mock(EtcdUtil.class); + checkpoint = new RecoveryCheckpoint(storage, "checkpoint/recovery", etcd, VNODES, NODE_ID); + createdPaths = new ArrayList<>(); + createdBodies = new ArrayList<>(); + } + + // =============================================================== + // Shared helpers + // =============================================================== + + private static RecoveryCheckpoint.Body.Builder baseBuilder() + { + return RecoveryCheckpoint.Body.builder() + .retinaNodeId(NODE_ID) + .writeTimeMs(1000L) + .checkpointAppliedTs(3000L) + .virtualNodesPerNode(VNODES); + } + + private static byte[] serialize(RecoveryCheckpoint.Body body) throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + body.writeTo(new DataOutputStream(baos)); + return baos.toByteArray(); + } + + private static String generatedBodyPath(long checkpointAppliedTs) + { + return "checkpoint/recovery/recovery_" + NODE_ID + "_" + checkpointAppliedTs + ".bin"; + } + + /** Assert every field of two Body objects is identical. 
*/ + private static void assertBodyEquals(RecoveryCheckpoint.Body expected, RecoveryCheckpoint.Body actual) + { + assertEquals(expected.getRetinaNodeId(), actual.getRetinaNodeId()); + assertEquals(expected.getWriteTimeMs(), actual.getWriteTimeMs()); + assertEquals(expected.getCheckpointAppliedTs(), actual.getCheckpointAppliedTs()); + assertEquals(expected.getVirtualNodesPerNode(), actual.getVirtualNodesPerNode()); + + List es = expected.getSegmentEntries(); + List as = actual.getSegmentEntries(); + assertEquals(es.size(), as.size()); + for (int i = 0; i < es.size(); i++) + { + assertEquals("segment[" + i + "].virtualNodeId", es.get(i).getVirtualNodeId(), as.get(i).getVirtualNodeId()); + assertEquals("segment[" + i + "].minCommitTs", es.get(i).getMinCommitTs(), as.get(i).getMinCommitTs()); + } + + List er = expected.getRgEntries(); + List ar = actual.getRgEntries(); + assertEquals(er.size(), ar.size()); + for (int i = 0; i < er.size(); i++) + { + assertEquals("rg[" + i + "].fileId", er.get(i).getFileId(), ar.get(i).getFileId()); + assertEquals("rg[" + i + "].rgId", er.get(i).getRgId(), ar.get(i).getRgId()); + assertEquals("rg[" + i + "].recordNum", er.get(i).getRecordNum(), ar.get(i).getRecordNum()); + assertEquals("rg[" + i + "].baseTimestamp", er.get(i).getBaseTimestamp(), ar.get(i).getBaseTimestamp()); + assertArrayEquals("rg[" + i + "].bitmap", er.get(i).getBitmap(), ar.get(i).getBitmap()); + } + } + + /** Serialize → deserialize → re-serialize: both byte arrays must be identical. */ + private static void assertSerializeStable(RecoveryCheckpoint.Body body) throws IOException, RetinaException + { + byte[] first = serialize(body); + byte[] second = serialize(RecoveryCheckpoint.Body.readFrom(first)); + assertArrayEquals("re-serialized bytes differ", first, second); + } + + /** Stub the etcd pointer to reference {@code value} (null/empty ⇒ absent). 
*/ + private void stubPointer(String value) + { + if (value == null) + { + when(etcd.getKeyValue(anyString())).thenReturn(null); + return; + } + KeyValue kv = mock(KeyValue.class); + when(kv.getValue()).thenReturn(ByteSequence.from(value, StandardCharsets.UTF_8)); + when(etcd.getKeyValue(anyString())).thenReturn(kv); + } + + private void stubBodyBytes(byte[] bytes) throws IOException + { + when(storage.open(anyString())).thenReturn(new DataInputStream(new ByteArrayInputStream(bytes))); + } + + private void stubCreateCapturingBody() throws IOException + { + when(storage.create(anyString(), eq(true), anyInt())).thenAnswer(captureCreate()); + } + + private Answer captureCreate() + { + return invocation -> { + createdPaths.add(invocation.getArgument(0)); + ByteArrayOutputStream bodyBytes = new ByteArrayOutputStream(); + createdBodies.add(bodyBytes); + return new DataOutputStream(bodyBytes); + }; + } + + private static void putInt(byte[] bytes, int offset, int value) + { + ByteBuffer.wrap(bytes).putInt(offset, value); + } + + private static void assertSegmentEntry(RecoveryCheckpoint.PendingSegmentEntry entry, + int virtualNodeId, + long minCommitTs) + { + assertEquals(virtualNodeId, entry.getVirtualNodeId()); + assertEquals(minCommitTs, entry.getMinCommitTs()); + } + + private static void assertRgEntry(RecoveryCheckpoint.VisibilityEntry entry, + long fileId, + int rgId, + int recordNum, + long baseTimestamp, + long[] bitmap) + { + assertEquals(fileId, entry.getFileId()); + assertEquals(rgId, entry.getRgId()); + assertEquals(recordNum, entry.getRecordNum()); + assertEquals(baseTimestamp, entry.getBaseTimestamp()); + assertArrayEquals(bitmap, entry.getBitmap()); + } + + // =============================================================== + // Body codec — roundtrip + // =============================================================== + + @Test + public void testRoundtrip_emptyBody() throws IOException, RetinaException + { + RecoveryCheckpoint.Body original = 
baseBuilder().build(); + assertBodyEquals(original, RecoveryCheckpoint.Body.readFrom(serialize(original))); + assertSerializeStable(original); + } + + @Test + public void testRoundtrip_full() throws IOException, RetinaException + { + RecoveryCheckpoint.Body original = baseBuilder() + .segmentEntries(Arrays.asList( + new RecoveryCheckpoint.PendingSegmentEntry(1, 500L), + new RecoveryCheckpoint.PendingSegmentEntry(2, 600L))) + .rgEntries(Arrays.asList( + new RecoveryCheckpoint.VisibilityEntry(1L, 0, 100, 1000L, new long[]{0xFFL}), + new RecoveryCheckpoint.VisibilityEntry(2L, 1, 200, 2000L, new long[]{0xAAL, 0xBBL}), + new RecoveryCheckpoint.VisibilityEntry(3L, 2, 300, 3000L, new long[]{}))) + .build(); + assertBodyEquals(original, RecoveryCheckpoint.Body.readFrom(serialize(original))); + assertSerializeStable(original); + } + + @Test + public void testRoundtrip_nullBitmap_treatedAsEmpty() throws IOException, RetinaException + { + // null bitmap is normalised to empty array on write; verify the parsed result reflects that + RecoveryCheckpoint.Body original = baseBuilder().rgEntries(Collections.singletonList( + new RecoveryCheckpoint.VisibilityEntry(1L, 0, 0, 1000L, null) + )).build(); + RecoveryCheckpoint.Body parsed = RecoveryCheckpoint.Body.readFrom(serialize(original)); + assertArrayEquals(new long[0], parsed.getRgEntries().get(0).getBitmap()); + assertSerializeStable(parsed); + } + + /** Multi-word bitmaps and a multibyte (non-ASCII) nodeId must survive a roundtrip. 
*/ + @Test + public void testRoundtrip_multiWordBitmapAndUnicodeNodeId() throws IOException, RetinaException + { + RecoveryCheckpoint.Body original = RecoveryCheckpoint.Body.builder() + .retinaNodeId("节点-host①:8080") + .writeTimeMs(1L) + .checkpointAppliedTs(2L) + .virtualNodesPerNode(7) + .rgEntries(Collections.singletonList(new RecoveryCheckpoint.VisibilityEntry( + 99L, 5, 4096, 123456789L, + new long[]{-1L, 0L, 0x1234_5678_9ABC_DEF0L, Long.MIN_VALUE, Long.MAX_VALUE}))) + .build(); + assertBodyEquals(original, RecoveryCheckpoint.Body.readFrom(serialize(original))); + } + + /** Many entries exercise the count fields and ordering of the codec. */ + @Test + public void testRoundtrip_manyEntries() throws IOException, RetinaException + { + List segs = new ArrayList<>(); + List rgs = new ArrayList<>(); + for (int i = 0; i < 256; i++) + { + segs.add(new RecoveryCheckpoint.PendingSegmentEntry(i % 8, 1000L + i)); + rgs.add(new RecoveryCheckpoint.VisibilityEntry(i, i % 4, i, 2000L + i, new long[]{(long) i})); + } + RecoveryCheckpoint.Body original = baseBuilder().segmentEntries(segs).rgEntries(rgs).build(); + assertBodyEquals(original, RecoveryCheckpoint.Body.readFrom(serialize(original))); + } + + // =============================================================== + // Body codec — corruption is rejected + // =============================================================== + + @Test + public void testReadFrom_emptyBytes_throws() + { + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(new byte[0])); + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(null)); + } + + @Test + public void testReadFrom_badMagic_throws() + { + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(new byte[64])); + } + + /** + * A truncated body (e.g. a half-written body from a power loss) must be + * rejected, not silently parsed. 
This underpins load()'s fail-closed + * contract: corruption is never mistaken for "no checkpoint". + */ + @Test + public void testReadFrom_truncatedBody_throws() throws IOException + { + byte[] full = serialize(baseBuilder() + .rgEntries(Collections.singletonList( + new RecoveryCheckpoint.VisibilityEntry(1L, 0, 100, 1000L, new long[]{0xFFL, 0xEEL}))) + .build()); + // Drop the trailing bytes so the final bitmap long is incomplete. + for (int cut = 1; cut <= 8; cut++) + { + byte[] truncated = Arrays.copyOf(full, full.length - cut); + assertThrows("truncation of " + cut + " byte(s) must be rejected", + RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(truncated)); + } + } + + /** Trailing bytes after a valid payload are corruption and must be rejected. */ + @Test + public void testReadFrom_trailingBytes_throws() throws IOException + { + byte[] full = serialize(baseBuilder().build()); + byte[] padded = Arrays.copyOf(full, full.length + 4); // 4 trailing zero bytes + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(padded)); + } + + /** A header that declares a wildly large nodeId length must not allocate; it must throw. */ + @Test + public void testReadFrom_oversizedNodeIdLen_throws() throws IOException + { + byte[] full = serialize(baseBuilder().build()); + // Layout: magic(4) writeTimeMs(8) checkpointAppliedTs(8) vnodes(4) segCount(4) rgCount(4) then nodeIdLen(4). 
+ byte[] corrupt = full.clone(); + int nodeIdLenOffset = 4 + 8 + 8 + 4 + 4 + 4; + putInt(corrupt, nodeIdLenOffset, Integer.MAX_VALUE); + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(corrupt)); + } + + @Test + public void testReadFrom_negativeEntryCount_throws() throws IOException + { + byte[] full = serialize(baseBuilder().build()); + byte[] corrupt = full.clone(); + int segmentCountOffset = 4 + 8 + 8 + 4; + putInt(corrupt, segmentCountOffset, -1); + assertThrows(RetinaException.class, () -> RecoveryCheckpoint.Body.readFrom(corrupt)); + } + + // =============================================================== + // Body builder — edge cases + // =============================================================== + + @Test + public void testBuilder_missingOrEmptyNodeId_throws() + { + assertThrows(IllegalArgumentException.class, () -> + RecoveryCheckpoint.Body.builder() + .writeTimeMs(1000L) + .checkpointAppliedTs(3000L) + .virtualNodesPerNode(4) + .build()); + assertThrows(IllegalArgumentException.class, () -> + RecoveryCheckpoint.Body.builder() + .retinaNodeId("") + .build()); + } + + @Test + public void testBuilder_defaultsToEmptyLists() + { + RecoveryCheckpoint.Body body = baseBuilder().build(); + assertTrue(body.getSegmentEntries().isEmpty()); + assertTrue(body.getRgEntries().isEmpty()); + } + + /** Entry lists are defensively copied: mutating the source after build must not affect the body. */ + @Test + public void testBuilder_defensiveCopy() + { + List source = new ArrayList<>(); + source.add(new RecoveryCheckpoint.VisibilityEntry(1L, 0, 0, 0L, new long[0])); + RecoveryCheckpoint.Body body = baseBuilder().rgEntries(source).build(); + source.add(new RecoveryCheckpoint.VisibilityEntry(2L, 0, 0, 0L, new long[0])); + assertEquals(1, body.getRgEntries().size()); + } + + /** Exposed entry lists are immutable. 
*/ + @Test + public void testBody_listsAreUnmodifiable() + { + RecoveryCheckpoint.Body body = baseBuilder().build(); + assertThrows(UnsupportedOperationException.class, () -> + body.getRgEntries().add(new RecoveryCheckpoint.VisibilityEntry(1L, 0, 0, 0L, new long[0]))); + assertThrows(UnsupportedOperationException.class, () -> + body.getSegmentEntries().add(new RecoveryCheckpoint.PendingSegmentEntry(0, 0L))); + } + + // =============================================================== + // load() — only an absent pointer is a fresh-deployment signal + // =============================================================== + + @Test + public void testLoad_pointerAbsentOrEmpty_returnsNull() throws RetinaException + { + stubPointer(null); + assertNull(checkpoint.load()); + + stubPointer(""); // empty value is treated as absent by readPointer + assertNull(checkpoint.load()); + } + + // =============================================================== + // load() — happy path + // =============================================================== + + @Test + public void testLoad_validBody_returnsLoaded() throws IOException, RetinaException + { + stubPointer(BODY_PATH); + stubBodyBytes(serialize(baseBuilder() + .rgEntries(Collections.singletonList( + new RecoveryCheckpoint.VisibilityEntry(7L, 0, 10, 2500L, new long[]{0x1L}))) + .build())); + + RecoveryCheckpoint.LoadedCheckpoint loaded = checkpoint.load(); + assertNotNull(loaded); + assertEquals(BODY_PATH, loaded.bodyPath); + assertEquals(NODE_ID, loaded.body.getRetinaNodeId()); + assertEquals(3000L, loaded.body.getCheckpointAppliedTs()); + assertEquals(1, loaded.body.getRgEntries().size()); + } + + // =============================================================== + // load() — fail-closed: pointer exists but body is unusable + // =============================================================== + + /** Transient read failure must abort recovery, not fresh-deploy. 
*/ + @Test + public void testLoad_readThrows_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + when(storage.open(anyString())).thenThrow(new IOException("transient S3 read error")); + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + /** An empty body file is treated as a read failure, not "no checkpoint". */ + @Test + public void testLoad_emptyBodyFile_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + stubBodyBytes(new byte[0]); + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + /** A corrupted/half-written body must abort recovery. */ + @Test + public void testLoad_corruptBody_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + stubBodyBytes(new byte[]{1, 2, 3, 4, 5, 6, 7, 8}); // bad magic / truncated + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + /** A body written under a different node.virtual.num must abort recovery. */ + @Test + public void testLoad_vnodeMismatch_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + stubBodyBytes(serialize(baseBuilder().virtualNodesPerNode(VNODES + 1).build())); + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + /** A body belonging to a different retina node must abort recovery. */ + @Test + public void testLoad_nodeIdMismatch_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + stubBodyBytes(serialize(baseBuilder().retinaNodeId("other-host:9090").build())); + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + /** A body with a negative checkpointAppliedTs is corruption, not a fresh signal. 
*/ + @Test + public void testLoad_negativeCheckpointTs_failsClosed() throws IOException + { + stubPointer(BODY_PATH); + stubBodyBytes(serialize(baseBuilder().checkpointAppliedTs(-1L).build())); + assertThrows(RetinaException.class, () -> checkpoint.load()); + } + + // =============================================================== + // generate() — write body and publish pointer + // =============================================================== + + @Test + public void testGenerate_writesSortedBodyAndPublishesPointer() throws Exception + { + stubPointer(null); + stubCreateCapturingBody(); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4000L)))).thenReturn(true); + + List rgs = new ArrayList<>(Arrays.asList( + new RecoveryCheckpoint.VisibilityEntry(20L, 3, 30, 2200L, new long[]{0x20L}), + new RecoveryCheckpoint.VisibilityEntry(10L, 2, 20, 2100L, new long[]{0x10L}), + new RecoveryCheckpoint.VisibilityEntry(10L, 1, 10, 2000L, new long[]{0x01L}))); + List segments = new ArrayList<>(Arrays.asList( + new RecoveryCheckpoint.PendingSegmentEntry(2, 900L), + new RecoveryCheckpoint.PendingSegmentEntry(7, 370L), + new RecoveryCheckpoint.PendingSegmentEntry(1, 310L))); + + checkpoint.generate(4000L, rgs, segments); + + assertEquals(Collections.singletonList(generatedBodyPath(4000L)), createdPaths); + verify(etcd).compareAndPut(POINTER_KEY, null, generatedBodyPath(4000L)); + verify(storage, never()).delete(anyString(), eq(false)); + + RecoveryCheckpoint.Body written = RecoveryCheckpoint.Body.readFrom(createdBodies.get(0).toByteArray()); + assertEquals(NODE_ID, written.getRetinaNodeId()); + assertEquals(4000L, written.getCheckpointAppliedTs()); + assertEquals(VNODES, written.getVirtualNodesPerNode()); + assertTrue("writeTimeMs should be populated", written.getWriteTimeMs() > 0L); + + assertSegmentEntry(written.getSegmentEntries().get(0), 1, 310L); + assertSegmentEntry(written.getSegmentEntries().get(1), 2, 900L); + 
assertSegmentEntry(written.getSegmentEntries().get(2), 7, 370L); + assertRgEntry(written.getRgEntries().get(0), 10L, 1, 10, 2000L, new long[]{0x01L}); + assertRgEntry(written.getRgEntries().get(1), 10L, 2, 20, 2100L, new long[]{0x10L}); + assertRgEntry(written.getRgEntries().get(2), 20L, 3, 30, 2200L, new long[]{0x20L}); + } + + @Test + public void testGenerate_deletesDisplacedBodyAfterPublish() throws Exception + { + stubPointer(BODY_PATH); + stubCreateCapturingBody(); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(BODY_PATH), eq(generatedBodyPath(4001L)))).thenReturn(true); + + checkpoint.generate(4001L, new ArrayList<>(), new ArrayList<>()); + + verify(etcd).compareAndPut(POINTER_KEY, BODY_PATH, generatedBodyPath(4001L)); + verify(storage).delete(BODY_PATH, false); + } + + @Test + public void testGenerate_deleteDisplacedBodyFailure_doesNotFailPublish() throws Exception + { + stubPointer(BODY_PATH); + stubCreateCapturingBody(); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(BODY_PATH), eq(generatedBodyPath(4002L)))).thenReturn(true); + when(storage.delete(BODY_PATH, false)).thenThrow(new IOException("delete failed")); + + checkpoint.generate(4002L, new ArrayList<>(), new ArrayList<>()); + + verify(storage, times(1)).create(anyString(), eq(true), anyInt()); + verify(etcd, times(1)).compareAndPut(eq(POINTER_KEY), eq(BODY_PATH), eq(generatedBodyPath(4002L))); + verify(storage).delete(BODY_PATH, false); + } + + @Test + public void testGenerate_sameTimestampSkipsAfterSuccessfulPublish() throws Exception + { + stubPointer(null); + stubCreateCapturingBody(); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4003L)))).thenReturn(true); + + checkpoint.generate(4003L, new ArrayList<>(), new ArrayList<>()); + checkpoint.generate(4003L, new ArrayList<>(Collections.singletonList( + new RecoveryCheckpoint.VisibilityEntry(1L, 0, 0, 0L, new long[0]))), new ArrayList<>()); + + verify(storage, times(1)).create(anyString(), eq(true), anyInt()); + verify(etcd, 
times(1)).compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4003L))); + assertEquals(1, createdBodies.size()); + } + + @Test + public void testGenerate_writeFailureDoesNotPublishOrAdvanceTimestamp() throws Exception + { + stubPointer(null); + when(storage.create(anyString(), eq(true), anyInt())) + .thenThrow(new IOException("write failed")) + .thenAnswer(captureCreate()); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4004L)))).thenReturn(true); + + assertThrows(RetinaException.class, + () -> checkpoint.generate(4004L, new ArrayList<>(), new ArrayList<>())); + checkpoint.generate(4004L, new ArrayList<>(), new ArrayList<>()); + + verify(storage, times(2)).create(anyString(), eq(true), anyInt()); + verify(etcd, times(1)).compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4004L))); + } + + @Test + public void testGenerate_casFailureDoesNotAdvanceTimestamp() throws Exception + { + stubPointer(null); + stubCreateCapturingBody(); + when(etcd.compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4005L)))) + .thenReturn(false) + .thenReturn(true); + + assertThrows(RetinaException.class, + () -> checkpoint.generate(4005L, new ArrayList<>(), new ArrayList<>())); + checkpoint.generate(4005L, new ArrayList<>(), new ArrayList<>()); + + verify(storage, times(2)).create(anyString(), eq(true), anyInt()); + verify(etcd, times(2)).compareAndPut(eq(POINTER_KEY), eq(null), eq(generatedBodyPath(4005L))); + } +} diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java index 15ba28ce14..87e6adec15 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaCheckpoint.java @@ -22,33 +22,23 @@ import io.pixelsdb.pixels.common.exception.RetinaException; import io.pixelsdb.pixels.common.physical.Storage; import 
io.pixelsdb.pixels.common.physical.StorageFactory; -import io.pixelsdb.pixels.common.utils.CheckpointFileIO; import io.pixelsdb.pixels.common.utils.ConfigFactory; import io.pixelsdb.pixels.common.utils.RetinaUtils; import org.junit.Before; import org.junit.Test; import java.io.DataInputStream; -import java.io.DataOutputStream; import java.io.IOException; import java.lang.reflect.Field; -import java.lang.reflect.Method; import java.net.InetAddress; -import java.util.Arrays; -import java.util.HashMap; import java.util.Map; -import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.ThreadLocalRandom; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; /** @@ -67,7 +57,7 @@ public class TestRetinaCheckpoint @Before public void setUp() throws IOException, RetinaException { - testCheckpointDir = ConfigFactory.Instance().getProperty("retina.checkpoint.dir"); + testCheckpointDir = ConfigFactory.Instance().getProperty("retina.offload.checkpoint.dir"); storage = StorageFactory.Instance().getStorage(testCheckpointDir); if (!storage.exists(testCheckpointDir)) @@ -104,10 +94,6 @@ private String getOffloadFileName(long timestamp) { return RetinaUtils.getCheckpointFileName(RetinaUtils.CHECKPOINT_PREFIX_OFFLOAD, hostName, timestamp); } - private String getGcFileName(long timestamp) { - return RetinaUtils.getCheckpointFileName(RetinaUtils.CHECKPOINT_PREFIX_GC, hostName, timestamp); - } - @Test public void testRegisterOffload() throws RetinaException, IOException { @@ -162,69 +148,6 @@ public void testMultipleOffloads() throws RetinaException, IOException 
System.out.println("Verified: Checkpoint removed after final unregister. testMultipleOffloads passed."); } - @Test - public void testCheckpointRecovery() throws RetinaException, IOException - { - System.out.println("\n[Test] Starting testCheckpointRecovery..."); - retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); - long timestamp = 100L; - - // 1. Delete row 10 - int rowToDelete = 10; - System.out.println("Deleting row " + rowToDelete + " in memory..."); - retinaManager.deleteRecord(fileId, rgId, rowToDelete, timestamp); - - // Verify deleted in memory - long[] memBitmap = retinaManager.queryVisibility(fileId, rgId, timestamp); - assertTrue("Row 10 should be deleted in memory", isBitSet(memBitmap, rowToDelete)); - - // 2. Register Offload to generate checkpoint file - System.out.println("Creating checkpoint on disk..."); - retinaManager.registerOffload(timestamp); - String offloadPath = resolve(testCheckpointDir, getOffloadFileName(timestamp)); - assertTrue("Checkpoint file should exist", storage.exists(offloadPath)); - - // 3. Rename offload file to GC file to simulate checkpoint generated by GC - String gcPath = resolve(testCheckpointDir, getGcFileName(timestamp)); - System.out.println("Simulating GC checkpoint by renaming offload file to: " + gcPath); - // Storage interface doesn't have rename, using copy and delete - try (DataInputStream in = storage.open(offloadPath); - DataOutputStream out = storage.create(gcPath, true, 4096)) - { - byte[] buffer = new byte[4096]; - int bytesRead; - while ((bytesRead = in.read(buffer)) != -1) - { - out.write(buffer, 0, bytesRead); - } - } - storage.delete(offloadPath, false); - - // 4. Reset singleton state (Simulate Crash/Restart) - System.out.println("Simulating system restart (resetting memory state)..."); - resetSingletonState(); - - // 5. 
Perform recovery - System.out.println("Running recoverCheckpoints()..."); - // At this point rgVisibilityMap is empty, recoverCheckpoints will load data directly into rgVisibilityMap - retinaManager.recoverCheckpoints(); - - // 6. Verify recovered state immediately after recovery - System.out.println("Verifying recovered state immediately after recoverCheckpoints()..."); - long[] recoveredBitmap = retinaManager.queryVisibility(fileId, rgId, timestamp); - assertTrue("Row 10 should be deleted after recovery", isBitSet(recoveredBitmap, rowToDelete)); - assertFalse("Row 11 should not be deleted", isBitSet(recoveredBitmap, rowToDelete + 1)); - - // 7. Re-add Visibility, at this point it should see that it already exists in rgVisibilityMap - System.out.println("Re-adding visibility for file (should skip as it already exists)..."); - retinaManager.addVisibility(fileId, rgId, numRows, 0L, null, false); - - // 8. Verify state still correct - long[] finalBitmap = retinaManager.queryVisibility(fileId, rgId, timestamp); - assertTrue("Row 10 should still be deleted", isBitSet(finalBitmap, rowToDelete)); - System.out.println("Verified: Recovery successful, row state restored directly to map. testCheckpointRecovery passed."); - } - @Test public void testCheckpointRetryAfterFailure() throws RetinaException, IOException { @@ -260,51 +183,6 @@ public void testCheckpointRetryAfterFailure() throws RetinaException, IOExceptio System.out.println("Verified: Retry successful. 
testCheckpointRetryAfterFailure passed."); } - @Test - public void testMultiRGCheckpoint() throws RetinaException, IOException - { - System.out.println("\n[Test] Starting testMultiRGCheckpoint..."); - int numRgs = 3; - for (int i = 0; i < numRgs; i++) - { - retinaManager.addVisibility(fileId, i, numRows, 0L, null, false); - } - long timestamp = 200L; - - // Delete records in different RGs - retinaManager.deleteRecord(fileId, 0, 10, timestamp); - retinaManager.deleteRecord(fileId, 1, 20, timestamp); - retinaManager.deleteRecord(fileId, 2, 30, timestamp); - - // Create checkpoint - retinaManager.registerOffload(timestamp); - String offloadPath = resolve(testCheckpointDir, getOffloadFileName(timestamp)); - - // Simulating GC checkpoint for recovery - String gcPath = resolve(testCheckpointDir, getGcFileName(timestamp)); - try (DataInputStream in = storage.open(offloadPath); - DataOutputStream out = storage.create(gcPath, true, 4096)) - { - byte[] buffer = new byte[4096]; - int bytesRead; - while ((bytesRead = in.read(buffer)) != -1) - { - out.write(buffer, 0, bytesRead); - } - } - - // Reset and recover - resetSingletonState(); - retinaManager.recoverCheckpoints(); - - // Verify all RGs - assertTrue("RG 0 row 10 should be deleted", isBitSet(retinaManager.queryVisibility(fileId, 0, timestamp), 10)); - assertTrue("RG 1 row 20 should be deleted", isBitSet(retinaManager.queryVisibility(fileId, 1, timestamp), 20)); - assertTrue("RG 2 row 30 should be deleted", isBitSet(retinaManager.queryVisibility(fileId, 2, timestamp), 30)); - - System.out.println("Verified: Multi-RG state correctly restored. testMultiRGCheckpoint passed."); - } - @Test public void testCheckpointDataIntegrity() throws RetinaException, IOException { @@ -393,132 +271,6 @@ else if (j % 3 == 1) assertFalse("Errors occurred during concurrency test", errorOccurred.get()); } - @Test - public void testCheckpointPerformance() throws RetinaException, IOException, InterruptedException - { - // 1. 
Performance Test Configuration - double targetDeleteRatio = 0.0; // @TARGET_DELETE_RATIO@ - int numFiles = 50000; - int rowsPerRg = 200000; - long totalRows = (long) numFiles * rowsPerRg; - long timestamp = System.currentTimeMillis(); - - System.out.printf("Target Delete Ratio: %.2f%%%n", targetDeleteRatio * 100); - System.out.printf("Total Rows: %,d%n", totalRows); - - // 2. Populate Visibility Data - System.out.println("[Perf] Populating visibility data..."); - for (int i = 0; i < numFiles; i++) - { - retinaManager.addVisibility(i, 0, rowsPerRg, 0L, null, false); - } - - // 3. Delete Records based on Ratio - System.out.println("[Perf] Deleting records..."); - long totalDeleted = 0; - if (targetDeleteRatio > 0) - { - // Delete contiguous block for performance stability - int rowsToDeletePerRg = (int) (rowsPerRg * targetDeleteRatio); - for (int i = 0; i < numFiles; i++) - { - // Delete rows 0 to rowsToDeletePerRg - 1 - for (int j = 0; j < rowsToDeletePerRg; j++) - { - retinaManager.deleteRecord(i, 0, j, timestamp); - } - totalDeleted += rowsToDeletePerRg; - } - } - double actualRatio = (double) totalDeleted / totalRows; - System.out.printf("Actual Ratio: %.2f%%%n", actualRatio * 100); - - // Measure Memory before Offload - System.gc(); - Thread.sleep(1000); - long memBeforeOffload = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); - - // 4. 
Register Offload (Checkpoint Creation) - System.out.println("[Perf] Starting Offload..."); - long startOffload = System.nanoTime(); - retinaManager.registerOffload(timestamp); - long endOffload = System.nanoTime(); - double offloadTimeMs = (endOffload - startOffload) / 1_000_000.0; - System.out.printf("Total Offload Time: %.2f ms%n", offloadTimeMs); - - // Measure Peak Memory (Approximation: Current - Before) - long memAfterOffload = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); - double peakMemMb = Math.max(0, (memAfterOffload - memBeforeOffload) / (1024.0 * 1024.0)); - System.out.printf("Offload Peak Mem Overhead: %.2f MB%n", peakMemMb); - - // File Size - String checkpointPath = resolve(testCheckpointDir, getOffloadFileName(timestamp)); - long fileSizeBytes = storage.getStatus(checkpointPath).getLength(); - double fileSizeMb = fileSizeBytes / (1024.0 * 1024.0); - System.out.printf("Checkpoint File Size: %.2f MB%n", fileSizeMb); - - // Write Throughput - double writeThroughput = fileSizeMb / (offloadTimeMs / 1000.0); - System.out.printf("Write Throughput: %.2f MB/s%n", writeThroughput); - - // 5. 
Simulate System Restart (Cold Load) - System.out.println("[Perf] Simulating restart..."); - // Rename to GC file to simulate persisted state - String gcPath = resolve(testCheckpointDir, getGcFileName(timestamp)); - // Simple copy since no rename - try (DataInputStream in = storage.open(checkpointPath); - DataOutputStream out = storage.create(gcPath, true, 8 * 1024 * 1024)) - { - byte[] buffer = new byte[64 * 1024]; // 64KB copy buffer - int bytesRead; - while ((bytesRead = in.read(buffer)) != -1) - { - out.write(buffer, 0, bytesRead); - } - } - storage.delete(checkpointPath, false); - - resetSingletonState(); - System.gc(); - Thread.sleep(1000); - long memBeforeLoad = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); - - // Recover - long startLoad = System.nanoTime(); - retinaManager.recoverCheckpoints(); - long endLoad = System.nanoTime(); - double loadTimeMs = (endLoad - startLoad) / 1_000_000.0; - System.out.printf("First Load Time (Cold): %.2f ms%n", loadTimeMs); - - // Load Memory Overhead - long memAfterLoad = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); - double loadMemMb = Math.max(0, (memAfterLoad - memBeforeLoad) / (1024.0 * 1024.0)); - System.out.printf("Load Memory Overhead: %.2f MB%n", loadMemMb); - - // Read Throughput - double readThroughput = fileSizeMb / (loadTimeMs / 1000.0); - System.out.printf("Read/Parse Throughput: %.2f MB/s%n", readThroughput); - - // 6. 
Avg Memory Hit Latency - System.out.println("[Perf] Measuring Memory Hit Latency..."); - long totalLatencyNs = 0; - int latencySamples = 10000; - for (int i = 0; i < latencySamples; i++) - { - // Random file query - long randomFileId = ThreadLocalRandom.current().nextInt(numFiles); - long startQuery = System.nanoTime(); - retinaManager.queryVisibility(randomFileId, 0, timestamp); - long endQuery = System.nanoTime(); - totalLatencyNs += (endQuery - startQuery); - } - double avgLatencyMs = (totalLatencyNs / (double) latencySamples) / 1_000_000.0; - System.out.printf("Avg Memory Hit Latency: %.4f ms%n", avgLatencyMs); - - // Cleanup - storage.delete(gcPath, false); - } - /** * Use reflection to reset internal state of RetinaResourceManager, simulating a restart. */ @@ -534,13 +286,9 @@ private void resetSingletonState() bufferMapField.setAccessible(true); ((Map) bufferMapField.get(retinaManager)).clear(); - Field offloadedField = RetinaResourceManager.class.getDeclaredField("offloadedCheckpoints"); - offloadedField.setAccessible(true); - ((Map) offloadedField.get(retinaManager)).clear(); - - Field refCountsField = RetinaResourceManager.class.getDeclaredField("checkpointRefCounts"); - refCountsField.setAccessible(true); - ((Map) refCountsField.get(retinaManager)).clear(); + Field offloadCheckpointsField = RetinaResourceManager.class.getDeclaredField("offloadCheckpoints"); + offloadCheckpointsField.setAccessible(true); + ((Map) offloadCheckpointsField.get(retinaManager)).clear(); Field gcTimestampField = RetinaResourceManager.class.getDeclaredField("latestGcTimestamp"); gcTimestampField.setAccessible(true); @@ -564,166 +312,4 @@ private boolean isBitSet(long[] bitmap, int rowIndex) return (bitmap[longIndex] & (1L << bitOffset)) != 0; } - // ----------------------------------------------------------------------- - // GC checkpoint: completeness + bitmap correctness - // ----------------------------------------------------------------------- - - /** - * Creates a {@code 
long[]} GC snapshot bitmap for one RG where exactly {@code deletedRows} - * out of {@code totalRows} rows are marked deleted (rows 0..deletedRows-1 are set). - */ - private static long[] makeBitmap(int totalRows, int deletedRows) - { - int words = (totalRows + 63) / 64; - long[] bitmap = new long[words]; - for (int r = 0; r < deletedRows; r++) - { - bitmap[r / 64] |= (1L << (r % 64)); - } - return bitmap; - } - - /** - * Calls {@code RetinaResourceManager.createCheckpoint(ts, CheckpointType.GC, bitmaps)} - * via reflection and blocks until the write completes. - */ - @SuppressWarnings("unchecked") - private void invokeCreateGCCheckpoint(long ts, Map bitmaps) throws Exception - { - // Locate the private CheckpointType enum class - Class cpTypeClass = Arrays.stream(RetinaResourceManager.class.getDeclaredClasses()) - .filter(c -> c.getSimpleName().equals("CheckpointType")) - .findFirst() - .orElseThrow(() -> new RuntimeException("CheckpointType enum not found")); - - // Get the GC constant - Object gcConstant = Arrays.stream(cpTypeClass.getEnumConstants()) - .filter(e -> e.toString().equals("GC")) - .findFirst() - .orElseThrow(() -> new RuntimeException("CheckpointType.GC not found")); - - // Get the overloaded createCheckpoint(long, CheckpointType, Map) method - Method method = RetinaResourceManager.class.getDeclaredMethod( - "createCheckpoint", long.class, cpTypeClass, Map.class); - method.setAccessible(true); - - CompletableFuture future = (CompletableFuture) method.invoke( - retinaManager, ts, gcConstant, bitmaps); - future.join(); - } - - /** - * Verifies that a GC checkpoint written with a full {@code gcSnapshotBitmaps} map - * contains ALL RG entries — including those that would not be selected as Storage GC - * candidates — because the checkpoint is written before S1 scanning begins. - * - *

<p>Setup: 3 files in {@code rgVisibilityMap}:
- * <ul>
- *   <li>File A: 80 % deleted (would be a candidate)</li>
- *   <li>File B: 60 % deleted (would be a candidate)</li>
- *   <li>File C: 20 % deleted (non-candidate)</li>
- * </ul>
- *
- * <p>

Expected: checkpoint rgCount = 3; all three entries present with correct - * {@code recordNum} and bitmap content. - */ - @Test - public void testGCCheckpoint_containsAllRGs() throws Exception - { - final long fileIdA = 77001L; - final long fileIdB = 77002L; - final long fileIdC = 77003L; - final int rows = 100; - final long safeGcTs = 500L; - - retinaManager.addVisibility(fileIdA, 0, rows, 0L, null, false); - retinaManager.addVisibility(fileIdB, 0, rows, 0L, null, false); - retinaManager.addVisibility(fileIdC, 0, rows, 0L, null, false); - - long[] bitmapA = makeBitmap(rows, 80); - long[] bitmapB = makeBitmap(rows, 60); - long[] bitmapC = makeBitmap(rows, 20); - - Map gcBitmaps = new HashMap<>(); - gcBitmaps.put(fileIdA + "_0", bitmapA); - gcBitmaps.put(fileIdB + "_0", bitmapB); - gcBitmaps.put(fileIdC + "_0", bitmapC); - - invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); - - String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); - assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); - - Map entries = new HashMap<>(); - int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, - e -> entries.put(e.fileId + "_" + e.rgId, e)); - - assertEquals("checkpoint must contain all 3 RGs (not just candidates)", 3, rgCount); - assertEquals("entries map size must be 3", 3, entries.size()); - - CheckpointFileIO.CheckpointEntry entA = entries.get(fileIdA + "_0"); - assertNotNull("fileIdA must be present", entA); - assertEquals("fileIdA recordNum", rows, entA.recordNum); - assertArrayEquals("fileIdA bitmap must match", bitmapA, entA.bitmap); - - CheckpointFileIO.CheckpointEntry entB = entries.get(fileIdB + "_0"); - assertNotNull("fileIdB must be present", entB); - assertEquals("fileIdB recordNum", rows, entB.recordNum); - assertArrayEquals("fileIdB bitmap must match", bitmapB, entB.bitmap); - - CheckpointFileIO.CheckpointEntry entC = entries.get(fileIdC + "_0"); - assertNotNull("fileIdC (non-candidate) must be present", entC); - assertEquals("fileIdC 
recordNum", rows, entC.recordNum); - assertArrayEquals("fileIdC bitmap must match", bitmapC, entC.bitmap); - } - - /** - * Verifies that the GC checkpoint bitmap content faithfully matches the - * {@code gcSnapshotBitmaps} passed to {@code createCheckpoint}: each word of each - * per-RG bitmap must be preserved exactly, with no cross-RG contamination. - * - *

<p>Uses a 2-RG file with deliberately complementary bitmaps:
- * <ul>
- *   <li>RG 0: first word all-ones ({@code rows 0-63} deleted), second word zero</li>
- *   <li>RG 1: first word zero, second word all-ones ({@code rows 64-127} deleted)</li>
- * </ul>
- */ - @Test - public void testGCCheckpoint_bitmapContentIsExact() throws Exception - { - final long fileId = 88001L; - final int rows = 128; // 2 words per RG - final long safeGcTs = 600L; - - retinaManager.addVisibility(fileId, 0, rows, 0L, null, false); - retinaManager.addVisibility(fileId, 1, rows, 0L, null, false); - - long[] bitmapRg0 = new long[]{-1L, 0L}; // rows 0-63 deleted - long[] bitmapRg1 = new long[]{0L, -1L}; // rows 64-127 deleted - - Map gcBitmaps = new HashMap<>(); - gcBitmaps.put(fileId + "_0", bitmapRg0); - gcBitmaps.put(fileId + "_1", bitmapRg1); - - invokeCreateGCCheckpoint(safeGcTs, gcBitmaps); - - String cpPath = resolve(testCheckpointDir, getGcFileName(safeGcTs)); - assertTrue("GC checkpoint file must exist", storage.exists(cpPath)); - - Map entries = new HashMap<>(); - int rgCount = CheckpointFileIO.readCheckpointParallel(cpPath, - e -> entries.put(e.fileId + "_" + e.rgId, e)); - - assertEquals("checkpoint must contain 2 RGs", 2, rgCount); - - CheckpointFileIO.CheckpointEntry rg0 = entries.get(fileId + "_0"); - assertNotNull("RG 0 must be present", rg0); - assertEquals("RG 0 word 0 must be all-ones (rows 0-63 deleted)", -1L, rg0.bitmap[0]); - assertEquals("RG 0 word 1 must be zero (rows 64-127 live)", 0L, rg0.bitmap[1]); - - CheckpointFileIO.CheckpointEntry rg1 = entries.get(fileId + "_1"); - assertNotNull("RG 1 must be present", rg1); - assertEquals("RG 1 word 0 must be zero (rows 0-63 live)", 0L, rg1.bitmap[0]); - assertEquals("RG 1 word 1 must be all-ones (rows 64-127 deleted)", -1L, rg1.bitmap[1]); - } } diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java index 6edb341693..48986a7468 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestRetinaResourceManager.java @@ -20,10 +20,27 @@ package 
io.pixelsdb.pixels.retina; import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.utils.ConfigFactory; import io.pixelsdb.pixels.core.vector.VectorizedRowBatch; +import org.junit.Ignore; import org.junit.Test; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.nio.ByteBuffer; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; +import static org.mockito.Mockito.when; public class TestRetinaResourceManager { @@ -45,6 +62,150 @@ private boolean checkVisibility(long[] visibility, int rowId) return (targetLong & (1L << (rowId % 64))) != 0; } + private RetinaResourceManager newIsolatedManager() throws Exception + { + Constructor constructor = RetinaResourceManager.class.getDeclaredConstructor(); + constructor.setAccessible(true); + return constructor.newInstance(); + } + + private void setGcExecutor(RetinaResourceManager manager, + ScheduledExecutorService executor) throws Exception + { + Field field = RetinaResourceManager.class.getDeclaredField("gcExecutor"); + field.setAccessible(true); + field.set(manager, executor); + } + + @Test + public void testBackgroundGcIsNotStartedByConstructor() throws Exception + { + Constructor constructor = RetinaResourceManager.class.getDeclaredConstructor(); + constructor.setAccessible(true); + RetinaResourceManager manager = constructor.newInstance(); + + assertFalse("background GC must be started by lifecycle only", + manager.isBackgroundGcStarted()); + } + + @Test + public void testStartBackgroundGcIsExplicitAndIdempotent() 
throws Exception + { + String originalInterval = ConfigFactory.Instance().getProperty("retina.gc.interval"); + RetinaResourceManager manager = newIsolatedManager(); + ScheduledExecutorService executor = mock(ScheduledExecutorService.class); + setGcExecutor(manager, executor); + try + { + ConfigFactory.Instance().addProperty("retina.gc.interval", "300"); + + manager.startBackgroundGc(); + manager.startBackgroundGc(); + + assertTrue("explicit lifecycle start must mark background GC as started", + manager.isBackgroundGcStarted()); + verify(executor).scheduleAtFixedRate(any(Runnable.class), eq(300L), eq(300L), eq(TimeUnit.SECONDS)); + verifyNoMoreInteractions(executor); + } + finally + { + ConfigFactory.Instance().addProperty("retina.gc.interval", originalInterval); + } + } + + @Test + public void testStartBackgroundGcDisabledByNonPositiveInterval() throws Exception + { + String originalInterval = ConfigFactory.Instance().getProperty("retina.gc.interval"); + RetinaResourceManager manager = newIsolatedManager(); + try + { + ConfigFactory.Instance().addProperty("retina.gc.interval", "0"); + + manager.startBackgroundGc(); + + assertFalse("disabled interval must not mark background GC as started", + manager.isBackgroundGcStarted()); + } + finally + { + ConfigFactory.Instance().addProperty("retina.gc.interval", originalInterval); + } + } + + @Test + public void testStartBackgroundGcInvalidIntervalFailsWithoutStarting() throws Exception + { + String originalInterval = ConfigFactory.Instance().getProperty("retina.gc.interval"); + RetinaResourceManager manager = newIsolatedManager(); + try + { + ConfigFactory.Instance().addProperty("retina.gc.interval", "not-a-number"); + + try + { + manager.startBackgroundGc(); + fail("invalid GC interval must fail closed"); + } + catch (RetinaException e) + { + assertTrue(e.getMessage().contains("Invalid retina GC interval configuration")); + } + + assertFalse("failed lifecycle start must not mark GC as started", + 
manager.isBackgroundGcStarted()); + } + finally + { + ConfigFactory.Instance().addProperty("retina.gc.interval", originalInterval); + } + } + + @Test + public void testStartBackgroundGcSchedulerFailureRollsBackStartedFlag() throws Exception + { + String originalInterval = ConfigFactory.Instance().getProperty("retina.gc.interval"); + RetinaResourceManager manager = newIsolatedManager(); + ScheduledExecutorService executor = mock(ScheduledExecutorService.class); + setGcExecutor(manager, executor); + try + { + ConfigFactory.Instance().addProperty("retina.gc.interval", "300"); + when(executor.scheduleAtFixedRate(any(Runnable.class), eq(300L), eq(300L), eq(TimeUnit.SECONDS))) + .thenThrow(new RuntimeException("scheduler rejected")); + + try + { + manager.startBackgroundGc(); + fail("scheduler failure must fail closed"); + } + catch (RetinaException e) + { + assertTrue(e.getMessage().contains("Failed to start retina background GC")); + } + + assertFalse("scheduler failure must roll back started flag", + manager.isBackgroundGcStarted()); + } + finally + { + ConfigFactory.Instance().addProperty("retina.gc.interval", originalInterval); + } + } + + @Test + public void testRunGcBeforeLifecycleStartIsRejected() throws Exception + { + RetinaResourceManager manager = newIsolatedManager(); + Method runGc = RetinaResourceManager.class.getDeclaredMethod("runGC"); + runGc.setAccessible(true); + + runGc.invoke(manager); + + assertFalse("manual GC invocation before lifecycle start must be ignored", + manager.isBackgroundGcStarted()); + } + @Test public void TestVisibility() { @@ -80,6 +241,7 @@ private byte[][] createTpchNationRow(long nationKey, String name, long regionKey return row; } + @Ignore("Integration test requires real tpch.nation metadata and storage state.") @Test public void testWriteBuffer() { diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java 
b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java index 6281626267..9f55fd1b47 100644 --- a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGarbageCollector.java @@ -19,8 +19,11 @@ */ package io.pixelsdb.pixels.retina; +import io.pixelsdb.pixels.common.index.service.LocalIndexService; import io.pixelsdb.pixels.common.metadata.MetadataService; import io.pixelsdb.pixels.common.utils.CheckpointFileIO; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.common.utils.MetaDBUtil; import io.pixelsdb.pixels.common.utils.PixelsFileNameUtils; import io.pixelsdb.pixels.common.utils.RetinaUtils; import io.pixelsdb.pixels.common.metadata.domain.Column; @@ -48,10 +51,11 @@ import org.junit.Ignore; import org.junit.Test; +import java.io.IOException; import java.lang.reflect.Field; -import java.lang.reflect.Method; import java.nio.file.Files; import java.nio.file.Path; +import java.sql.PreparedStatement; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -70,6 +74,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; /** * Tests for {@link StorageGarbageCollector}, covering scan/grouping, data rewrite, @@ -93,6 +98,7 @@ * * Legacy test names (pre-convention) are preserved for CI stability. 
*/ +@Ignore("Integration suite requires a running metadata server and external metadata DB state.") public class TestStorageGarbageCollector { // ----------------------------------------------------------------------- @@ -116,6 +122,7 @@ public class TestStorageGarbageCollector private RetinaResourceManager retinaManager; private StorageGarbageCollector gc; + private StorageGcWal storageGcWal; // ----------------------------------------------------------------------- // Class-level setup / teardown @@ -170,13 +177,16 @@ public static void classTearDown() // ----------------------------------------------------------------------- @Before - public void setUp() + public void setUp() throws Exception { retinaManager = RetinaResourceManager.Instance(); resetManagerState(); cleanupOrderedDir(); - gc = new StorageGarbageCollector(retinaManager, metadataService, 0.5, 134_217_728L, Integer.MAX_VALUE, 10, - 1048576, EncodingLevel.EL2, 86_400_000L); + cleanupJournalDir(); + storageGcWal = new StorageGcWal(); + gc = new StorageGarbageCollector(retinaManager, metadataService, LocalIndexService.Instance(), + 0.5, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, EncodingLevel.EL2, 86_400_000L, + storageGcWal); } @After @@ -186,6 +196,20 @@ public void tearDown() cleanupOrderedDir(); } + private static void cleanupJournalDir() throws IOException + { + String journalDir = ConfigFactory.Instance().getProperty("retina.storage.gc.journal.dir"); + Storage storage = StorageFactory.Instance().getStorage(journalDir); + if (!storage.exists(journalDir)) + { + return; + } + for (String path : storage.listPaths(journalDir)) + { + storage.delete(path, false); + } + } + /** * Deletes all {@code .pxl} files from the shared test ordered-path directory after * each test. 
Multiple tests write output files (from {@code rewriteFileGroup}) into @@ -702,96 +726,166 @@ public void testScanAndGroupFiles_skipsFilesWithNoVisibility() // ======================================================================= /** - * After {@code runStorageGC}, the {@code gcSnapshotBitmaps} map must have had - * non-candidate entries removed. Candidate bitmaps must be retained for the rewrite phase. + * When no file crosses the strict deletion-ratio threshold, + * {@code runStorageGC} must return before metadata scan and keep the bitmap + * snapshot intact for the already-written GC checkpoint. */ @Test - public void testRunStorageGC_trimsBitmapMapToCandidate() + public void testRunStorageGC_noCandidateDoesNotScanOrTrim() { - long candidateFileId = 66001L; - long otherFileId = 66002L; + long belowThresholdFileId = 66101L; + long exactlyThresholdFileId = 66102L; - Map bitmaps = new HashMap<>(); - bitmaps.put(candidateFileId + "_0", makeBitmap(100, 60)); - bitmaps.put(otherFileId + "_0", makeBitmap(100, 20)); - - // File-level stats: candidateFileId has 60% deletion, otherFileId has 20% Map fileStats = new HashMap<>(); - fileStats.put(candidateFileId, makeRgStats(100, 60)); - fileStats.put(otherFileId, makeRgStats(100, 20)); + fileStats.put(belowThresholdFileId, makeRgStats(100, 40)); + fileStats.put(exactlyThresholdFileId, makeRgStats(100, 50)); - List fakeFiles = Arrays.asList( - new FakeFileEntry(candidateFileId, 1, 1L, 0), - new FakeFileEntry(otherFileId, 1, 1L, 0)); + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(belowThresholdFileId, 0), makeBitmap(100, 40)); + bitmaps.put(RetinaUtils.buildRgKey(exactlyThresholdFileId, 0), makeBitmap(100, 50)); - DirectScanStorageGC gc = new DirectScanStorageGC( - retinaManager, 0.5, 10, fakeFiles); + TrackingRunStorageGC trackingGc = new TrackingRunStorageGC(Collections.emptyList()); - gc.runStorageGC(300L, fileStats, bitmaps); + trackingGc.runStorageGC(301L, fileStats, bitmaps); - 
assertTrue("candidate RG key must be retained", - bitmaps.containsKey(candidateFileId + "_0")); - assertFalse("non-candidate RG key must be removed", - bitmaps.containsKey(otherFileId + "_0")); + assertFalse("no candidate means metadata scan must not run", trackingGc.scanCalled); + assertFalse("no candidate means process phase must not run", trackingGc.processCalled); + assertTrue("below-threshold bitmap must remain for checkpoint recovery", + bitmaps.containsKey(RetinaUtils.buildRgKey(belowThresholdFileId, 0))); + assertTrue("exact-threshold bitmap must remain because threshold is strict >", + bitmaps.containsKey(RetinaUtils.buildRgKey(exactlyThresholdFileId, 0))); + assertEquals("bitmap snapshot must remain unchanged", 2, bitmaps.size()); } - // ======================================================================= - // Section 4: runStorageGC end-to-end scan → process - // ======================================================================= - /** - * A file whose invalidRatio is exactly equal to the threshold (0.5) must NOT - * be selected as a candidate. The design uses strict {@code >}, not {@code >=}. + * Candidate selection must be driven by file-level stats only. Files at the + * threshold, with zero rows, or below threshold must not be passed to scan; + * their bitmap entries are released before rewrite processing starts. 
*/ @Test - public void testRunStorageGC_thresholdExactlyEqual() + public void testRunStorageGC_passesOnlyStrictFileLevelCandidatesToScan() { - long fileId = 57001L; + long candidateA = 66201L; + long candidateB = 66202L; + long exactlyThreshold = 66203L; + long zeroRows = 66204L; + long belowThreshold = 66205L; Map fileStats = new HashMap<>(); - fileStats.put(fileId, makeRgStats(100, 50)); // exactly 50% = threshold + fileStats.put(candidateA, makeRgStats(100, 51)); + fileStats.put(candidateB, makeRgStats(200, 120)); + fileStats.put(exactlyThreshold, makeRgStats(100, 50)); + fileStats.put(zeroRows, new long[]{0, 10}); + fileStats.put(belowThreshold, makeRgStats(100, 49)); Map bitmaps = new HashMap<>(); - bitmaps.put(fileId + "_0", makeBitmap(100, 50)); + for (long fileId : Arrays.asList(candidateA, candidateB, exactlyThreshold, zeroRows, belowThreshold)) + { + bitmaps.put(RetinaUtils.buildRgKey(fileId, 0), makeBitmap(100, 1)); + } + bitmaps.put(RetinaUtils.buildRgKey(candidateB, 1), makeBitmap(100, 1)); + + TrackingRunStorageGC trackingGc = new TrackingRunStorageGC(Collections.emptyList()); + + trackingGc.runStorageGC(302L, fileStats, bitmaps); + + assertTrue("candidate scan must run when at least one file qualifies", trackingGc.scanCalled); + assertEquals(new HashSet<>(Arrays.asList(candidateA, candidateB)), trackingGc.capturedCandidateFileIds); + assertEquals("only candidate RG bitmaps should remain", 3, bitmaps.size()); + assertTrue(bitmaps.containsKey(RetinaUtils.buildRgKey(candidateA, 0))); + assertTrue(bitmaps.containsKey(RetinaUtils.buildRgKey(candidateB, 0))); + assertTrue(bitmaps.containsKey(RetinaUtils.buildRgKey(candidateB, 1))); + assertFalse(bitmaps.containsKey(RetinaUtils.buildRgKey(exactlyThreshold, 0))); + assertFalse(bitmaps.containsKey(RetinaUtils.buildRgKey(zeroRows, 0))); + assertFalse(bitmaps.containsKey(RetinaUtils.buildRgKey(belowThreshold, 0))); + assertFalse("empty scan result must skip process phase", trackingGc.processCalled); + } - 
DirectScanStorageGC gc = new DirectScanStorageGC( - retinaManager, 0.5, 10, - Collections.singletonList(new FakeFileEntry(fileId, 1, 1L, 0))); + /** + * The process phase must see the safe GC timestamp, the groups returned from + * scan, and a bitmap map already trimmed to candidate files. This protects + * the Storage GC rewrite path from accidentally consuming non-candidate RGs. + */ + @Test + public void testRunStorageGC_processSeesTrimmedCandidateBitmapsAndSafeTs() + { + long candidateFileId = 66301L; + long otherFileId = 66302L; + long safeGcTs = 303L; - gc.runStorageGC(400L, fileStats, bitmaps); + StorageGarbageCollector.FileGroup group = new StorageGarbageCollector.FileGroup( + 7L, 4, Collections.singletonList( + new StorageGarbageCollector.FileCandidate( + makeFile(candidateFileId, 2), "fake_candidate", candidateFileId, 2, 7L, 4, 0.75, 0L))); + TrackingRunStorageGC trackingGc = new TrackingRunStorageGC(Collections.singletonList(group)); - assertTrue("file at exactly threshold must NOT be trimmed (no candidates)", - bitmaps.containsKey(fileId + "_0")); - assertEquals(1, bitmaps.size()); + Map fileStats = new HashMap<>(); + fileStats.put(candidateFileId, makeRgStats(100, 75)); + fileStats.put(otherFileId, makeRgStats(100, 10)); + + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(candidateFileId, 0), makeBitmap(100, 75)); + bitmaps.put(RetinaUtils.buildRgKey(candidateFileId, 1), makeBitmap(100, 60)); + bitmaps.put(RetinaUtils.buildRgKey(otherFileId, 0), makeBitmap(100, 10)); + + trackingGc.runStorageGC(safeGcTs, fileStats, bitmaps); + + assertTrue("process phase must run for non-empty groups", trackingGc.processCalled); + assertEquals("safeGcTs must be forwarded to process phase", safeGcTs, trackingGc.capturedSafeGcTs); + assertEquals("scan groups must be forwarded unchanged", 1, trackingGc.capturedFileGroups.size()); + assertEquals(candidateFileId, trackingGc.capturedFileGroups.get(0).files.get(0).fileId); + assertEquals(new 
HashSet<>(Arrays.asList( + RetinaUtils.buildRgKey(candidateFileId, 0), + RetinaUtils.buildRgKey(candidateFileId, 1))), trackingGc.bitmapKeysSeenByProcess); + assertFalse("non-candidate bitmap must be trimmed before process", + bitmaps.containsKey(RetinaUtils.buildRgKey(otherFileId, 0))); } /** - * A file whose {@code fileStats} entry has {@code totalRows=0} must not - * produce a candidate even if invalidCount is also 0 (division by zero guard). + * If the downstream process phase fails, {@code runStorageGC} must already + * have released non-candidate bitmaps. This mirrors the real GC ordering: + * checkpoint is complete, then candidate-only rewrite state is retained. */ @Test - public void testRunStorageGC_skipsTotalRowsZero() + public void testRunStorageGC_processFailureKeepsOnlyCandidateBitmaps() { - long fileId = 58001L; + long candidateFileId = 66401L; + long otherFileId = 66402L; + + StorageGarbageCollector.FileGroup group = new StorageGarbageCollector.FileGroup( + 8L, 0, Collections.singletonList( + new StorageGarbageCollector.FileCandidate( + makeFile(candidateFileId, 1), "fake_candidate", candidateFileId, 1, 8L, 0, 0.80, 0L))); + TrackingRunStorageGC trackingGc = new TrackingRunStorageGC(Collections.singletonList(group)); + trackingGc.processFailure = new RuntimeException("simulated process failure"); Map fileStats = new HashMap<>(); - fileStats.put(fileId, new long[]{0, 0}); // totalRows=0 + fileStats.put(candidateFileId, makeRgStats(100, 80)); + fileStats.put(otherFileId, makeRgStats(100, 20)); Map bitmaps = new HashMap<>(); - bitmaps.put(fileId + "_0", new long[]{0L}); + bitmaps.put(RetinaUtils.buildRgKey(candidateFileId, 0), makeBitmap(100, 80)); + bitmaps.put(RetinaUtils.buildRgKey(otherFileId, 0), makeBitmap(100, 20)); - DirectScanStorageGC gc = new DirectScanStorageGC( - retinaManager, 0.5, 10, - Collections.singletonList(new FakeFileEntry(fileId, 1, 1L, 0))); - - gc.runStorageGC(500L, fileStats, bitmaps); + try + { + trackingGc.runStorageGC(304L, 
fileStats, bitmaps); + fail("process failure should propagate to the caller"); + } + catch (RuntimeException e) + { + assertEquals("simulated process failure", e.getMessage()); + } - assertTrue("totalRows=0 file must remain untouched (no candidates)", - bitmaps.containsKey(fileId + "_0")); + assertTrue("process phase should have been entered", trackingGc.processCalled); + assertTrue("candidate bitmap remains available for failure handling", + bitmaps.containsKey(RetinaUtils.buildRgKey(candidateFileId, 0))); + assertFalse("non-candidate bitmap must remain released after failure", + bitmaps.containsKey(RetinaUtils.buildRgKey(otherFileId, 0))); } // ======================================================================= - // Section 4b: processFileGroups error handling + // Section 4: processFileGroups error handling // ======================================================================= /** @@ -1553,117 +1647,6 @@ public void testRgIdForGlobalRowOffset_manyRgs() } } - // ======================================================================= - // Section 7c: createCheckpointDirect vs createCheckpoint consistency - // ======================================================================= - - /** - * Both checkpoint paths (queued via rgVisibilityMap traversal and direct via - * pre-built entries) must produce byte-identical files when given the same - * visibility state. - */ - @Test - public void testCheckpointDirect_matchesStandardCheckpoint() throws Exception - { - long ts = 500L; - int numFiles = 3; - int rowsPerRg = 64; - - for (int fid = 1; fid <= numFiles; fid++) - { - retinaManager.addVisibility(fid, 0, rowsPerRg, 0L, null, false); - for (int d = 0; d < fid; d++) - { - retinaManager.deleteRecord(fid, 0, d, ts - 100); - } - } - - // Build pre-built entries identical to what runGC() would construct. 
- List entries = new ArrayList<>(); - Field rgMapField = RetinaResourceManager.class.getDeclaredField("rgVisibilityMap"); - rgMapField.setAccessible(true); - @SuppressWarnings("unchecked") - Map rgMap = - (Map) rgMapField.get(retinaManager); - for (Map.Entry e : rgMap.entrySet()) - { - long fileId = RetinaUtils.parseFileIdFromRgKey(e.getKey()); - int rgId = RetinaUtils.parseRgIdFromRgKey(e.getKey()); - long[] bitmap = e.getValue().getVisibilityBitmap(ts); - entries.add(new CheckpointFileIO.CheckpointEntry( - fileId, rgId, (int) e.getValue().getRecordNum(), bitmap)); - } - - // Obtain the private CheckpointType.GC enum value via reflection. - @SuppressWarnings("unchecked") - Class> checkpointTypeClass = (Class>) - Class.forName("io.pixelsdb.pixels.retina.RetinaResourceManager$CheckpointType"); - Object gcType = null; - for (Object constant : checkpointTypeClass.getEnumConstants()) - { - if (constant.toString().equals("GC")) - { - gcType = constant; - break; - } - } - assertNotNull("CheckpointType.GC must exist", gcType); - - // Call createCheckpoint (standard path) - Method createCheckpointMethod = RetinaResourceManager.class.getDeclaredMethod( - "createCheckpoint", long.class, checkpointTypeClass); - createCheckpointMethod.setAccessible(true); - @SuppressWarnings("unchecked") - CompletableFuture f1 = (CompletableFuture) createCheckpointMethod.invoke( - retinaManager, ts, gcType); - f1.join(); - - // Call createCheckpointDirect (optimized path) with a different timestamp to get a different file name - long ts2 = ts + 1; - Method createCheckpointDirectMethod = RetinaResourceManager.class.getDeclaredMethod( - "createCheckpointDirect", long.class, checkpointTypeClass, List.class); - createCheckpointDirectMethod.setAccessible(true); - @SuppressWarnings("unchecked") - CompletableFuture f2 = (CompletableFuture) createCheckpointDirectMethod.invoke( - retinaManager, ts2, gcType, entries); - f2.join(); - - // Read both checkpoint files and compare entries. 
- // Files may have entries in different order (due to producer-consumer concurrency), - // so we normalize by sorting entries by (fileId, rgId) before comparing. - Field checkpointDirField = RetinaResourceManager.class.getDeclaredField("checkpointDir"); - checkpointDirField.setAccessible(true); - String checkpointDir = (String) checkpointDirField.get(retinaManager); - - Field hostField = RetinaResourceManager.class.getDeclaredField("retinaHostName"); - hostField.setAccessible(true); - String hostName = (String) hostField.get(retinaManager); - - String path1 = RetinaUtils.buildCheckpointPath( - checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, hostName, ts); - String path2 = RetinaUtils.buildCheckpointPath( - checkpointDir, RetinaUtils.CHECKPOINT_PREFIX_GC, hostName, ts2); - - Map standard = new HashMap<>(); - CheckpointFileIO.readCheckpointParallel(path1, entry -> - standard.put(entry.fileId + "_" + entry.rgId, - Arrays.copyOf(entry.bitmap, entry.bitmap.length))); - - Map direct = new HashMap<>(); - CheckpointFileIO.readCheckpointParallel(path2, entry -> - direct.put(entry.fileId + "_" + entry.rgId, - Arrays.copyOf(entry.bitmap, entry.bitmap.length))); - - assertEquals("entry count must match", standard.size(), direct.size()); - for (Map.Entry e : standard.entrySet()) - { - long[] directBitmap = direct.get(e.getKey()); - assertNotNull("direct checkpoint must contain key=" + e.getKey(), directBitmap); - assertTrue("bitmaps must be identical for key=" + e.getKey(), - Arrays.equals(e.getValue(), directBitmap)); - } - } - // ======================================================================= // Section 7d: concurrent dual-write pressure test // ======================================================================= @@ -1702,8 +1685,8 @@ public void testDualWrite_concurrentPressure() throws Exception // batch (any encoded pixel exceeds 1 byte), preserving the 1:1 old-RG-to-new-RG // mapping so each thread targets a distinct new RGVisibility object. 
StorageGarbageCollector localGc = new StorageGarbageCollector( - retinaManager, metadataService, 0.5, 134_217_728L, - Integer.MAX_VALUE, 10, 1, EncodingLevel.EL2, 86_400_000L); + retinaManager, metadataService, LocalIndexService.Instance(), 0.5, 134_217_728L, + Integer.MAX_VALUE, 10, 1, EncodingLevel.EL2, 86_400_000L, storageGcWal); StorageGarbageCollector.RewriteResult result = localGc.rewriteFileGroup(makeGroup(fileId, srcPath, schema), 100L, bitmaps); @@ -1808,10 +1791,10 @@ public void testDualWrite_concurrentPressure() throws Exception // ======================================================================= /** - * Atomicity with multiple old files: one TEMPORARY new file and three REGULAR + * Atomicity with multiple old files: one TEMPORARY_GC new file and three REGULAR * old files are swapped in a single call. Verifies that after the call the new - * file is promoted to REGULAR and all old files are removed from the - * catalog—i.e., the UPDATE and DELETE execute as one indivisible transaction. + * file is promoted to REGULAR and all old files are marked RETIRED with + * the same cleanup deadline—i.e., both UPDATE steps execute as one transaction. 
*/ @Test public void testAtomicSwap_multipleOldFilesAtomicity() throws Exception @@ -1827,82 +1810,392 @@ public void testAtomicSwap_multipleOldFilesAtomicity() throws Exception new String[]{"atom_old1.pxl", "atom_old2.pxl", "atom_old3.pxl"}, new File.Type[]{File.Type.REGULAR, File.Type.REGULAR, File.Type.REGULAR}, new int[]{1, 1, 1}, new long[]{0, 0, 0}, new long[]{1, 1, 1}); - long newFileId = registerTestFile("atom_new.pxl", File.Type.TEMPORARY, 1, 0, 1); + long newFileId = registerTestFile("atom_new.pxl", File.Type.TEMPORARY_GC, 1, 0, 1); + long cleanupAt = 1_700_000_010_000L; File preSwapNew = metadataService.getFileById(newFileId); assertNotNull("New file must exist before swap", preSwapNew); - assertEquals("New file should be TEMPORARY before swap", - File.Type.TEMPORARY, preSwapNew.getType()); + assertEquals("New file should be TEMPORARY_GC before swap", + File.Type.TEMPORARY_GC, preSwapNew.getType()); - metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1], oldIds[2])); + metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1], oldIds[2]), cleanupAt); assertFileRegular(newFileId, "New file should be REGULAR after swap"); for (long oldId : oldIds) { - assertFileGone(oldId, "Old file " + oldId + " should be gone after swap"); + assertFileRetired(oldId, cleanupAt, + "Old file " + oldId + " should be retired after swap"); } } /** * Idempotency: calling {@code atomicSwapFiles} a second time after the swap has - * already committed must not throw. The UPDATE is a no-op (already REGULAR) and - * the DELETE is a no-op (old files already removed). + * already committed must not throw. The new file remains REGULAR and the old file + * remains RETIRED with the retry's cleanup deadline. 
*/ @Test public void testAtomicSwap_idempotent() throws Exception { writeTestFile("idem_old.pxl", LONG_ID_SCHEMA, new long[]{0, 1, 2}, true, new long[]{100, 100, 100}); long oldFileId = registerTestFile("idem_old.pxl", File.Type.REGULAR, 1, 0, 2); - long newFileId = registerTestFile("idem_new.pxl", File.Type.TEMPORARY, 1, 0, 2); + long newFileId = registerTestFile("idem_new.pxl", File.Type.TEMPORARY_GC, 1, 0, 2); + long firstCleanupAt = 1_700_000_020_000L; + long retryCleanupAt = 1_700_000_030_000L; - metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId)); + metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId), firstCleanupAt); assertFileRegular(newFileId, "File should be REGULAR after first swap"); + assertFileRetired(oldFileId, firstCleanupAt, "Old file should be RETIRED after first swap"); - metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId)); + metadataService.atomicSwapFiles(newFileId, Collections.singletonList(oldFileId), retryCleanupAt); assertFileRegular(newFileId, "File should remain REGULAR after idempotent retry"); - assertFileGone(oldFileId, "Old file should remain absent after idempotent retry"); + assertFileRetired(oldFileId, retryCleanupAt, + "Old file should remain RETIRED after idempotent retry"); } + // ----------------------------------------------------------------------- + // Coverage for getRegularFiles(pathId) REGULAR-only enumeration. + // ----------------------------------------------------------------------- + /** - * TEMPORARY visibility semantics: before the swap, {@code getFiles(pathId)} must - * not return the TEMPORARY new file (the DAO filters {@code FILE_TYPE <> 0}). - * After the swap the promoted file is visible and the old file disappears. + * A path containing REGULAR and non-REGULAR FILE_TYPE values returns only REGULAR entries. 
*/ @Test - public void testAtomicSwap_temporaryInvisibleViaGetFiles() throws Exception + public void testGetFiles_mixedAllFileTypes_onlyRegular() throws Exception { - writeTestFile("vis_old.pxl", LONG_ID_SCHEMA, new long[]{0, 1}, true, new long[]{100, 100}); - long[] fileIds = registerTestFiles( - new String[]{"vis_old.pxl", "vis_new_temp.pxl"}, - new File.Type[]{File.Type.REGULAR, File.Type.TEMPORARY}, - new int[]{1, 1}, new long[]{0, 0}, new long[]{1, 1}); - long oldFileId = fileIds[0]; - long tempFileId = fileIds[1]; + long regularId = -1L; + long tempId = -1L; + long nonRegularPositiveId = -1L; + long negativeId = -1L; + long extremeId = -1L; + try + { + String suffix = Long.toString(System.nanoTime()); + regularId = registerTestFile("mix_regular_" + suffix + ".pxl", + File.Type.REGULAR, 1, 0L, 1L); + tempId = registerTestFile("mix_temp_" + suffix + ".pxl", + File.Type.TEMPORARY_INGEST, 1, 0L, 1L); + nonRegularPositiveId = insertRawFileWithType("mix_non_regular_" + suffix + ".pxl", + File.Type.TEMPORARY_GC.getNumber(), 1, 0L, 1L); + negativeId = insertRawFileWithType("mix_negative_" + suffix + ".pxl", + -2, 1, 0L, 1L); + extremeId = insertRawFileWithType("mix_extreme_max_" + suffix + ".pxl", + Integer.MAX_VALUE, 1, 0L, 1L); + + List files = metadataService.getRegularFiles(testPathId); + Set visible = new HashSet<>(); + for (File f : files) + { + assertEquals("getRegularFiles must only emit REGULAR", + File.Type.REGULAR, f.getType()); + visible.add(f.getId()); + } + assertTrue("REGULAR member of the mix must be visible", + visible.contains(regularId)); + assertFalse("TEMPORARY_INGEST (FILE_TYPE=0) must be hidden", + visible.contains(tempId)); + assertFalse("non-REGULAR positive FILE_TYPE must be hidden", + visible.contains(nonRegularPositiveId)); + assertFalse("negative FILE_TYPE must be hidden", + visible.contains(negativeId)); + assertFalse("Integer.MAX_VALUE FILE_TYPE must be hidden", + visible.contains(extremeId)); + } + finally + { + List cleanup = new 
ArrayList<>(); + if (regularId > 0) cleanup.add(regularId); + if (tempId > 0) cleanup.add(tempId); + if (nonRegularPositiveId > 0) cleanup.add(nonRegularPositiveId); + if (negativeId > 0) cleanup.add(negativeId); + if (extremeId > 0) cleanup.add(extremeId); + if (!cleanup.isEmpty()) metadataService.deleteFiles(cleanup); + } + } - List beforeSwap = metadataService.getFiles(testPathId); - Set beforeIds = new HashSet<>(); - for (File f : beforeSwap) + // ------------------------------------------------------------------------- + // c01.1 regression — RETIRED is a new File.Type and must be invisible to + // query-time enumeration just like the two TEMPORARY_* states. These tests + // pin down the contract that the DAO filters FILE_TYPE = REGULAR and nothing + // else, so future refactors cannot accidentally widen the visible set. + // ------------------------------------------------------------------------- + + + + /** + * Exhaustive coverage: for every defined non-REGULAR {@link File.Type}, getRegularFiles must + * exclude that file. Using {@link File.Type#values()} guards against future enum + * additions silently leaking into query results. + */ + @Test + public void testGetFiles_allNonRegularTypes_allHidden() throws Exception + { + List registeredIds = new ArrayList<>(); + long regularId = -1L; + try + { + String suffix = Long.toString(System.nanoTime()); + regularId = registerTestFile("all_types_regular_" + suffix + ".pxl", + File.Type.REGULAR, 1, 0L, 1L); + registeredIds.add(regularId); // track now so finally cleans it up even if a later insert throws + + // Register one file per non-REGULAR type, including RETIRED.
+ Set nonRegularIds = new HashSet<>(); + for (File.Type t : File.Type.values()) + { + if (t == File.Type.REGULAR) continue; + long id = insertRawFileWithType( + "all_types_" + t + "_" + suffix + ".pxl", + t.getNumber(), 1, 0L, 1L); + registeredIds.add(id); + nonRegularIds.add(id); + } + + List visible = metadataService.getRegularFiles(testPathId); + Set visibleIds = new HashSet<>(); + for (File f : visible) + { + assertEquals("every visible file must carry FILE_TYPE = REGULAR", + File.Type.REGULAR, f.getType()); + visibleIds.add(f.getId()); + } + assertTrue("the seed REGULAR file must be visible", + visibleIds.contains(regularId)); + for (long id : nonRegularIds) + { + assertFalse("non-REGULAR file (id=" + id + ") leaked into getRegularFiles", + visibleIds.contains(id)); + } + } + finally { - beforeIds.add(f.getId()); + if (!registeredIds.isEmpty()) metadataService.deleteFiles(registeredIds); } - assertTrue("REGULAR old file should be visible via getFiles before swap", - beforeIds.contains(oldFileId)); - assertFalse("TEMPORARY new file must NOT be visible via getFiles before swap", - beforeIds.contains(tempFileId)); + } + + /** + * After the swap of a TEMPORARY_GC -> REGULAR, a RETIRED tombstone for the *old* file + * (i.e. the same file ids that were just retired) cannot pollute the new visible set + * even if the catalog still carries unrelated RETIRED entries on the same path. + */ + @Test + public void testGetFiles_retiredCoexistsWithFreshlyPromoted() throws Exception + { + long oldRegularId = -1L; + long tempGcId = -1L; + long retiredCoexistingId = -1L; + try + { + String suffix = Long.toString(System.nanoTime()); + + // Pre-existing RETIRED file on the same path. This must remain hidden + // throughout the entire scenario. + retiredCoexistingId = insertRawFileWithType( + "coexist_retired_" + suffix + ".pxl", + File.Type.RETIRED.getNumber(), 1, 0L, 1L); + + // The classic swap pair.
+ oldRegularId = registerTestFile("coexist_old_regular_" + suffix + ".pxl", + File.Type.REGULAR, 1, 0L, 1L); + tempGcId = registerTestFile("coexist_new_temp_gc_" + suffix + ".pxl", + File.Type.TEMPORARY_GC, 1, 0L, 1L); + + // Before swap: only oldRegular visible; RETIRED + TEMPORARY_GC hidden. + Set beforeIds = new HashSet<>(); + for (File f : metadataService.getRegularFiles(testPathId)) beforeIds.add(f.getId()); + assertTrue("old REGULAR must be visible before swap", + beforeIds.contains(oldRegularId)); + assertFalse("RETIRED tombstone must be hidden before swap", + beforeIds.contains(retiredCoexistingId)); + assertFalse("TEMPORARY_GC must be hidden before swap", + beforeIds.contains(tempGcId)); + + long cleanupAt = 1_700_000_050_000L; + metadataService.atomicSwapFiles(tempGcId, Collections.singletonList(oldRegularId), cleanupAt); + + // After swap: tempGcId is now REGULAR (visible); old REGULAR is now RETIRED and + // hidden; the coexisting RETIRED file must STILL be hidden (the swap did not promote it). 
+ Set afterIds = new HashSet<>(); + for (File f : metadataService.getRegularFiles(testPathId)) + { + assertEquals("getRegularFiles must only emit REGULAR after swap", + File.Type.REGULAR, f.getType()); + afterIds.add(f.getId()); + } + assertTrue("freshly-promoted file must be visible after swap", + afterIds.contains(tempGcId)); + assertFalse("the retired old REGULAR must be hidden after swap", + afterIds.contains(oldRegularId)); + assertFileRetired(oldRegularId, cleanupAt, + "the old REGULAR must become RETIRED after swap"); + assertFalse("the unrelated RETIRED tombstone must remain hidden after swap", + afterIds.contains(retiredCoexistingId)); + } + finally + { + List cleanup = new ArrayList<>(); + if (oldRegularId > 0) cleanup.add(oldRegularId); + if (tempGcId > 0) cleanup.add(tempGcId); + if (retiredCoexistingId > 0) cleanup.add(retiredCoexistingId); + if (!cleanup.isEmpty()) metadataService.deleteFiles(cleanup); + } + } + + /** + * A minimum-size REGULAR file is returned with its catalog fields intact. 
+ */ + @Test + public void testGetFiles_singleRegularMinimumData() throws Exception + { + long fileId = -1L; + try + { + fileId = registerTestFile("min_single_regular_" + System.nanoTime() + ".pxl", + File.Type.REGULAR, 1, 0L, 0L); + List files = metadataService.getRegularFiles(testPathId); + File found = null; + for (File f : files) + { + if (f.getId() == fileId) + { + found = f; + } + assertEquals("every returned entry must be REGULAR", + File.Type.REGULAR, f.getType()); + } + assertNotNull("the single REGULAR minimum-data file must be visible", found); + assertEquals("type must be REGULAR", File.Type.REGULAR, found.getType()); + assertEquals("numRowGroup of minimum file must be 1", 1, found.getNumRowGroup()); + assertEquals("minRowId of minimum file must be 0", 0L, found.getMinRowId()); + assertEquals("maxRowId of minimum file must be 0", 0L, found.getMaxRowId()); + } + finally + { + if (fileId > 0) + { + metadataService.deleteFiles(Collections.singletonList(fileId)); + } + } + } + + /** + * A deleted REGULAR file is no longer returned by {@code getRegularFiles}.
+ */ + @Test + public void testGetFiles_deletedRegular_notVisible() throws Exception + { + long regularId = registerTestFile("delete_visibility_" + System.nanoTime() + ".pxl", + File.Type.REGULAR, 1, 0L, 1L); - metadataService.atomicSwapFiles(tempFileId, Collections.singletonList(oldFileId)); + List beforeDelete = metadataService.getRegularFiles(testPathId); + Set beforeIds = new HashSet<>(); + for (File f : beforeDelete) beforeIds.add(f.getId()); + assertTrue("REGULAR file must be visible before delete", + beforeIds.contains(regularId)); - List afterSwap = metadataService.getFiles(testPathId); - Set afterIds = new HashSet<>(); - for (File f : afterSwap) + metadataService.deleteFiles(Collections.singletonList(regularId)); + + List afterDelete = metadataService.getRegularFiles(testPathId); + for (File f : afterDelete) { - afterIds.add(f.getId()); + assertFalse("deleted REGULAR file must no longer be visible", + f.getId() == regularId); + } + } + + /** + * Concurrent readers observe a consistent REGULAR-only result.
+ */ + @Test + public void testGetFiles_concurrentReaders_consistentRegularOnly() throws Exception + { + long regularId = -1L; + long tempId = -1L; + long nonRegularPositiveId = -1L; + ExecutorService pool = null; + try + { + String suffix = Long.toString(System.nanoTime()); + regularId = registerTestFile("conc_regular_" + suffix + ".pxl", + File.Type.REGULAR, 1, 0L, 1L); + tempId = registerTestFile("conc_temp_" + suffix + ".pxl", + File.Type.TEMPORARY_INGEST, 1, 0L, 1L); + nonRegularPositiveId = insertRawFileWithType("conc_non_regular_" + suffix + ".pxl", + File.Type.TEMPORARY_GC.getNumber(), 1, 0L, 1L); + + final int threads = 8; + final int iterations = 16; + pool = Executors.newFixedThreadPool(threads); + CyclicBarrier startGate = new CyclicBarrier(threads); + AtomicInteger leakedTemporary = new AtomicInteger(); + AtomicInteger leakedNonRegular = new AtomicInteger(); + AtomicInteger missingRegular = new AtomicInteger(); + + List> futures = new ArrayList<>(); + final long pinnedRegular = regularId; + final long pinnedTemp = tempId; + final long pinnedNonRegular = nonRegularPositiveId; + for (int t = 0; t < threads; t++) + { + futures.add(CompletableFuture.runAsync(() -> + { + try + { + startGate.await(); + for (int i = 0; i < iterations; i++) + { + List snapshot = metadataService.getRegularFiles(testPathId); + boolean sawRegular = false; + for (File f : snapshot) + { + if (f.getType() != File.Type.REGULAR) + { + leakedNonRegular.incrementAndGet(); + } + if (f.getId() == pinnedRegular) sawRegular = true; + if (f.getId() == pinnedTemp) leakedTemporary.incrementAndGet(); + if (f.getId() == pinnedNonRegular) leakedNonRegular.incrementAndGet(); + } + if (!sawRegular) missingRegular.incrementAndGet(); + } + } + catch (Exception e) + { + throw new RuntimeException(e); + } + }, pool)); + } + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) + .get(30, java.util.concurrent.TimeUnit.SECONDS); + + assertEquals("no concurrent reader may observe a 
TEMPORARY_INGEST file", + 0, leakedTemporary.get()); + assertEquals("no concurrent reader may observe a non-REGULAR file", + 0, leakedNonRegular.get()); + assertEquals("every concurrent reader must observe the REGULAR file", + 0, missingRegular.get()); + + // A follow-up call should remain REGULAR-only after the concurrent burst. + List followUp = metadataService.getRegularFiles(testPathId); + assertNotNull("follow-up getFiles must not return null", followUp); + for (File f : followUp) + { + assertEquals("follow-up entries must all be REGULAR", + File.Type.REGULAR, f.getType()); + } + } + finally + { + if (pool != null) + { + pool.shutdownNow(); + } + List cleanup = new ArrayList<>(); + if (regularId > 0) cleanup.add(regularId); + if (tempId > 0) cleanup.add(tempId); + if (nonRegularPositiveId > 0) cleanup.add(nonRegularPositiveId); + if (!cleanup.isEmpty()) metadataService.deleteFiles(cleanup); } - assertTrue("Promoted file should be visible via getFiles after swap", - afterIds.contains(tempFileId)); - assertFalse("Old file should NOT be visible via getFiles after swap", - afterIds.contains(oldFileId)); } /** @@ -1910,7 +2203,7 @@ public void testAtomicSwap_temporaryInvisibleViaGetFiles() throws Exception * thread, so {@code atomicSwapFiles} is never called concurrently in production. * This test reflects that design: N independent (newFile, oldFile) pairs are * swapped one after another, and every new file ends up REGULAR while every - * old file is removed. + * old file is marked RETIRED with its cleanup deadline. 
*/ @Test public void testAtomicSwap_multipleSerialSwaps() throws Exception @@ -1922,6 +2215,7 @@ public void testAtomicSwap_multipleSerialSwaps() throws Exception long[] newFileIds = new long[nPairs]; long[] oldFileIds = new long[nPairs]; + long[] cleanupAts = new long[nPairs]; for (int i = 0; i < nPairs; i++) { @@ -1931,30 +2225,32 @@ public void testAtomicSwap_multipleSerialSwaps() throws Exception long[] pair = registerTestFiles( new String[]{oldName, newName}, - new File.Type[]{File.Type.REGULAR, File.Type.TEMPORARY}, + new File.Type[]{File.Type.REGULAR, File.Type.TEMPORARY_GC}, new int[]{1, 1}, new long[]{0, 0}, new long[]{0, 0}); oldFileIds[i] = pair[0]; newFileIds[i] = pair[1]; + cleanupAts[i] = 1_700_000_060_000L + i; } for (int i = 0; i < nPairs; i++) { metadataService.atomicSwapFiles(newFileIds[i], - Collections.singletonList(oldFileIds[i])); + Collections.singletonList(oldFileIds[i]), cleanupAts[i]); } for (int i = 0; i < nPairs; i++) { assertFileRegular(newFileIds[i], "Promoted file " + i + " must be REGULAR"); - assertFileGone(oldFileIds[i], "Old file " + i + " should be gone"); + assertFileRetired(oldFileIds[i], cleanupAts[i], + "Old file " + i + " should be RETIRED"); } } /** * Partial old-files-already-gone: one old file is deleted before the swap, but - * {@code atomicSwapFiles} is called with both IDs. The DELETE-WHERE-IN for an - * already-absent row is a no-op; the transaction must still commit, promoting the - * new file and removing the remaining old file. + * {@code atomicSwapFiles} is called with both IDs. The UPDATE for an already-absent + * row is a no-op; the transaction must still commit, promoting the new file and + * retiring the remaining old file. 
*/ @Test public void testAtomicSwap_partialOldFilesAlreadyGone() throws Exception @@ -1970,16 +2266,17 @@ public void testAtomicSwap_partialOldFilesAlreadyGone() throws Exception metadataService.deleteFiles(Collections.singletonList(oldIds[0])); assertFileGone(oldIds[0], "old1 should be gone before swap"); - long newFileId = registerTestFile("partial_new.pxl", File.Type.TEMPORARY, 1, 0, 1); - metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1])); + long newFileId = registerTestFile("partial_new.pxl", File.Type.TEMPORARY_GC, 1, 0, 1); + long cleanupAt = 1_700_000_070_000L; + metadataService.atomicSwapFiles(newFileId, Arrays.asList(oldIds[0], oldIds[1]), cleanupAt); assertFileRegular(newFileId, "New file must be REGULAR"); - assertFileGone(oldIds[1], "Remaining old file should be gone"); + assertFileRetired(oldIds[1], cleanupAt, "Remaining old file should be RETIRED"); } /** * Rollback after rewrite + dual-write: verifies that Visibility entries for the new - * file are removed, dual-write is unregistered, the TEMPORARY catalog entry is deleted, + * file are removed, dual-write is unregistered, the TEMPORARY_GC catalog entry is deleted, * and the physical file is cleaned up. */ @Test @@ -2011,6 +2308,86 @@ public void testAtomicSwap_rollbackCleansUp() throws Exception assertFileGone(result.newFileId, "Catalog entry should be deleted after rollback"); } + // ======================================================================= + // Section: Storage GC WAL ordering invariant (runStorageGC synchrony) + // + // Recovery (StorageGcWal.RecoveryHandler) relies on the invariant that a + // checkpoint baseline can only ever contain a newFile whose task already + // reached SWAPPED_NOT_CHECKPOINTED. 
That holds because RRM.runGC publishes + // the checkpoint (Step 3) strictly after runStorageGC returns (Step 2), and + // runStorageGC is synchronous: every WAL task opened during a GC round is + // terminalized (SWAPPED on commit, ABORTED on rollback) before the + // synchronous entry point returns — never left lingering in INDEX_SWITCHING. + // These tests lock that load-bearing synchrony property; a regression here + // (e.g. making the GC asynchronous) would surface the recovery fail-fast. + // ======================================================================= + + /** + * Successful GC round: {@code processFileGroup} must leave the WAL task in + * SWAPPED_NOT_CHECKPOINTED, never INDEX_SWITCHING, by the time it returns. + */ + @Test + public void testProcessFileGroup_commitLeavesWalSwapped_neverIndexSwitching() throws Exception + { + long[] ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + long[] ts = new long[10]; + Arrays.fill(ts, 100); + String filePath = writeTestFile("wal_commit_src.pxl", LONG_ID_SCHEMA, ids, true, ts); + long srcFileId = registerTestFile("wal_commit_src.pxl", File.Type.REGULAR, 1, 0, 9); + retinaManager.addVisibility(srcFileId, 0, 10, 50, null, true); + + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(srcFileId, 0), makeBitmap(10, 6)); + + WalTrackingSyncGC walGc = new WalTrackingSyncGC(retinaManager, metadataService, storageGcWal, + 0.5, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, EncodingLevel.EL2, 86_400_000L, false); + StorageGarbageCollector.FileGroup group = makeGroup(srcFileId, filePath, LONG_ID_SCHEMA); + + walGc.processFileGroup(group, 100L, bitmaps); + + List tasks = storageGcWal.listAllTasks(); + assertEquals("exactly one WAL task expected", 1, tasks.size()); + assertEquals("committed GC round must leave the task SWAPPED", + StorageGcWal.State.SWAPPED_NOT_CHECKPOINTED, tasks.get(0).getState()); + assertNoIndexSwitchingTask(tasks); + assertFileRegular(tasks.get(0).getNewFileId(), "new file must be REGULAR after 
commit"); + } + + /** + * Crash inside the swap→markSwapped window (simulated by throwing right after the WAL + * task is opened): {@code processFileGroup} must catch it and roll back, terminalizing + * the task to ABORTED — never leaving it in INDEX_SWITCHING — and leaving the source + * file REGULAR with the new file cleaned up. + */ + @Test + public void testProcessFileGroup_crashDuringIndexSwitch_walAborted_neverIndexSwitching() throws Exception + { + long[] ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + long[] ts = new long[10]; + Arrays.fill(ts, 100); + String filePath = writeTestFile("wal_abort_src.pxl", LONG_ID_SCHEMA, ids, true, ts); + long srcFileId = registerTestFile("wal_abort_src.pxl", File.Type.REGULAR, 1, 0, 9); + retinaManager.addVisibility(srcFileId, 0, 10, 50, null, true); + + Map bitmaps = new HashMap<>(); + bitmaps.put(RetinaUtils.buildRgKey(srcFileId, 0), makeBitmap(10, 6)); + + WalTrackingSyncGC walGc = new WalTrackingSyncGC(retinaManager, metadataService, storageGcWal, + 0.5, 134_217_728L, Integer.MAX_VALUE, 10, 1048576, EncodingLevel.EL2, 86_400_000L, true); + StorageGarbageCollector.FileGroup group = makeGroup(srcFileId, filePath, LONG_ID_SCHEMA); + + // processFileGroup swallows the injected failure and rolls back internally. + walGc.processFileGroup(group, 100L, bitmaps); + + List tasks = storageGcWal.listAllTasks(); + assertEquals("exactly one WAL task expected", 1, tasks.size()); + assertEquals("crashed GC round must be rolled back to ABORTED", + StorageGcWal.State.ABORTED, tasks.get(0).getState()); + assertNoIndexSwitchingTask(tasks); + assertFileRegular(srcFileId, "source must stay REGULAR after a rolled-back GC round"); + assertFileGone(tasks.get(0).getNewFileId(), "new file catalog must be removed after rollback"); + } + /** Delayed cleanup removes old file Visibility and physical file after wall-clock deadline passes. 
*/ @Test public void testAtomicSwap_delayedCleanup() throws Exception @@ -2063,7 +2440,7 @@ public void testAtomicSwap_delayedCleanup() throws Exception * Phase 3 (ts=200, dual-write active): delete row 3 → propagated to both files * Sync visibility → export + coord-transform + import * Phase 4 (ts=300, post-sync, dual-write still active): delete row 5 - * Commit → atomic swap (TEMPORARY→REGULAR), old file removed from catalog + * Commit -> atomic swap (TEMPORARY_GC -> REGULAR), old file removed from catalog * Verify: multi-snap_ts consistency on new file at ts=100..500 * Verify: old file gone from catalog, new file REGULAR * @@ -2168,7 +2545,8 @@ public void testEndToEnd_fullGcCycle() throws Exception e2eGc.commitFileGroup(result); assertFileRegular(newFileId, "new file should be REGULAR after commit"); - assertFileGone(srcFileId, "old file should be gone from catalog after commit"); + assertFileRetiredWithCleanupAt(srcFileId, + "old file should be RETIRED in catalog after commit"); assertTrue("old physical file should still exist (delayed cleanup, not yet due)", fileStorage.exists(srcPath)); @@ -2453,7 +2831,7 @@ public void testEndToEnd_concurrentCdcAndGc() throws Exception // 3b. Verify catalog state assertFileRegular(newFileId, "new file should be REGULAR"); - assertFileGone(srcFileId, "old file should be gone from catalog"); + assertFileRetiredWithCleanupAt(srcFileId, "old file should be RETIRED in catalog"); // 3c. 
Forward mapping int[] fwd = result.forwardRgMappings.get(srcFileId).get(0); @@ -2831,10 +3209,10 @@ public void testEndToEnd_multiRoundCdcGcLifecycle() throws Exception assertNotNull("file-B must still exist (not GCed)", metadataService.getFileById(fileIdB)); assertNotNull("file-C must still exist", metadataService.getFileById(fileIdC)); - // Old generations gone from catalog - assertFileGone(fileIdA, "file-A should be gone from catalog"); - assertFileGone(fileIdAprime, "file-A' should be gone from catalog"); - assertFileGone(fileIdAdoubleprime, "file-A'' should be gone from catalog"); + // Old generations are retired in catalog + assertFileRetiredWithCleanupAt(fileIdA, "file-A should be RETIRED in catalog"); + assertFileRetiredWithCleanupAt(fileIdAprime, "file-A' should be RETIRED in catalog"); + assertFileRetiredWithCleanupAt(fileIdAdoubleprime, "file-A'' should be RETIRED in catalog"); // Physical files from generations 1 and 2 cleaned up assertFalse("file-A physical should not exist", fileStorage.exists(pathA)); @@ -2952,6 +3330,27 @@ private long registerTestFile(String name, File.Type type, return id; } + private long insertRawFileWithType(String name, int fileType, + int numRg, long minRow, long maxRow) + throws Exception + { + String sql = "INSERT INTO FILES(FILE_NAME, FILE_TYPE, FILE_NUM_RG, FILE_MIN_ROW_ID, FILE_MAX_ROW_ID, PATHS_PATH_ID) " + + "VALUES (?, ?, ?, ?, ?, ?)"; + try (PreparedStatement pst = MetaDBUtil.Instance().getConnection().prepareStatement(sql)) + { + pst.setString(1, name); + pst.setInt(2, fileType); + pst.setInt(3, numRg); + pst.setLong(4, minRow); + pst.setLong(5, maxRow); + pst.setLong(6, testPathId); + assertEquals("raw test file insert should affect one row", 1, pst.executeUpdate()); + } + long id = metadataService.getFileId(testOrderedPathUri + "/" + name); + assertTrue(name + " must have valid id", id > 0); + return id; + } + private long[] registerTestFiles(String[] names, File.Type[] types, int[] numRgs, long[] minRows, long[] 
maxRows) throws Exception @@ -3000,6 +3399,31 @@ private void assertFileRegular(long fileId, String msg) throws Exception assertEquals(msg, File.Type.REGULAR, f.getType()); } + private void assertFileRetired(long fileId, long cleanupAt, String msg) throws Exception + { + File f = metadataService.getFileById(fileId); + assertNotNull(msg, f); + assertEquals(msg, File.Type.RETIRED, f.getType()); + assertEquals(msg, Long.valueOf(cleanupAt), f.getCleanupAt()); + } + + private void assertFileRetiredWithCleanupAt(long fileId, String msg) throws Exception + { + File f = metadataService.getFileById(fileId); + assertNotNull(msg, f); + assertEquals(msg, File.Type.RETIRED, f.getType()); + assertNotNull(msg, f.getCleanupAt()); + } + + private static void assertNoIndexSwitchingTask(List tasks) + { + for (StorageGcWal.Task t : tasks) + { + assertFalse("no WAL task may be left in INDEX_SWITCHING after a synchronous GC round", + t.getState() == StorageGcWal.State.INDEX_SWITCHING); + } + } + // ======================================================================= // Helpers: GC factory for grouping tests // ======================================================================= @@ -3008,8 +3432,8 @@ private static StorageGarbageCollector newGcForGrouping( long targetFileSize, int maxFilesPerGroup, int maxGroups) { return new StorageGarbageCollector( - null, null, 0.5, targetFileSize, maxFilesPerGroup, maxGroups, - 1048576, EncodingLevel.EL2, 86_400_000L); + null, null, null, 0.5, targetFileSize, maxFilesPerGroup, maxGroups, + 1048576, EncodingLevel.EL2, 86_400_000L, new StorageGcWal()); } // ======================================================================= @@ -3680,8 +4104,8 @@ static class DirectScanStorageGC extends StorageGarbageCollector DirectScanStorageGC(RetinaResourceManager rm, double threshold, int maxGroups, List fakeEntries) { - super(rm, null, threshold, 134_217_728L, Integer.MAX_VALUE, maxGroups, - 1048576, EncodingLevel.EL2, 86_400_000L); + super(rm, null, 
null, threshold, 134_217_728L, Integer.MAX_VALUE, maxGroups, + 1048576, EncodingLevel.EL2, 86_400_000L, new StorageGcWal()); this.fakeEntries = fakeEntries; } @@ -3719,6 +4143,53 @@ void processFileGroups(List fileGroups, long safeGcTs, } } + /** + * StorageGarbageCollector subclass that records the boundaries between + * {@code runStorageGC}'s candidate calculation, scan, bitmap trimming, and + * process phases without touching real metadata or Pixels files. + */ + static class TrackingRunStorageGC extends StorageGarbageCollector + { + private final List groupsToReturn; + boolean scanCalled; + boolean processCalled; + RuntimeException processFailure; + Set capturedCandidateFileIds = Collections.emptySet(); + List capturedFileGroups = Collections.emptyList(); + long capturedSafeGcTs = Long.MIN_VALUE; + Set bitmapKeysSeenByProcess = Collections.emptySet(); + + TrackingRunStorageGC(List groupsToReturn) + { + super(null, null, null, 0.5, 0L, Integer.MAX_VALUE, 10, + 1048576, EncodingLevel.EL2, 86_400_000L, new StorageGcWal()); + this.groupsToReturn = groupsToReturn; + } + + @Override + List scanAndGroupFiles(Set candidateFileIds, + Map fileStats) + { + this.scanCalled = true; + this.capturedCandidateFileIds = new HashSet<>(candidateFileIds); + return groupsToReturn; + } + + @Override + void processFileGroups(List fileGroups, long safeGcTs, + Map gcSnapshotBitmaps) + { + this.processCalled = true; + this.capturedFileGroups = new ArrayList<>(fileGroups); + this.capturedSafeGcTs = safeGcTs; + this.bitmapKeysSeenByProcess = new HashSet<>(gcSnapshotBitmaps.keySet()); + if (processFailure != null) + { + throw processFailure; + } + } + } + /** * StorageGarbageCollector subclass where {@code rewriteFileGroup} throws on * the first call and succeeds (cleaning up bitmaps) on subsequent calls. 
@@ -3732,8 +4203,8 @@ static class FailFirstGroupGC extends StorageGarbageCollector FailFirstGroupGC() { - super(null, null, 0.5, 0L, Integer.MAX_VALUE, 10, - 1048576, EncodingLevel.EL2, 86_400_000L); + super(null, null, null, 0.5, 0L, Integer.MAX_VALUE, 10, + 1048576, EncodingLevel.EL2, 86_400_000L, new StorageGcWal()); } @Override @@ -3770,13 +4241,65 @@ static class NoIndexSyncGC extends StorageGarbageCollector int maxGroups, int rowGroupSize, EncodingLevel encodingLevel, long retireDelayMs) { - super(rm, ms, threshold, targetFileSize, maxFilesPerGroup, maxGroups, - rowGroupSize, encodingLevel, retireDelayMs); + super(rm, ms, null, threshold, targetFileSize, maxFilesPerGroup, maxGroups, + rowGroupSize, encodingLevel, retireDelayMs, new StorageGcWal()); + } + + @Override + void syncIndex(RewriteResult result, long tableId) throws Exception + { + } + } + + /** + * Test double that performs the WAL bookkeeping of the production {@code syncIndex} + * (open a task → INDEX_SWITCHING, flush) against an injected {@link StorageGcWal}, but + * skips the heavy primary-index machinery. Optionally throws right after the task is + * opened to simulate a crash inside the swap→markSwapped window, so tests can assert + * that the synchronous GC entry point ({@code processFileGroup}) never leaves a task in + * INDEX_SWITCHING — committing it to SWAPPED or rolling it back to ABORTED. 
+ */ + static class WalTrackingSyncGC extends StorageGarbageCollector + { + private final StorageGcWal wal; + private final boolean throwAfterOpen; + + WalTrackingSyncGC(RetinaResourceManager rm, MetadataService ms, StorageGcWal wal, + double threshold, long targetFileSize, int maxFilesPerGroup, + int maxGroups, int rowGroupSize, EncodingLevel encodingLevel, + long retireDelayMs, boolean throwAfterOpen) + { + super(rm, ms, LocalIndexService.Instance(), threshold, targetFileSize, + maxFilesPerGroup, maxGroups, rowGroupSize, encodingLevel, retireDelayMs, wal); + this.wal = wal; + this.throwAfterOpen = throwAfterOpen; } @Override void syncIndex(RewriteResult result, long tableId) throws Exception { + int totalRows = result.newFileRgRowStart[result.newFileRgCount]; + if (totalRows == 0) + { + return; + } + result.newRowIdStart = 0L; + String journalTaskId = "storage-gc-test-" + tableId + "-" + result.newFileId; + List oldFileIds = new ArrayList<>(); + for (FileCandidate fc : result.group.files) + { + oldFileIds.add(fc.fileId); + } + result.walWriter = wal.createTask(journalTaskId, tableId, + result.group.virtualNodeId, oldFileIds, result.newFileId, + result.newFilePath, 0L, totalRows); + result.walWriter.flush(); + // Task is now durably INDEX_SWITCHING. A crash here must be terminalized to + // ABORTED by processFileGroup's rollback, never left lingering. + if (throwAfterOpen) + { + throw new RuntimeException("injected crash mid index-switch"); + } } } } diff --git a/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGcWal.java b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGcWal.java new file mode 100644 index 0000000000..4e2a895a90 --- /dev/null +++ b/pixels-retina/src/test/java/io/pixelsdb/pixels/retina/TestStorageGcWal.java @@ -0,0 +1,624 @@ +/* + * Copyright 2026 PixelsDB. + * + * This file is part of Pixels. 
+ * + * Pixels is free software: you can redistribute it and/or modify + * it under the terms of the Affero GNU General Public License as + * published by the Free Software Foundation, either version 3 of + * the License, or (at your option) any later version. + * + * Pixels is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Affero + * GNU General Public License for more details. + * + * You should have received a copy of the Affero GNU General Public + * License along with Pixels. If not, see + * . + */ +package io.pixelsdb.pixels.retina; + +import com.google.protobuf.ByteString; +import io.pixelsdb.pixels.common.exception.RetinaException; +import io.pixelsdb.pixels.common.index.IndexOption; +import io.pixelsdb.pixels.common.index.RollbackEntry; +import io.pixelsdb.pixels.common.index.service.IndexService; +import io.pixelsdb.pixels.common.metadata.MetadataService; +import io.pixelsdb.pixels.common.metadata.domain.File; +import io.pixelsdb.pixels.common.metadata.domain.SinglePointIndex; +import io.pixelsdb.pixels.common.physical.Storage; +import io.pixelsdb.pixels.common.physical.StorageFactory; +import io.pixelsdb.pixels.common.utils.ConfigFactory; +import io.pixelsdb.pixels.index.IndexProto; +import org.junit.Before; +import org.junit.Test; +import org.mockito.ArgumentCaptor; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.eq; +import static 
org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Unit tests for the Storage GC append-only WAL: persistence, state machine, + * store queries, and startup recovery. + */ +public class TestStorageGcWal +{ + @Before + public void setUp() throws IOException + { + cleanupWalDir(); + } + + private static void cleanupWalDir() throws IOException + { + String walDir = ConfigFactory.Instance().getProperty("retina.storage.gc.journal.dir"); + Storage storage = StorageFactory.Instance().getStorage(walDir); + if (!storage.exists(walDir)) + { + return; + } + for (String path : storage.listPaths(walDir)) + { + storage.delete(path, false); + } + } + + private static StorageGcWal newWal() + { + return new StorageGcWal(); + } + + // ─── WAL file format and state machine ─────────────────────────────────── + + @Test + public void testWal_createAndReplayTask() throws IOException + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-1", 11L, 2, Arrays.asList(101L, 102L), + 201L, "file:///tmp/pixels/gc-201.pxl", 3000L, 2)) + { + w.appendRollbackEntry(key("pk-a"), 10L, 20L); + w.flush(); + w.markSwapped(); // closes the writer + } + + Optional loaded = wal.getTask("task-1"); + assertTrue(loaded.isPresent()); + StorageGcWal.Task task = loaded.get(); + assertEquals("task-1", task.getTaskId()); + assertEquals(11L, task.getTableId()); + assertEquals(2, task.getVirtualNodeId()); + assertEquals(Arrays.asList(101L, 102L), task.getOldFileIds()); + assertEquals(201L, task.getNewFileId()); + assertEquals(3000L, task.getNewRowIdStart()); + assertEquals(2, task.getNewRowCount()); + assertEquals(StorageGcWal.State.SWAPPED_NOT_CHECKPOINTED, task.getState()); + assertEquals(1, task.getRollbackEntries().size()); + assertEquals(10L, task.getRollbackEntries().get(0).getOldRowId()); + assertEquals(20L, task.getRollbackEntries().get(0).getNewRowId()); + } + + 
@Test + public void testWal_stateMachine_indexSwitchingToAborted() throws IOException + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-abort", 11L, 2, Collections.singletonList(101L), + 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(key("pk-x"), 5L, 15L); + w.markAborted(); // closes + } + + StorageGcWal.Task task = wal.getTask("task-abort").get(); + assertEquals(StorageGcWal.State.ABORTED, task.getState()); + assertEquals(1, task.getRollbackEntries().size()); + } + + @Test + public void testWal_markCheckpointed_coldPath() throws IOException + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-ck", 11L, 2, Collections.singletonList(101L), + 201L, "", 3000L, 2)) + { + w.markSwapped(); + } + assertEquals(StorageGcWal.State.SWAPPED_NOT_CHECKPOINTED, + wal.getTask("task-ck").get().getState()); + + wal.markCheckpointed("task-ck"); + + assertEquals(StorageGcWal.State.CHECKPOINTED, + wal.getTask("task-ck").get().getState()); + } + + // ─── Store queries ──────────────────────────────────────────────────────── + + @Test + public void testWal_gcWorkflowAndQueries() throws IOException + { + StorageGcWal wal = newWal(); + + // Pending task + try (StorageGcWal.Writer w = wal.createTask( + "task-1", 11L, 2, Arrays.asList(101L, 102L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(key("pk-a"), 10L, 20L); + w.flush(); + w.markSwapped(); + } + assertTrue(wal.collectPendingFileIds().containsAll(Arrays.asList(101L, 102L, 201L))); + + // Terminal tasks + try (StorageGcWal.Writer w = wal.createTask( + "task-terminal", 12L, 3, Collections.singletonList(102L), 202L, "", 4000L, 3)) + { + w.markSwapped(); + } + wal.markCheckpointed("task-terminal"); + + try (StorageGcWal.Writer w = wal.createTask( + "task-aborted", 13L, 4, Collections.singletonList(103L), 203L, "", 5000L, 4)) + { + w.markAborted(); + } + + List terminalTasks = wal.listTerminalTasks(); + assertEquals(2, terminalTasks.size()); + + 
wal.deleteTerminalTasks(Arrays.asList("task-terminal", "task-aborted")); + assertFalse(wal.getTask("task-terminal").isPresent()); + assertFalse(wal.getTask("task-aborted").isPresent()); + + // After marking task-1 checkpointed, pending set should be empty + wal.markCheckpointed("task-1"); + assertTrue(wal.collectPendingFileIds().isEmpty()); + + // Non-terminal task may not be deleted + try (StorageGcWal.Writer w = wal.createTask( + "task-pending", 14L, 5, Collections.singletonList(104L), 204L, "", 6000L, 5)) + { + w.markSwapped(); + } + try + { + wal.deleteTerminalTasks(Collections.singletonList("task-pending")); + fail("deleteTerminalTasks must reject non-terminal tasks"); + } + catch (IllegalArgumentException expected) + { + // expected + } + } + + // ─── Recovery tests ─────────────────────────────────────────────────────── + + @Test + public void testRecovery_acceptSwapAndMultiRoundLifecycle() throws Exception + { + StorageGcWal wal = newWal(); + + // First round: already CHECKPOINTED + try (StorageGcWal.Writer w = wal.createTask( + "first-round", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.markSwapped(); + } + wal.markCheckpointed("first-round"); + + // Second round: SWAPPED, not yet checkpointed + try (StorageGcWal.Writer w = wal.createTask( + "second-round", 11L, 3, Collections.singletonList(201L), 301L, "", 4000L, 3)) + { + w.markSwapped(); + } + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, mock(MetadataService.class), mock(IndexService.class)); + // Baseline includes both 201L (first-round output) and 301L (second-round output) + handler.recover(new HashSet<>(Arrays.asList(201L, 301L))); + + assertEquals(StorageGcWal.State.CHECKPOINTED, wal.getTask("second-round").get().getState()); + assertEquals(StorageGcWal.State.CHECKPOINTED, wal.getTask("first-round").get().getState()); + + wal.deleteTerminalTasks(Arrays.asList("first-round", "second-round")); + 
assertFalse(wal.getTask("first-round").isPresent()); + assertFalse(wal.getTask("second-round").isPresent()); + } + + @Test + @SuppressWarnings({"unchecked", "rawtypes"}) + public void testRecovery_rejectedSwapRollsBackCatalogAndIndex() throws Exception + { + StorageGcWal wal = newWal(); + IndexProto.IndexKey indexKey = key("pk-a"); + try (StorageGcWal.Writer w = wal.createTask( + "task-rejected", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(indexKey, 10L, 20L); + w.flush(); + w.markSwapped(); + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + SinglePointIndex primaryIndex = new SinglePointIndex(); + primaryIndex.setId(22L); + File oldFile = catalogFile(101L, File.Type.RETIRED, 123456L); + File newFile = catalogFile(201L, File.Type.REGULAR, null); + + when(metadataService.getPrimaryIndex(11L)).thenReturn(primaryIndex); + when(metadataService.getFileById(101L)).thenReturn(oldFile); + when(metadataService.getFileById(201L)).thenReturn(newFile); + when(metadataService.updateFile(any(File.class))).thenReturn(true); + when(metadataService.deleteFiles(Collections.singletonList(201L))).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.singleton(101L)); + + assertEquals(StorageGcWal.State.ABORTED, wal.getTask("task-rejected").get().getState()); + assertEquals(File.Type.REGULAR, oldFile.getType()); + assertEquals(null, oldFile.getCleanupAt()); + + ArgumentCaptor rollbackCaptor = ArgumentCaptor.forClass(List.class); + verify(indexService).restorePrimaryIndexEntries( + eq(11L), eq(22L), rollbackCaptor.capture(), any(IndexOption.class)); + RollbackEntry rollbackEntry = (RollbackEntry) rollbackCaptor.getValue().get(0); + assertEquals(indexKey, rollbackEntry.getIndexKey()); + assertEquals(10L, rollbackEntry.getOldRowId()); + assertEquals(20L, 
rollbackEntry.getNewRowId()); + verify(indexService).deleteMainIndexRange(11L, 201L, 3000L, 2); + verify(metadataService).updateFile(oldFile); + verify(metadataService).deleteFiles(Collections.singletonList(201L)); + } + + @Test + public void testRecovery_incompleteAndAbortedTasks() throws Exception + { + StorageGcWal wal = newWal(); + IndexProto.IndexKey indexKey = key("pk-index-switching"); + + // INDEX_SWITCHING: WAL has rollback entry but no state transition (crash mid-sync) + try (StorageGcWal.Writer w = wal.createTask( + "task-index-switching", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(indexKey, 10L, 20L); + w.flush(); + // no markSwapped/markAborted — simulates crash; Writer.close() leaves INDEX_SWITCHING + } + + // Already ABORTED + try (StorageGcWal.Writer w = wal.createTask( + "task-aborted", 11L, 2, Collections.singletonList(105L), 205L, "", 5000L, 5)) + { + w.markAborted(); + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + SinglePointIndex primaryIndex = new SinglePointIndex(); + primaryIndex.setId(22L); + File switchingOldFile = catalogFile(101L, File.Type.REGULAR, null); + File switchingNewFile = catalogFile(201L, File.Type.TEMPORARY_GC, null); + File abortedNewFile = catalogFile(205L, File.Type.TEMPORARY_GC, null); + + when(metadataService.getPrimaryIndex(11L)).thenReturn(primaryIndex); + when(metadataService.getFileById(101L)).thenReturn(switchingOldFile); + when(metadataService.getFileById(201L)).thenReturn(switchingNewFile); + when(metadataService.getFileById(205L)).thenReturn(abortedNewFile); + when(metadataService.deleteFiles(any())).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.emptySet()); + + assertEquals(StorageGcWal.State.ABORTED, + wal.getTask("task-index-switching").get().getState()); + 
assertEquals(StorageGcWal.State.ABORTED, + wal.getTask("task-aborted").get().getState()); + verify(indexService).restorePrimaryIndexEntries( + eq(11L), eq(22L), + org.mockito.ArgumentMatchers.>any(), + any(IndexOption.class)); + verify(indexService).deleteMainIndexRange(11L, 201L, 3000L, 2); + verify(indexService).deleteMainIndexRange(11L, 205L, 5000L, 5); + verify(metadataService).deleteFiles(Collections.singletonList(201L)); + verify(metadataService).deleteFiles(Collections.singletonList(205L)); + verify(metadataService, never()).updateFile(switchingOldFile); + } + + @Test + public void testRecovery_checkpointedMissingFromBaselineFailsClosed() throws Exception + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-final", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.markSwapped(); + } + wal.markCheckpointed("task-final"); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, mock(MetadataService.class), mock(IndexService.class)); + try + { + handler.recover(Collections.singleton(101L)); + fail("CHECKPOINTED WAL task missing from baseline must fail closed"); + } + catch (RetinaException e) + { + assertEquals(StorageGcWal.State.CHECKPOINTED, + wal.getTask("task-final").get().getState()); + } + } + + /** + * Guards the GC ordering invariant from the recovery side: a checkpoint baseline can + * only contain a newFile whose task already reached SWAPPED_NOT_CHECKPOINTED. If an + * INDEX_SWITCHING task's newFile is found in the baseline, the invariant was violated + * and recovery must fail closed rather than commit a half-switched primary index. + */ + @Test + public void testRecovery_indexSwitchingInBaseline_failsClosed() throws Exception + { + StorageGcWal wal = newWal(); + // Crash in the swap→markSwapped window leaves the task in INDEX_SWITCHING. 
+ try (StorageGcWal.Writer w = wal.createTask( + "task-switching-in-baseline", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(key("pk-a"), 10L, 20L); + w.flush(); + // no markSwapped/markAborted — leaves INDEX_SWITCHING + } + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, mock(MetadataService.class), mock(IndexService.class)); + try + { + // newFileId 201 present in the baseline while the task is still INDEX_SWITCHING. + handler.recover(Collections.singleton(201L)); + fail("INDEX_SWITCHING task present in baseline must fail closed"); + } + catch (RetinaException e) + { + assertTrue("error message must flag the invariant violation", + e.getMessage().contains("invariant violation")); + // State must be left untouched: neither committed nor aborted. + assertEquals(StorageGcWal.State.INDEX_SWITCHING, + wal.getTask("task-switching-in-baseline").get().getState()); + } + } + + /** + * INDEX_SWITCHING task whose newFile is NOT in the baseline, but whose catalog shows the + * swap already committed (old RETIRED, new REGULAR). The rollback must restore the old + * file catalog — {@code restoreOldFiles} is derived from {@link + * StorageGcWal.RecoveryHandler} via {@code isSwapCommitted}, not hard-coded by state. This + * complements {@link #testRecovery_incompleteAndAbortedTasks}, which covers the + * swap-not-committed branch where the old catalog is left alone. 
+ */ + @Test + public void testRecovery_indexSwitchingNotInBaseline_swapCommitted_restoresOldFiles() throws Exception + { + StorageGcWal wal = newWal(); + IndexProto.IndexKey indexKey = key("pk-a"); + try (StorageGcWal.Writer w = wal.createTask( + "task-switching-committed", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(indexKey, 10L, 20L); + w.flush(); + // no markSwapped — INDEX_SWITCHING, but the catalog below shows the swap committed + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + SinglePointIndex primaryIndex = new SinglePointIndex(); + primaryIndex.setId(22L); + // Swap committed before crash: old RETIRED, new REGULAR → isSwapCommitted() == true + File oldFile = catalogFile(101L, File.Type.RETIRED, 123456L); + File newFile = catalogFile(201L, File.Type.REGULAR, null); + when(metadataService.getPrimaryIndex(11L)).thenReturn(primaryIndex); + when(metadataService.getFileById(101L)).thenReturn(oldFile); + when(metadataService.getFileById(201L)).thenReturn(newFile); + when(metadataService.updateFile(any(File.class))).thenReturn(true); + when(metadataService.deleteFiles(Collections.singletonList(201L))).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.emptySet()); + + assertEquals(StorageGcWal.State.ABORTED, + wal.getTask("task-switching-committed").get().getState()); + // Swap committed → old file catalog restored to REGULAR with cleanupAt cleared. 
+ assertEquals(File.Type.REGULAR, oldFile.getType()); + assertEquals(null, oldFile.getCleanupAt()); + verify(metadataService).updateFile(oldFile); + verify(indexService).deleteMainIndexRange(11L, 201L, 3000L, 2); + } + + @Test + public void testBug1_restoreRegularOldFile_updateFileMustBeCalled() throws Exception + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-regular-old", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.markSwapped(); + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + File oldFile = catalogFile(101L, File.Type.REGULAR, 123456L); + File newFile = catalogFile(201L, File.Type.REGULAR, null); + + when(metadataService.getFileById(101L)).thenReturn(oldFile); + when(metadataService.getFileById(201L)).thenReturn(newFile); + when(metadataService.updateFile(any(File.class))).thenReturn(true); + when(metadataService.deleteFiles(Collections.singletonList(201L))).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.emptySet()); + + assertEquals(null, oldFile.getCleanupAt()); + verify(metadataService).updateFile(oldFile); + } + + @Test + public void testCorruptedWalFile_failsWithActionableMessage() throws IOException + { + StorageGcWal wal = newWal(); + // Create a valid task so listAllTasks has something to iterate + try (StorageGcWal.Writer w = wal.createTask( + "valid-task", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + // leave in INDEX_SWITCHING (no markSwapped/markAborted) + } + + // Inject a corrupted WAL file + String walDir = ConfigFactory.Instance().getProperty("retina.storage.gc.journal.dir"); + Storage storage = StorageFactory.Instance().getStorage(walDir); + String corruptedPath = io.pixelsdb.pixels.common.utils.RetinaUtils + .buildStorageGcJournalPath(walDir, "corrupted-99"); + 
try (DataOutputStream out = storage.create(corruptedPath, true, 64)) + { + out.write(new byte[]{0x01, 0x02, 0x03}); + } + + try + { + wal.listAllTasks(); + fail("expected IllegalStateException for corrupted WAL file"); + } + catch (IllegalStateException e) + { + assertTrue("error message must contain the corrupted file path", + e.getMessage().contains("corrupted-99")); + assertTrue("error message must guide operators to delete the file", + e.getMessage().contains("Delete this file")); + } + } + + @Test + public void testRecovery_preAbortedTaskOnlyCleanedUp() throws Exception + { + StorageGcWal wal = newWal(); + + // Already ABORTED from a previous run — only its new file is cleaned up again. + try (StorageGcWal.Writer w = wal.createTask( + "pre-aborted", 11L, 2, Collections.singletonList(100L), 200L, "", 2000L, 1)) + { + w.markAborted(); + } + + // Needs rollback this run + try (StorageGcWal.Writer w = wal.createTask( + "needs-rollback", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.markSwapped(); + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + File oldFile = catalogFile(101L, File.Type.RETIRED, 123456L); + File newFile = catalogFile(201L, File.Type.REGULAR, null); + File preAbortedNew = catalogFile(200L, File.Type.TEMPORARY_GC, null); + when(metadataService.getFileById(101L)).thenReturn(oldFile); + when(metadataService.getFileById(201L)).thenReturn(newFile); + when(metadataService.getFileById(200L)).thenReturn(preAbortedNew); + when(metadataService.updateFile(any(File.class))).thenReturn(true); + when(metadataService.deleteFiles(any())).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.emptySet()); + + // Pre-aborted task: only its new file is cleaned up again (idempotent), no catalog restore. 
+ assertEquals(StorageGcWal.State.ABORTED, wal.getTask("pre-aborted").get().getState()); + verify(metadataService).deleteFiles(Collections.singletonList(200L)); + // Needs-rollback task: full rollback — old file catalog restored and new file cleaned up. + assertEquals(StorageGcWal.State.ABORTED, wal.getTask("needs-rollback").get().getState()); + verify(metadataService).updateFile(oldFile); + verify(metadataService).deleteFiles(Collections.singletonList(201L)); + } + + @Test + @SuppressWarnings({"unchecked", "rawtypes"}) + public void testRollbackEntry_negativeOldRowId_isSkippedInIndexRestore() throws Exception + { + StorageGcWal wal = newWal(); + try (StorageGcWal.Writer w = wal.createTask( + "task-sentinel", 11L, 2, Collections.singletonList(101L), 201L, "", 3000L, 2)) + { + w.appendRollbackEntry(key("pk-sentinel"), -1L, 20L); + w.flush(); + w.markSwapped(); + } + + MetadataService metadataService = mock(MetadataService.class); + IndexService indexService = mock(IndexService.class); + File oldFile = catalogFile(101L, File.Type.RETIRED, 123456L); + File newFile = catalogFile(201L, File.Type.REGULAR, null); + when(metadataService.getFileById(101L)).thenReturn(oldFile); + when(metadataService.getFileById(201L)).thenReturn(newFile); + when(metadataService.updateFile(any(File.class))).thenReturn(true); + when(metadataService.deleteFiles(any())).thenReturn(true); + + StorageGcWal.RecoveryHandler handler = new StorageGcWal.RecoveryHandler( + wal, metadataService, indexService); + handler.recover(Collections.emptySet()); + + // oldRowId=-1 → no prior entry → restorePrimaryIndexEntries must NOT be called + verify(indexService, never()).restorePrimaryIndexEntries( + anyLong(), anyLong(), any(List.class), any(IndexOption.class)); + } + + // ─── Helpers ───────────────────────────────────────────────────────────── + + private static File catalogFile(long id, File.Type type, Long cleanupAt) + { + File file = new File(); + file.setId(id); + file.setName("f-" + id + ".pxl"); + 
file.setType(type); + file.setNumRowGroup(1); + file.setMinRowId(0L); + file.setMaxRowId(9L); + file.setPathId(1L); + file.setCleanupAt(cleanupAt); + return file; + } + + private static IndexProto.IndexKey key(String value) + { + return IndexProto.IndexKey.newBuilder() + .setTableId(11L) + .setIndexId(22L) + .setKey(ByteString.copyFromUtf8(value)) + .setTimestamp(33L) + .build(); + } +} diff --git a/pixels-storage/pixels-storage-gcs/src/main/java/io/pixelsdb/pixels/storage/gcs/GCS.java b/pixels-storage/pixels-storage-gcs/src/main/java/io/pixelsdb/pixels/storage/gcs/GCS.java index 72a265d261..0fb0c83591 100644 --- a/pixels-storage/pixels-storage-gcs/src/main/java/io/pixelsdb/pixels/storage/gcs/GCS.java +++ b/pixels-storage/pixels-storage-gcs/src/main/java/io/pixelsdb/pixels/storage/gcs/GCS.java @@ -316,6 +316,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize) t Channels.newOutputStream(writeChannel), bufferSize)); } + @Override + public DataOutputStream append(String path, int bufferSize) + { + throw new UnsupportedOperationException("append is not supported by storage backend: " + getScheme()); + } + @Override public boolean delete(String path, boolean recursive) throws IOException { diff --git a/pixels-storage/pixels-storage-hdfs/src/main/java/io/pixelsdb/pixels/storage/hdfs/HDFS.java b/pixels-storage/pixels-storage-hdfs/src/main/java/io/pixelsdb/pixels/storage/hdfs/HDFS.java index 40f0eb9ccd..30159cf955 100644 --- a/pixels-storage/pixels-storage-hdfs/src/main/java/io/pixelsdb/pixels/storage/hdfs/HDFS.java +++ b/pixels-storage/pixels-storage-hdfs/src/main/java/io/pixelsdb/pixels/storage/hdfs/HDFS.java @@ -320,6 +320,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize, return fs.create(new Path(path), overwrite, bufferSize, replication, blockSize); } + @Override + public DataOutputStream append(String path, int bufferSize) throws IOException + { + return fs.append(new Path(path), bufferSize); + } + 
@Override public boolean delete(String path, boolean recursive) throws IOException { diff --git a/pixels-storage/pixels-storage-http/src/main/java/io/pixelsdb/pixels/storage/http/HttpStream.java b/pixels-storage/pixels-storage-http/src/main/java/io/pixelsdb/pixels/storage/http/HttpStream.java index 9907adc431..169efc4986 100644 --- a/pixels-storage/pixels-storage-http/src/main/java/io/pixelsdb/pixels/storage/http/HttpStream.java +++ b/pixels-storage/pixels-storage-http/src/main/java/io/pixelsdb/pixels/storage/http/HttpStream.java @@ -122,6 +122,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize) t return new DataOutputStream(new HttpOutputStream(httpStreamPath.getHost(), httpStreamPath.getPort(), bufferSize)); } + @Override + public DataOutputStream append(String path, int bufferSize) + { + throw new UnsupportedOperationException("append is not supported by storage backend: " + getScheme()); + } + @Override public boolean delete(String path, boolean recursive) { diff --git a/pixels-storage/pixels-storage-localfs/src/main/java/io/pixelsdb/pixels/storage/localfs/LocalFS.java b/pixels-storage/pixels-storage-localfs/src/main/java/io/pixelsdb/pixels/storage/localfs/LocalFS.java index 0ca8c67407..8b8638109c 100644 --- a/pixels-storage/pixels-storage-localfs/src/main/java/io/pixelsdb/pixels/storage/localfs/LocalFS.java +++ b/pixels-storage/pixels-storage-localfs/src/main/java/io/pixelsdb/pixels/storage/localfs/LocalFS.java @@ -272,6 +272,26 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize) t return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file), bufferSize)); } + @Override + public DataOutputStream append(String path, int bufferSize) throws IOException + { + FilePath p = new FilePath(path); + if (!p.valid) + { + throw new IOException("Path '" + path + "' is not a valid local fs path."); + } + File file = new File(p.realPath); + if (!file.exists()) + { + throw new IOException("File 
'" + p.realPath + "' does not exist, cannot append."); + } + if (file.isDirectory()) + { + throw new IOException("Path '" + p.realPath + "' is a directory, it must be a file."); + } + return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file, true), bufferSize)); + } + public PixelsRandomAccessFile openRaf(String path) throws IOException { FilePath p = new FilePath(path); diff --git a/pixels-storage/pixels-storage-mock/src/main/java/io/pixelsdb/pixels/storage/mock/Mock.java b/pixels-storage/pixels-storage-mock/src/main/java/io/pixelsdb/pixels/storage/mock/Mock.java index 95fb17a508..606f2ab85f 100644 --- a/pixels-storage/pixels-storage-mock/src/main/java/io/pixelsdb/pixels/storage/mock/Mock.java +++ b/pixels-storage/pixels-storage-mock/src/main/java/io/pixelsdb/pixels/storage/mock/Mock.java @@ -120,6 +120,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize, s return null; } + @Override + public DataOutputStream append(String path, int bufferSize) + { + throw new UnsupportedOperationException("append is not supported by storage backend: " + getScheme()); + } + @Override public boolean delete(String path, boolean recursive) throws IOException { diff --git a/pixels-storage/pixels-storage-redis/src/main/java/io/pixelsdb/pixels/storage/redis/Redis.java b/pixels-storage/pixels-storage-redis/src/main/java/io/pixelsdb/pixels/storage/redis/Redis.java index a5764b3aa9..67051cd251 100644 --- a/pixels-storage/pixels-storage-redis/src/main/java/io/pixelsdb/pixels/storage/redis/Redis.java +++ b/pixels-storage/pixels-storage-redis/src/main/java/io/pixelsdb/pixels/storage/redis/Redis.java @@ -269,6 +269,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize) t return new DataOutputStream(new RedisOutputStream(this.jedis, path, bufferSize)); } + @Override + public DataOutputStream append(String path, int bufferSize) + { + throw new UnsupportedOperationException("append is not supported by storage 
backend: " + getScheme()); + } + @Override public boolean delete(String path, boolean recursive) throws IOException { diff --git a/pixels-storage/pixels-storage-s3/src/main/java/io/pixelsdb/pixels/storage/s3/AbstractS3.java b/pixels-storage/pixels-storage-s3/src/main/java/io/pixelsdb/pixels/storage/s3/AbstractS3.java index 946d2bdc6a..4b7e37f7f8 100644 --- a/pixels-storage/pixels-storage-s3/src/main/java/io/pixelsdb/pixels/storage/s3/AbstractS3.java +++ b/pixels-storage/pixels-storage-s3/src/main/java/io/pixelsdb/pixels/storage/s3/AbstractS3.java @@ -312,6 +312,12 @@ public DataOutputStream create(String path, boolean overwrite, int bufferSize) t return new DataOutputStream(new S3OutputStream(s3, p.bucket, p.key, bufferSize)); } + @Override + public DataOutputStream append(String path, int bufferSize) + { + throw new UnsupportedOperationException("append is not supported by storage backend: " + getScheme()); + } + @Override public boolean delete(String path, boolean recursive) throws IOException { diff --git a/proto/metadata.proto b/proto/metadata.proto index 575b868918..9cf7269944 100644 --- a/proto/metadata.proto +++ b/proto/metadata.proto @@ -65,7 +65,7 @@ service MetadataService { rpc UpdatePath (UpdatePathRequest) returns (UpdatePathResponse); rpc DeletePaths (DeletePathsRequest) returns (DeletePathsResponse); rpc AddFiles (AddFilesRequest) returns (AddFilesResponse); - rpc GetFiles (GetFilesRequest) returns (GetFilesResponse); + rpc GetFilesByType (GetFilesByTypeRequest) returns (GetFilesByTypeResponse); rpc GetFileId (GetFileIdRequest) returns (GetFileIdResponse); rpc GetFileType (GetFileTypeRequest) returns (GetFileTypeResponse); rpc UpdateFile (UpdateFileRequest) returns (UpdateFileResponse); @@ -174,8 +174,10 @@ message Path { message File { enum Type { - TEMPORARY = 0; + TEMPORARY_INGEST = 0; REGULAR = 1; + TEMPORARY_GC = 2; + RETIRED = 3; } uint64 id = 1; string name = 2; @@ -184,6 +186,7 @@ message File { uint64 minRowId = 5; uint64 maxRowId = 6; 
uint64 pathId = 7; + optional uint64 cleanupAt = 8; } message SchemaVersion { @@ -679,12 +682,14 @@ message AddFilesResponse { ResponseHeader header = 1; } -message GetFilesRequest { +message GetFilesByTypeRequest { + // pathId: if set, restricts the scan to one path; otherwise scans across paths. RequestHeader header = 1; - uint64 pathId = 2; + optional uint64 pathId = 2; + repeated File.Type fileTypes = 3; } -message GetFilesResponse { +message GetFilesByTypeResponse { ResponseHeader header = 1; repeated File files = 2; } @@ -706,7 +711,7 @@ message GetFileTypeRequest { message GetFileTypeResponse { ResponseHeader header = 1; - File.Type fileType = 2; // the type of the file, e.g., REGULAR or EMPTY + File.Type fileType = 2; // the type of the file, e.g., REGULAR or RETIRED } message UpdateFileRequest { @@ -741,6 +746,7 @@ message AtomicSwapFilesRequest { RequestHeader header = 1; uint64 newFileId = 2; repeated uint64 oldFileIds = 3; + optional uint64 cleanupAt = 4; } message AtomicSwapFilesResponse { @@ -844,4 +850,4 @@ message DropViewRequest { message DropViewResponse { ResponseHeader header = 1; } -// end request/response definition for rpc services \ No newline at end of file +// end request/response definition for rpc services diff --git a/proto/retina.proto b/proto/retina.proto index 65d056bdfc..6e1c25fafe 100644 --- a/proto/retina.proto +++ b/proto/retina.proto @@ -35,6 +35,8 @@ service RetinaWorkerService { rpc UpdateRecord (UpdateRecordRequest) returns (UpdateRecordResponse); // Bidirectional streaming method rpc StreamUpdateRecord (stream UpdateRecordRequest) returns (stream UpdateRecordResponse); + rpc GetRetinaStatus (GetRetinaStatusRequest) returns (GetRetinaStatusResponse); + rpc MarkReady (MarkReadyRequest) returns (MarkReadyResponse); rpc AddVisibility (AddVisibilityRequest) returns (AddVisibilityResponse); rpc QueryVisibility (QueryVisibilityRequest) returns (QueryVisibilityResponse); rpc ReclaimVisibility (ReclaimVisibilityRequest) returns 
(ReclaimVisibilityResponse); @@ -64,6 +66,18 @@ message VisibilityBitmap { repeated uint64 bitmap = 1; } +enum RetinaState { + UNKNOWN = 0; + RECOVERING = 1; + READY = 2; + FAILED = 3; +} + +message VnodeReplayStart { + uint32 virtualNodeId = 1; + uint64 startTs = 2; +} + // DeleteData: describes the deleted rows in a table // - Each DeleteData entry groups index keys that belong to the same row // - `indexKeys[0]` must be the primary index key @@ -113,6 +127,26 @@ message UpdateRecordResponse { ResponseHeader header = 1; } +message GetRetinaStatusRequest { + RequestHeader header = 1; +} + +message GetRetinaStatusResponse { + ResponseHeader header = 1; + RetinaState state = 2; + string recoveryEpoch = 3; + repeated VnodeReplayStart vnodeReplayStarts = 4; +} + +message MarkReadyRequest { + RequestHeader header = 1; + string recoveryEpoch = 2; +} + +message MarkReadyResponse { + ResponseHeader header = 1; +} + // visibility message AddVisibilityRequest { RequestHeader header = 1; diff --git a/proto/transaction.proto b/proto/transaction.proto index 631afedbc8..0489422470 100644 --- a/proto/transaction.proto +++ b/proto/transaction.proto @@ -22,8 +22,6 @@ syntax = "proto3"; -import "google/protobuf/empty.proto"; - option java_multiple_files = false; option java_package = "io.pixelsdb.pixels.daemon"; option java_outer_classname = "TransProto"; @@ -45,7 +43,8 @@ service TransService { rpc GetTransConcurrency (GetTransConcurrencyRequest) returns (GetTransConcurrencyResponse); rpc BindExternalTraceId (BindExternalTraceIdRequest) returns (BindExternalTraceIdResponse); rpc DumpTrans (DumpTransRequest) returns (DumpTransResponse); - rpc GetSafeGcTimestamp(google.protobuf.Empty) returns (GetSafeGcTimestampResponse); + rpc GetSafeVisibilityFoldingTimestamp(GetSafeVisibilityFoldingTimestampRequest) + returns (GetSafeVisibilityFoldingTimestampResponse); rpc MarkTransOffloaded (MarkTransOffloadedRequest) returns (MarkTransOffloadedResponse); } @@ -219,7 +218,12 @@ message 
DumpTransResponse { int32 errorCode = 1; } -message GetSafeGcTimestampResponse { +message GetSafeVisibilityFoldingTimestampRequest { + // True when the returned timestamp must remain safe for live running queries. + bool includeRunningQueries = 1; +} + +message GetSafeVisibilityFoldingTimestampResponse { int32 errorCode = 1; uint64 timestamp = 2; } diff --git a/scripts/sql/metadata_schema.sql b/scripts/sql/metadata_schema.sql index 3f077e4417..2558d2d1af 100644 --- a/scripts/sql/metadata_schema.sql +++ b/scripts/sql/metadata_schema.sql @@ -318,10 +318,11 @@ CREATE TABLE IF NOT EXISTS `pixels_metadata`.`PEER_PATHS` ( CREATE TABLE IF NOT EXISTS `pixels_metadata`.`FILES` ( `FILE_ID` BIGINT NOT NULL AUTO_INCREMENT, `FILE_NAME` VARCHAR(128) NOT NULL, - `FILE_TYPE` TINYINT NOT NULL COMMENT "Valid value can be 0 (temporary), or 1 (regular).", + `FILE_TYPE` TINYINT NOT NULL COMMENT "Valid value can be 0 (temporary ingest), 1 (regular), 2 (temporary gc), or 3 (retired).", `FILE_NUM_RG` INT NOT NULL, `FILE_MIN_ROW_ID` BIGINT NOT NULL, `FILE_MAX_ROW_ID` BIGINT NOT NULL, + `FILE_CLEANUP_AT` BIGINT NULL COMMENT "Earliest cleanup deadline in epoch milliseconds; meaningful only when FILE_TYPE = 3 (retired).", `PATHS_PATH_ID` BIGINT NOT NULL, PRIMARY KEY (`FILE_ID`), INDEX `fk_FILES_PATHS_idx` (`PATHS_PATH_ID` ASC),