From 126d943f48f4a3d27979e648001930586518d8ec Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht <w.vollprecht@gmail.com>
Date: Fri, 3 Jul 2026 10:14:26 +0200
Subject: [PATCH 1/3] Production hardening and C2SP Phase 0

Security and correctness:
- Fix rate limiter: install ConnectInfo (every request previously failed
  with 500), use SmartIpKeyExtractor behind proxies, spawn the cleanup
  task, keep Retry-After headers, and configure the token replenish
  interval correctly (per_second(n) means one token per n seconds, not
  n/s) with RATE_LIMIT_* env overrides
- Witness: compare-and-swap state updates so concurrent requests can
  never cosign two different roots at the same size (split-view TOCTOU);
  reject sizes above i64::MAX that would defeat rollback protection
- Verify external witness cosignatures against pinned keys before they
  count toward publication (EXTERNAL_WITNESSES=name=url=vkey)
- Vindex WAL: CRC32-checksummed v3 format, torn-tail truncation on
  recovery, single-write entries, idempotent index_entry, and vindex
  failures abort the integration cycle before entries are marked
  integrated

C2SP conformance (Phase 0):
- cosignature/v1 witness signatures (timestamped 76-byte blobs, alg-0x04
  key IDs) per c2sp.org/tlog-cosignature, pinned against the spec's
  example vector; witness-conformance suite passes 28/28
- Quorum-based checkpoint publishing (WITNESS_QUORUM)

Vindex memory:
- Periodic snapshots (CRC'd, atomic rename) truncate the WAL and bound
  startup replay (VINDEX_SNAPSHOT_INTERVAL); missing/corrupt/behind
  state auto-rebuilds from entry bundles in tile storage

Ops:
- SIGTERM-aware graceful shutdown, fail-fast worker supervision,
  atomic filesystem tile writes (temp+rename), request timeouts,
  witness rate limiting, non-root containers, docker-compose fixes
  (vindex WAL path, secrets via environment), strict tile-path digits,
  release overflow-checks

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Cargo.lock                     |  11 +
 Cargo.toml                     |  10 +-
 README.md                      |  26 +-
 docker/Dockerfile.local        |   7 +-
 docker/Dockerfile.server       |   8 +-
 docker/Dockerfile.witness      |   8 +-
 docker/docker-compose.yml      |  20 +-
 src/api/handlers.rs            |   4 +-
 src/api/paths.rs               |  18 +-
 src/api/rate_limit.rs          |  56 ++++-
 src/bin/witness.rs             |  49 +++-
 src/checkpoint/signer.rs       | 152 +++++++++++-
 src/error.rs                   |   5 +-
 src/lib.rs                     |   1 +
 src/main.rs                    | 215 +++++++++++-----
 src/merkle/proof.rs            |   7 +
 src/monitor/mod.rs             |  46 +++-
 src/shutdown.rs                |  29 +++
 src/storage/database.rs        |   8 +-
 src/storage/opendal.rs         |   9 +-
 src/vindex/mod.rs              | 383 +++++++++++++++++++++++++++--
 src/vindex/snapshot.rs         | 254 +++++++++++++++++++
 src/vindex/wal.rs              | 431 +++++++++++++++++++++++----------
 src/witness/mod.rs             |  52 ++--
 src/witness/state.rs           |  67 ++++-
 src/witness/verifier.rs        |  64 ++++-
 src/worker.rs                  | 281 ++++++++++++++++++---
 tests/witness_security_test.rs | 113 +++++++--
 witness-conformance/.gitignore |   1 +
 29 files changed, 1968 insertions(+), 367 deletions(-)
 create mode 100644 src/shutdown.rs
 create mode 100644 src/vindex/snapshot.rs

diff --git a/Cargo.lock b/Cargo.lock
index 2a7ae28..da74f81 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -639,6 +639,15 @@ dependencies = [
  "rustc_version",
 ]
 
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "crossbeam-queue"
 version = "0.3.12"
@@ -3256,6 +3265,7 @@ dependencies = [
  "base64",
  "chrono",
  "clap",
+ "crc32fast",
  "ed25519-dalek",
  "futures",
  "hex",
@@ -4043,6 +4053,7 @@ dependencies = [
  "http-body",
  "iri-string",
  "pin-project-lite",
+ "tokio",
  "tower",
  "tower-layer",
  "tower-service",
diff --git a/Cargo.toml b/Cargo.toml
index 3df362e..fac4177 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,7 +30,7 @@ path = "src/bin/witness.rs"
 # Web framework
 axum = "0.8"
 tokio = { version = "1", features = ["full"] }
-tower-http = { version = "0.6", features = ["cors", "trace"] }
+tower-http = { version = "0.6", features = ["cors", "trace", "timeout"] }
 tower = "0.5"
 tower_governor = "0.8"
 
@@ -79,6 +79,14 @@ indicatif = "0.18.3"
 # Optimization
 smallvec = "1.13"
 
+# WAL entry checksums
+crc32fast = "1"
+
+[profile.release]
+# Size/index arithmetic guards the Merkle tree and witness rollback
+# protection; wrap-on-overflow must never silently corrupt those checks.
+overflow-checks = true
+
 [dev-dependencies]
 tempfile = "3"
 portpicker = "0.1"
diff --git a/README.md b/README.md
index 32bc8e8..28d092c 100644
--- a/README.md
+++ b/README.md
@@ -84,12 +84,18 @@ cargo build --release
 | `S3_REGION` | S3 region | `auto` |
 | `API_KEY` | Bearer token required for `/add` writes | Required unless `ALLOW_PUBLIC_WRITES=true` |
 | `ALLOW_PUBLIC_WRITES` | Allow unauthenticated `/add` writes for local development | `false` |
+| `EXTERNAL_WITNESSES` | External witnesses to collect cosignatures from, comma-separated. Format: `name=url=vkey` — the note-format verification key is required and cosignatures are verified against it before a checkpoint is published | - |
+| `WITNESS_QUORUM` | Minimum number of external witness cosignatures required to publish a checkpoint | All configured witnesses |
+| `WITNESS_KEYS` | In-process witness private keys for local development (comma-separated) | - |
+| `VINDEX_SNAPSHOT_INTERVAL` | Entries between vindex snapshots. Each snapshot persists the full index and truncates the WAL, bounding WAL growth and startup replay time (0 disables) | `100000` |
+| `RATE_LIMIT_PER_SECOND` | Requests per second allowed per client IP | `100` |
+| `RATE_LIMIT_BURST_SIZE` | Burst capacity per client IP | `200` |
 | `CHECKPOINT_INTERVAL` | Checkpoint frequency (seconds) | `1` |
 | `BATCH_MAX_SIZE` | Max entries per batch | `256` |
 | `BATCH_MAX_AGE_MS` | Max batch age (ms) | `1000` |
 | `VINDEX_ENABLED` | Enable verifiable index | `false` |
 | `VINDEX_KEY_FIELD` | JSON field for key extraction | `name` |
-| `VINDEX_WAL_PATH` | WAL path for persistent vindex recovery | Required when enabling vindex on a non-empty log |
+| `VINDEX_WAL_PATH` | WAL path for persistent vindex state (snapshot is stored alongside as `<path>.snapshot`). If the on-disk state is missing, corrupted, or behind the database, the vindex is automatically rebuilt from the log's entry bundles | Recommended when enabling vindex |
 
 #### Witness Server (`witness`)
 
@@ -206,16 +212,18 @@ A witness independently verifies and co-signs transparency log checkpoints. Runn
 
 #### POST /add-checkpoint
 
-Request body:
-```json
-{
-  "checkpoint": "log.example.com\n123\nROOTHASH...\n\n- log.example.com SIGNATURE...",
-  "proof": ["HASH1...", "HASH2..."],
-  "old_size": 100
-}
+Request body (text/plain, per [c2sp.org/tlog-witness](https://c2sp.org/tlog-witness)):
+```text
+old <size>
+<base64 consistency proof hash>
+...
+
+<checkpoint text with log signature>
 ```
 
-Response (on success): The witness's cosignature line.
+Response (on success): the witness's [cosignature/v1](https://c2sp.org/tlog-cosignature)
+line — a timestamped Ed25519 signature whose key ID is computed with the
+cosignature/v1 algorithm byte (0x04).
 
 ## API Reference
 
diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local
index ddfbe66..3c94364 100644
--- a/docker/Dockerfile.local
+++ b/docker/Dockerfile.local
@@ -32,8 +32,11 @@ COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog
 COPY --from=builder /app/target/release/witness /usr/local/bin/witness
 COPY --from=builder /app/target/release/conda-monitor /usr/local/bin/conda-monitor
 
-# Create data directories
-RUN mkdir -p /data
+# Run as a non-root user with a writable data directory
+RUN useradd --system --uid 10001 --create-home --home-dir /data app \
+    && mkdir -p /data \
+    && chown -R app:app /data
+USER app
 
 WORKDIR /data
 
diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server
index b8b15d9..47b5ef2 100644
--- a/docker/Dockerfile.server
+++ b/docker/Dockerfile.server
@@ -27,8 +27,11 @@ RUN apt-get update && apt-get install -y \
 # Copy binary from builder
 COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog
 
-# Create data directory
-RUN mkdir -p /data
+# Run as a non-root user with a writable data directory
+RUN useradd --system --uid 10001 --create-home --home-dir /data siglog \
+    && mkdir -p /data \
+    && chown -R siglog:siglog /data
+USER siglog
 
 # Default environment variables
 ENV LISTEN_ADDR=0.0.0.0:8080
@@ -36,6 +39,7 @@ ENV DATABASE_URL=sqlite:/data/tessera.db?mode=rwc
 ENV STORAGE_BACKEND=fs
 ENV FS_ROOT=/data/tiles
 
+VOLUME /data
 EXPOSE 8080
 
 ENTRYPOINT ["siglog"]
diff --git a/docker/Dockerfile.witness b/docker/Dockerfile.witness
index ed02a36..f519ffe 100644
--- a/docker/Dockerfile.witness
+++ b/docker/Dockerfile.witness
@@ -27,13 +27,17 @@ RUN apt-get update && apt-get install -y \
 # Copy binary from builder
 COPY --from=builder /app/target/release/witness /usr/local/bin/witness
 
-# Create data directory
-RUN mkdir -p /data
+# Run as a non-root user with a writable data directory
+RUN useradd --system --uid 10001 --create-home --home-dir /data witness \
+    && mkdir -p /data \
+    && chown -R witness:witness /data
+USER witness
 
 # Default environment variables
 ENV LISTEN_ADDR=0.0.0.0:8081
 ENV DATABASE_URL=sqlite:/data/witness.db?mode=rwc
 
+VOLUME /data
 EXPOSE 8081
 
 ENTRYPOINT ["witness"]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 3be8898..aec0336 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -2,7 +2,14 @@
 #
 # Setup:
 #   Create a .env file with LOG_PRIVATE_KEY, LOG_PUBLIC_KEY,
-#   WITNESS_PRIVATE_KEY, and MONITOR_PRIVATE_KEY.
+#   WITNESS_PRIVATE_KEY, WITNESS_PUBLIC_KEY, MONITOR_PRIVATE_KEY,
+#   and MONITOR_PUBLIC_KEY.
+#
+#   Generate the witness key with name "local.dev/witness" and the monitor
+#   key with name "local.dev/monitor" (see README "Key Format"): the name
+#   embedded in each key must match the witness name configured in
+#   EXTERNAL_WITNESSES below, and the public keys are pinned so the log can
+#   verify cosignatures.
 #
 # Usage:
 #   docker compose -f docker/docker-compose.yml build
@@ -31,7 +38,6 @@ services:
       - --storage-backend=fs
       - --fs-root=/data/tiles
       - --origin=local.dev/log
-      - --private-key=${LOG_PRIVATE_KEY}
       - --listen=0.0.0.0:8080
       - --checkpoint-interval=1
       - --batch-max-size=256
@@ -39,9 +45,12 @@ services:
       - --allow-public-writes
       - --vindex-enabled
       - --vindex-key-field=name
-      - --external-witnesses=witness=http://witness:8080,monitor=http://monitor:8080
+      - --vindex-wal-path=/data/vindex.wal
     environment:
       RUST_LOG: info,siglog=debug
+      # Secrets via environment, not argv (argv is visible in docker inspect / ps)
+      LOG_PRIVATE_KEY: ${LOG_PRIVATE_KEY}
+      EXTERNAL_WITNESSES: "local.dev/witness=http://witness:8080=${WITNESS_PUBLIC_KEY},local.dev/monitor=http://monitor:8080=${MONITOR_PUBLIC_KEY}"
     ports:
       - "8080:8080"
     volumes:
@@ -60,11 +69,11 @@ services:
     command:
       - witness
       - --database-url=sqlite:/data/witness.db?mode=rwc
-      - --private-key=${WITNESS_PRIVATE_KEY}
       - --listen=0.0.0.0:8080
       - --log=local.dev/log=${LOG_PUBLIC_KEY}
     environment:
       RUST_LOG: info,witness=debug,siglog=debug
+      WITNESS_PRIVATE_KEY: ${WITNESS_PRIVATE_KEY}
     ports:
       - "8081:8080"
     volumes:
@@ -112,11 +121,12 @@ services:
     command:
       - conda-monitor
       - --database-url=sqlite:/data/monitor.db?mode=rwc
-      - --private-key=${MONITOR_PRIVATE_KEY}
       - --listen=0.0.0.0:8080
       - --log=local.dev/log=${LOG_PUBLIC_KEY}=http://log:8080
     environment:
       RUST_LOG: info,conda_monitor=debug,siglog=debug
+      # conda-monitor reads its signing key from WITNESS_PRIVATE_KEY
+      WITNESS_PRIVATE_KEY: ${MONITOR_PRIVATE_KEY}
     ports:
       - "8082:8080"
     volumes:
diff --git a/src/api/handlers.rs b/src/api/handlers.rs
index aa8e3e0..9e608a3 100644
--- a/src/api/handlers.rs
+++ b/src/api/handlers.rs
@@ -267,7 +267,7 @@ pub async fn vindex_lookup(
     key.copy_from_slice(&hash_bytes);
 
     let result = vindex.lookup(&key);
-    let root_hash = vindex.root_hash();
+    let root_hash = result.root_hash;
 
     let response = VindexLookupResponse {
         indices: result.indices.iter().map(|i| i.value()).collect(),
@@ -302,7 +302,7 @@ pub async fn vindex_lookup_key(
         .ok_or_else(|| Error::Internal("vindex not enabled".into()))?;
 
     let result = vindex.lookup_string(&key);
-    let root_hash = vindex.root_hash();
+    let root_hash = result.root_hash;
 
     let response = VindexLookupResponse {
         indices: result.indices.iter().map(|i| i.value()).collect(),
diff --git a/src/api/paths.rs b/src/api/paths.rs
index 907addc..2fd5ac9 100644
--- a/src/api/paths.rs
+++ b/src/api/paths.rs
@@ -77,7 +77,13 @@ pub fn entries_path_for_log_index(seq: u64, log_size: u64) -> String {
 /// Calculate the expected number of leaves in a tile at the given level and index
 /// within a tree of the specified size, or 0 if the tile is fully populated.
 pub fn partial_tile_size(level: u64, index: u64, log_size: u64) -> u8 {
-    let size_at_level = log_size >> (level * TILE_HEIGHT);
+    // Shifting a u64 by >= 64 bits overflows (a panic when overflow checks are on);
+    // levels that high can never have partial tiles for any representable tree size.
+    let shift = level.saturating_mul(TILE_HEIGHT);
+    if shift >= 64 {
+        return 0;
+    }
+    let size_at_level = log_size >> shift;
     let full_tiles = size_at_level / TILE_WIDTH;
 
     if index < full_tiles {
@@ -91,6 +97,11 @@ pub fn partial_tile_size(level: u64, index: u64, log_size: u64) -> u8 {
 ///
 /// Validates that level is between 0 and 63.
 pub fn parse_tile_level(level: &str) -> Result<u64> {
+    // Strict digits only: `str::parse::<u64>` also accepts a leading '+', which
+    // would create alias URLs for the same immutable tile (CDN cache ambiguity).
+    if level.is_empty() || !level.bytes().all(|b| b.is_ascii_digit()) {
+        return Err(Error::InvalidPath("invalid tile level".into()));
+    }
     let l: u64 = level
         .parse()
         .map_err(|_| Error::InvalidPath("invalid tile level".into()))?;
@@ -113,6 +124,9 @@ pub fn parse_tile_level(level: &str) -> Result<u64> {
 pub fn parse_tile_index(index_str: &str) -> Result<(u64, u8)> {
     let (index_part, partial) = if let Some(pos) = index_str.find(".p/") {
         let partial_str = &index_str[pos + 3..];
+        if partial_str.is_empty() || !partial_str.bytes().all(|b| b.is_ascii_digit()) {
+            return Err(Error::InvalidPath("invalid partial size".into()));
+        }
         let partial: u64 = partial_str
             .parse()
             .map_err(|_| Error::InvalidPath("invalid partial size".into()))?;
@@ -146,7 +160,7 @@ pub fn parse_tile_index(index_str: &str) -> Result<(u64, u8)> {
     for part in parts {
         let digits = part.strip_prefix('x').unwrap_or(part);
 
-        if digits.len() != 3 {
+        if digits.len() != 3 || !digits.bytes().all(|b| b.is_ascii_digit()) {
             return Err(Error::InvalidPath(
                 "each index component must be 3 digits".into(),
             ));
diff --git a/src/api/rate_limit.rs b/src/api/rate_limit.rs
index 24aa8e3..33f6a6c 100644
--- a/src/api/rate_limit.rs
+++ b/src/api/rate_limit.rs
@@ -13,14 +13,50 @@ pub const RATE_LIMIT_PER_SECOND: u64 = 100;
 /// Default burst capacity: maximum requests allowed in a burst.
 pub const RATE_LIMIT_BURST_SIZE: u32 = 200;
 
+/// Requests per second per client IP (`RATE_LIMIT_PER_SECOND` env override; must be > 0).
+pub fn rate_limit_per_second() -> u64 {
+    std::env::var("RATE_LIMIT_PER_SECOND")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .filter(|&v| v > 0)
+        .unwrap_or(RATE_LIMIT_PER_SECOND)
+}
+
+/// Token replenish interval in nanoseconds for the configured rate.
+///
+/// tower_governor's `per_second(n)` sets the interval to replenish ONE
+/// token to `n` seconds — it does NOT mean "n requests per second". To
+/// allow R requests per second, one token must replenish every 1e9/R
+/// nanoseconds.
+pub fn replenish_interval_ns() -> u64 {
+    (1_000_000_000 / rate_limit_per_second()).max(1)
+}
+
+/// Burst capacity per client IP (`RATE_LIMIT_BURST_SIZE` env override; must be > 0).
+pub fn rate_limit_burst_size() -> u32 {
+    std::env::var("RATE_LIMIT_BURST_SIZE")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .filter(|&v| v > 0)
+        .unwrap_or(RATE_LIMIT_BURST_SIZE)
+}
+
 /// Convert governor errors to HTTP responses.
 pub fn rate_limit_error_handler(error: GovernorError) -> Response {
     match error {
-        GovernorError::TooManyRequests { .. } => (
-            StatusCode::TOO_MANY_REQUESTS,
-            "Too many requests. Please slow down.",
-        )
-            .into_response(),
+        GovernorError::TooManyRequests { headers, .. } => {
+            // Preserve Retry-After / x-ratelimit-* headers so clients can
+            // back off intelligently.
+            let mut response = (
+                StatusCode::TOO_MANY_REQUESTS,
+                "Too many requests. Please slow down.",
+            )
+                .into_response();
+            if let Some(headers) = headers {
+                response.headers_mut().extend(headers);
+            }
+            response
+        }
         GovernorError::UnableToExtractKey => (
             StatusCode::INTERNAL_SERVER_ERROR,
             "Unable to extract rate limit key",
@@ -42,10 +78,18 @@ mod tests {
     #[test]
     fn test_rate_limit_constants() {
         let config = tower_governor::governor::GovernorConfigBuilder::default()
-            .per_second(RATE_LIMIT_PER_SECOND)
+            .per_nanosecond(replenish_interval_ns())
             .burst_size(RATE_LIMIT_BURST_SIZE)
             .finish();
 
         assert!(config.is_some());
     }
+
+    #[test]
+    fn test_replenish_interval_semantics() {
+        // 100 req/s must replenish one token every 10ms — NOT one token
+        // every 100s, which is what per_second(100) would configure.
+        std::env::remove_var("RATE_LIMIT_PER_SECOND");
+        assert_eq!(replenish_interval_ns(), 10_000_000);
+    }
 }
diff --git a/src/bin/witness.rs b/src/bin/witness.rs
index 353cdd0..2952a08 100644
--- a/src/bin/witness.rs
+++ b/src/bin/witness.rs
@@ -7,9 +7,14 @@ use axum::extract::DefaultBodyLimit;
 use clap::Parser;
 use sea_orm::{ConnectOptions, ConnectionTrait, Database as SeaDatabase, DatabaseConnection};
 use sea_orm_migration::MigratorTrait;
+use siglog::api::rate_limit;
 use siglog::witness::{handlers, LogConfig, Witness};
+use std::net::SocketAddr;
 use std::sync::Arc;
 use std::time::Duration;
+use tower_governor::{
+    governor::GovernorConfigBuilder, key_extractor::SmartIpKeyExtractor, GovernorLayer,
+};
 
 /// Maximum allowed size for witness request bodies (1MB).
 /// This prevents DoS attacks from extremely large checkpoint submissions.
@@ -92,6 +97,28 @@ async fn main() -> anyhow::Result<()> {
     let witness = Arc::new(Witness::new(signer, conn, args.logs));
     tracing::info!("Witness name: {}", witness.name());
 
+    // Rate limiting: /add-checkpoint does signature verification and proof
+    // hashing per request and is unauthenticated.
+    let rate_limit_config = Arc::new(
+        GovernorConfigBuilder::default()
+            // per_second(n) would mean "one request per n seconds"!
+            .per_nanosecond(rate_limit::replenish_interval_ns())
+            .burst_size(rate_limit::rate_limit_burst_size())
+            .key_extractor(SmartIpKeyExtractor)
+            .finish()
+            .expect("failed to create rate limit config"),
+    );
+    let governor_limiter = rate_limit_config.limiter().clone();
+    tokio::spawn(async move {
+        let mut interval = tokio::time::interval(Duration::from_secs(60));
+        loop {
+            interval.tick().await;
+            governor_limiter.retain_recent();
+        }
+    });
+    let governor_layer =
+        GovernorLayer::new(rate_limit_config).error_handler(rate_limit::rate_limit_error_handler);
+
     // Build router with body size limit
     let app = axum::Router::new()
         .route(
@@ -102,6 +129,11 @@ async fn main() -> anyhow::Result<()> {
         .route("/ready", axum::routing::get(handlers::ready))
         .with_state(witness)
         .layer(DefaultBodyLimit::max(MAX_BODY_SIZE))
+        .layer(governor_layer)
+        .layer(tower_http::timeout::TimeoutLayer::with_status_code(
+            axum::http::StatusCode::REQUEST_TIMEOUT,
+            Duration::from_secs(30),
+        ))
         .layer(
             tower_http::trace::TraceLayer::new_for_http()
                 .make_span_with(
@@ -122,17 +154,12 @@ async fn main() -> anyhow::Result<()> {
     let listener = tokio::net::TcpListener::bind(&args.listen).await?;
     tracing::info!("Witness server listening on {}", args.listen);
 
-    // Handle shutdown
-    let shutdown_signal = async {
-        tokio::signal::ctrl_c()
-            .await
-            .expect("failed to install Ctrl+C handler");
-        tracing::info!("Shutdown signal received");
-    };
-
-    axum::serve(listener, app)
-        .with_graceful_shutdown(shutdown_signal)
-        .await?;
+    axum::serve(
+        listener,
+        app.into_make_service_with_connect_info::<SocketAddr>(),
+    )
+    .with_graceful_shutdown(siglog::shutdown::shutdown_signal())
+    .await?;
 
     tracing::info!("Witness server stopped");
     Ok(())
diff --git a/src/checkpoint/signer.rs b/src/checkpoint/signer.rs
index 3b3a81a..9158f09 100644
--- a/src/checkpoint/signer.rs
+++ b/src/checkpoint/signer.rs
@@ -28,6 +28,18 @@ pub struct CheckpointSigner {
 /// Ed25519 algorithm identifier for note format.
 const ALG_ED25519: u8 = 0x01;
 
+/// Ed25519 cosignature/v1 algorithm identifier (c2sp.org/tlog-cosignature).
+pub const ALG_COSIGNATURE_V1: u8 = 0x04;
+
+/// Build the message signed by an Ed25519 cosignature/v1 cosigner.
+///
+/// Per c2sp.org/tlog-cosignature: a `cosignature/v1` header line, a
+/// `time <posix seconds>` line, then the whole note body of the checkpoint
+/// (including its final newline).
+pub fn cosignature_v1_message(timestamp: u64, body: &str) -> String {
+    format!("cosignature/v1\ntime {}\n{}", timestamp, body)
+}
+
 impl CheckpointSigner {
     /// Create a new checkpoint signer from a note-format private key.
     ///
@@ -175,15 +187,50 @@ impl CheckpointSigner {
             signature,
         }
     }
+
+    /// The key ID this signer uses for cosignature/v1 cosignatures.
+    ///
+    /// Computed with the cosignature/v1 algorithm byte (0x04), so it differs
+    /// from the plain note-signature key ID.
+    pub fn cosignature_v1_key_id(&self) -> KeyId {
+        compute_key_id_with_alg(
+            self.name.as_str(),
+            &self.signing_key.verifying_key(),
+            ALG_COSIGNATURE_V1,
+        )
+    }
+
+    /// Produce a C2SP cosignature/v1 cosignature over a checkpoint.
+    ///
+    /// `timestamp` is the POSIX time (seconds) at which the cosignature is
+    /// generated; it is bound into the signed message and encoded in the
+    /// signature blob.
+    pub fn cosign_v1(&self, checkpoint: &Checkpoint, timestamp: u64) -> CheckpointSignature {
+        let message = cosignature_v1_message(timestamp, &checkpoint.to_body());
+        let signature = self.signing_key.sign(message.as_bytes());
+
+        CheckpointSignature {
+            name: self.name.clone(),
+            key_id: self.cosignature_v1_key_id(),
+            signature,
+            timestamp: Some(timestamp),
+        }
+    }
 }
 
 /// Compute the key ID for a verifying key per Go's note format.
 /// Hash = SHA256(name + "\n" + alg_byte + public_key)[:4]
 fn compute_key_id(name: &str, key: &VerifyingKey) -> KeyId {
+    compute_key_id_with_alg(name, key, ALG_ED25519)
+}
+
+/// Compute a key ID with an explicit algorithm byte.
+/// Hash = SHA256(name + "\n" + alg_byte + public_key)[:4]
+pub fn compute_key_id_with_alg(name: &str, key: &VerifyingKey, alg: u8) -> KeyId {
     let mut hasher = Sha256::new();
     hasher.update(name.as_bytes());
     hasher.update(b"\n");
-    hasher.update([ALG_ED25519]); // Ed25519 algorithm identifier
+    hasher.update([alg]);
     hasher.update(key.as_bytes());
     let hash = hasher.finalize();
 
@@ -313,19 +360,35 @@ pub struct CheckpointSignature {
     pub key_id: KeyId,
     /// The signature.
     pub signature: Signature,
+    /// The POSIX timestamp for cosignature/v1 signatures.
+    ///
+    /// `None` for plain note signatures (e.g. the log's own signature);
+    /// `Some` for C2SP cosignature/v1 witness cosignatures, where the
+    /// timestamp is bound into the signed message.
+    pub timestamp: Option<u64>,
 }
 
 impl CheckpointSignature {
     /// Format as a signature line.
+    ///
+    /// Plain signatures encode `key_id(4) || sig(64)`; cosignature/v1
+    /// signatures encode `key_id(4) || timestamp(8, BE) || sig(64)` per
+    /// c2sp.org/tlog-cosignature.
     pub fn to_line(&self) -> String {
-        let mut sig_data = Vec::with_capacity(4 + 64);
+        let mut sig_data = Vec::with_capacity(4 + 8 + 64);
         sig_data.extend_from_slice(self.key_id.as_bytes());
+        if let Some(ts) = self.timestamp {
+            sig_data.extend_from_slice(&ts.to_be_bytes());
+        }
         sig_data.extend_from_slice(&self.signature.to_bytes());
 
         format!("— {} {}", self.name.as_str(), STANDARD.encode(&sig_data))
     }
 
     /// Parse a signature line.
+    ///
+    /// Accepts both plain note signatures (68-byte blob) and cosignature/v1
+    /// signatures (76-byte blob with an embedded big-endian timestamp).
     pub fn from_line(line: &str) -> Result<Self> {
         let line = line.trim();
         if !line.starts_with("— ") {
@@ -343,16 +406,25 @@ impl CheckpointSignature {
             .decode(parts[1])
             .map_err(|e| Error::Config(format!("invalid signature base64: {}", e)))?;
 
-        if sig_data.len() != 68 {
-            return Err(Error::Config(format!(
-                "invalid signature length: expected 68, got {}",
-                sig_data.len()
-            )));
-        }
+        let (timestamp, sig_bytes): (Option<u64>, &[u8]) = match sig_data.len() {
+            68 => (None, &sig_data[4..]),
+            76 => {
+                let ts_bytes: [u8; 8] = sig_data[4..12]
+                    .try_into()
+                    .map_err(|_| Error::Config("invalid timestamp bytes".into()))?;
+                (Some(u64::from_be_bytes(ts_bytes)), &sig_data[12..])
+            }
+            n => {
+                return Err(Error::Config(format!(
+                    "invalid signature length: expected 68 (plain) or 76 (cosignature/v1), got {}",
+                    n
+                )));
+            }
+        };
 
         let key_id = KeyId::new([sig_data[0], sig_data[1], sig_data[2], sig_data[3]]);
         let signature = Signature::from_bytes(
-            sig_data[4..]
+            sig_bytes
                 .try_into()
                 .map_err(|_| Error::Config("invalid signature bytes".into()))?,
         );
@@ -361,6 +433,7 @@ impl CheckpointSignature {
             name,
             key_id,
             signature,
+            timestamp,
         })
     }
 }
@@ -404,6 +477,7 @@ impl SignedCheckpoint {
                 name: self.signer_name,
                 key_id: self.key_id,
                 signature: self.signature,
+                timestamp: None,
             }],
         }
     }
@@ -430,11 +504,12 @@ impl CosignedCheckpoint {
                 name: signer.name.clone(),
                 key_id: signer.key_id.clone(),
                 signature,
+                timestamp: None,
             }],
         }
     }
 
-    /// Add a cosignature from a witness.
+    /// Add a plain note cosignature from a witness (non-spec, legacy).
     pub fn add_signature(&mut self, signer: &CheckpointSigner) {
         let body = self.checkpoint.to_body();
         let signature = signer.signing_key.sign(body.as_bytes());
@@ -443,9 +518,16 @@ impl CosignedCheckpoint {
             name: signer.name.clone(),
             key_id: signer.key_id.clone(),
             signature,
+            timestamp: None,
         });
     }
 
+    /// Add a C2SP cosignature/v1 cosignature from a witness signer.
+    pub fn add_cosignature_v1(&mut self, signer: &CheckpointSigner, timestamp: u64) {
+        let cosig = signer.cosign_v1(&self.checkpoint, timestamp);
+        self.signatures.push(cosig);
+    }
+
     /// Parse a cosigned checkpoint from text.
     pub fn from_text(text: &str) -> Result<Self> {
         let text = text.trim();
@@ -653,6 +735,56 @@ mod tests {
         assert_eq!(cosigned1.signature_count(), 3);
     }
 
+    #[test]
+    fn test_parse_spec_example_cosignature_line() {
+        // Example cosignature line from c2sp.org/tlog-cosignature. The blob
+        // is keyid(4) || timestamp(8, BE) || sig(64); the spec's message
+        // example uses "time 1679315147".
+        let line = "— witness.example.com/w1 jWbPPwAAAABkGFDLEZMHwSRaJNiIDoe9DYn/zXcrtPHeolMI5OWXEhZCB9dlrDJsX3b2oyin1nPZqhf5nNo0xUe+mbIUBkBIfZ+qnA==";
+        let sig = CheckpointSignature::from_line(line).unwrap();
+        assert_eq!(sig.name.as_str(), "witness.example.com/w1");
+        assert_eq!(sig.timestamp, Some(1679315147));
+        assert_eq!(sig.key_id.as_bytes(), &[0x8d, 0x66, 0xcf, 0x3f]);
+        // Round-trip back to the identical line.
+        assert_eq!(sig.to_line(), line);
+    }
+
+    #[test]
+    fn test_cosignature_v1_message_format() {
+        // Exact message layout from the spec.
+        let body = "example.com/behind-the-sofa\n20852163\nCsUYapGGPo4dkMgIAUqom/Xajj7h2fB2MPA3j2jxq2I=\n";
+        let msg = cosignature_v1_message(1679315147, body);
+        assert_eq!(
+            msg,
+            "cosignature/v1\ntime 1679315147\nexample.com/behind-the-sofa\n20852163\nCsUYapGGPo4dkMgIAUqom/Xajj7h2fB2MPA3j2jxq2I=\n"
+        );
+    }
+
+    #[test]
+    fn test_cosign_v1_roundtrip_verifies() {
+        use ed25519_dalek::Verifier;
+
+        let signer = CheckpointSigner::generate("witness.example.com");
+        let checkpoint = Checkpoint::new(
+            Origin::new("example.com/log".to_string()).unwrap(),
+            TreeSize::new(42),
+            Sha256Hash::from_bytes([7u8; 32]),
+        );
+
+        let cosig = signer.cosign_v1(&checkpoint, 1679315147);
+        assert_eq!(cosig.timestamp, Some(1679315147));
+        assert_eq!(cosig.key_id, signer.cosignature_v1_key_id());
+        // The v1 key ID must differ from the plain note key ID.
+        assert_ne!(&cosig.key_id, signer.key_id());
+
+        let parsed = CheckpointSignature::from_line(&cosig.to_line()).unwrap();
+        let msg = cosignature_v1_message(1679315147, &checkpoint.to_body());
+        signer
+            .public_key()
+            .verify(msg.as_bytes(), &parsed.signature)
+            .expect("cosignature/v1 must verify over the timestamped message");
+    }
+
     #[test]
     fn test_origin_validation() {
         // Valid origins
diff --git a/src/error.rs b/src/error.rs
index 83c7d66..7cf3352 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -73,7 +73,10 @@ impl IntoResponse for Error {
             }
             _ => {
                 tracing::error!("Error: {}", self);
-                (StatusCode::INTERNAL_SERVER_ERROR, self.to_string())
+                (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    "Internal server error".to_string(),
+                )
             }
         };
 
diff --git a/src/lib.rs b/src/lib.rs
index 5b6726b..ad99cb8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@
 pub mod api;
 pub mod checkpoint;
 pub mod error;
+pub mod shutdown;
 pub mod merkle;
 pub mod migration;
 pub mod monitor;
diff --git a/src/main.rs b/src/main.rs
index a09872a..67f5432 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,15 +4,19 @@ use axum::extract::DefaultBodyLimit;
 use clap::Parser;
 use siglog::api::handlers::{self, AppState};
 use siglog::api::rate_limit;
+use siglog::checkpoint::signer::Origin;
 use siglog::checkpoint::CheckpointSigner;
 use siglog::sequencer::{Sequencer, SequencerConfig};
 use siglog::storage::{Database, TileStorage};
 use siglog::vindex;
 use siglog::worker::{self, WorkerConfig};
+use std::net::SocketAddr;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::watch;
-use tower_governor::{governor::GovernorConfigBuilder, GovernorLayer};
+use tower_governor::{
+    governor::GovernorConfigBuilder, key_extractor::SmartIpKeyExtractor, GovernorLayer,
+};
 
 /// Siglog - A minimal Tessera-compatible transparency log server.
 #[derive(Parser, Debug)]
@@ -85,12 +89,19 @@ struct Args {
     #[arg(long, env = "WITNESS_KEYS")]
     witness_keys: Option<String>,
 
-    /// External witness URLs (comma-separated).
-    /// Format: name=url,name2=url2
-    /// Example: --external-witnesses "witness1=http://localhost:8081,monitor=http://localhost:8082"
+    /// External witnesses (comma-separated).
+    /// Format: name=url=vkey where vkey is the witness's note-format
+    /// verification key (name+hash+base64). Cosignatures are verified
+    /// against this pinned key before counting toward the quorum.
+    /// Example: --external-witnesses "w1=http://localhost:8081=w1+deadbeef+AQ..."
     #[arg(long, env = "EXTERNAL_WITNESSES")]
     external_witnesses: Option<String>,
 
+    /// Minimum number of external witness cosignatures required to publish
+    /// a checkpoint. Defaults to all configured external witnesses.
+    #[arg(long, env = "WITNESS_QUORUM")]
+    witness_quorum: Option<usize>,
+
     /// API key for authenticating write requests (optional).
     /// When set, the /add endpoint requires an Authorization: Bearer <key> header.
     #[arg(long, env = "API_KEY")]
@@ -139,6 +150,12 @@ async fn main() -> anyhow::Result<()> {
         );
     }
 
+    // Validate the origin up front so a bad value fails startup instead of
+    // killing the checkpoint worker after the server is already accepting
+    // writes.
+    Origin::new(args.origin.clone())
+        .map_err(|e| anyhow::anyhow!("invalid LOG_ORIGIN '{}': {}", args.origin, e))?;
+
     // Initialize database
     tracing::info!("Connecting to database...");
     let db = Database::connect(&args.database_url).await?;
@@ -186,50 +203,56 @@ async fn main() -> anyhow::Result<()> {
     tracing::info!("Checkpoint signer initialized: {}", signer.name());
 
     // Initialize in-process witnesses (for testing/development)
-    let witnesses: Vec<Arc<CheckpointSigner>> = if let Some(witness_keys) = &args.witness_keys {
-        witness_keys
-            .split(',')
-            .filter(|k| !k.trim().is_empty())
-            .map(|key| {
-                let signer =
-                    CheckpointSigner::from_note_key(key.trim()).expect("invalid witness key");
-                tracing::info!("In-process witness initialized: {}", signer.name());
-                Arc::new(signer)
-            })
-            .collect()
-    } else {
-        Vec::new()
-    };
+    let mut witnesses: Vec<Arc<CheckpointSigner>> = Vec::new();
+    if let Some(witness_keys) = &args.witness_keys {
+        for key in witness_keys.split(',').filter(|k| !k.trim().is_empty()) {
+            let signer = CheckpointSigner::from_note_key(key.trim())
+                .map_err(|e| anyhow::anyhow!("invalid in-process witness key: {}", e))?;
+            tracing::info!("In-process witness initialized: {}", signer.name());
+            witnesses.push(Arc::new(signer));
+        }
+    }
     tracing::info!("{} in-process witnesses configured", witnesses.len());
 
-    // Parse external witness URLs
-    let external_witnesses: Vec<worker::ExternalWitness> =
-        if let Some(ext_witnesses) = &args.external_witnesses {
-            ext_witnesses
-                .split(',')
-                .filter(|s| !s.trim().is_empty())
-                .map(|s| {
-                    let parts: Vec<&str> = s.trim().splitn(2, '=').collect();
-                    if parts.len() != 2 {
-                        panic!(
-                            "invalid external witness format: expected 'name=url', got '{}'",
-                            s
-                        );
-                    }
-                    let witness = worker::ExternalWitness::new(parts[0], parts[1]);
-                    tracing::info!(
-                        "External witness configured: {} -> {}",
-                        witness.name,
-                        witness.url
-                    );
-                    witness
-                })
-                .collect()
-        } else {
-            Vec::new()
-        };
+    // Parse external witnesses (name=url=vkey)
+    let mut external_witnesses: Vec<worker::ExternalWitness> = Vec::new();
+    if let Some(ext_witnesses) = &args.external_witnesses {
+        for s in ext_witnesses.split(',').filter(|s| !s.trim().is_empty()) {
+            let parts: Vec<&str> = s.trim().splitn(3, '=').collect();
+            if parts.len() != 3 {
+                anyhow::bail!(
+                    "invalid external witness format: expected 'name=url=vkey', got '{}'. \
+                     The verification key is required so cosignatures can be verified.",
+                    s
+                );
+            }
+            let witness = worker::ExternalWitness::new(parts[0], parts[1], parts[2])
+                .map_err(|e| anyhow::anyhow!("invalid external witness '{}': {}", parts[0], e))?;
+            tracing::info!(
+                "External witness configured: {} -> {}",
+                witness.name,
+                witness.url
+            );
+            external_witnesses.push(witness);
+        }
+    }
     tracing::info!("{} external witnesses configured", external_witnesses.len());
 
+    if let Some(q) = args.witness_quorum {
+        if q > external_witnesses.len() {
+            anyhow::bail!(
+                "WITNESS_QUORUM ({}) exceeds the number of configured external witnesses ({})",
+                q,
+                external_witnesses.len()
+            );
+        }
+        tracing::info!(
+            "Checkpoint publication quorum: {}/{} external witnesses",
+            q,
+            external_witnesses.len()
+        );
+    }
+
     // Create shutdown channel
     let (shutdown_tx, shutdown_rx) = watch::channel(false);
 
@@ -241,8 +264,10 @@ async fn main() -> anyhow::Result<()> {
     };
     let (sequencer, sequencer_task) = Sequencer::new(db.clone(), sequencer_config);
 
-    // Spawn sequencer
-    tokio::spawn(sequencer_task);
+    // Spawn sequencer (supervised below: if it dies, the process exits so
+    // the orchestrator can restart it, instead of silently acking writes
+    // that never get sequenced).
+    let sequencer_handle = tokio::spawn(sequencer_task);
 
     // Configure workers
     let worker_config = WorkerConfig {
@@ -250,6 +275,7 @@ async fn main() -> anyhow::Result<()> {
         integration_batch_size: 1024,
         checkpoint_interval: Duration::from_secs(args.checkpoint_interval),
         origin: args.origin.clone(),
+        witness_quorum: args.witness_quorum,
     };
 
     // Initialize vindex if enabled (before spawning workers)
@@ -264,13 +290,28 @@ async fn main() -> anyhow::Result<()> {
         let expected_tree_size = log_state.integrated_size.value();
 
         let vi = if let Some(wal_path) = &args.vindex_wal_path {
-            // Validate the WAL against the database state after a crash.
+            // Validate the snapshot + WAL against the database state after a
+            // crash. If the on-disk state is unusable (missing, corrupted, or
+            // behind the database), rebuild it from the log's entry bundles —
+            // the log itself is the source of truth for the index.
             tracing::info!(
                 "Vindex WAL path: {}, expected tree size from DB: {}",
                 wal_path,
                 expected_tree_size
             );
-            vindex::VerifiableIndex::with_wal(map_fn, wal_path, expected_tree_size)?
+            match vindex::VerifiableIndex::with_wal(map_fn.clone(), wal_path, expected_tree_size) {
+                Ok(vi) => vi,
+                Err(e) => {
+                    tracing::warn!("Vindex state unusable ({}); rebuilding from log storage", e);
+                    vindex::VerifiableIndex::rebuild_from_storage(
+                        map_fn,
+                        wal_path,
+                        expected_tree_size,
+                        &storage,
+                    )
+                    .await?
+                }
+            }
         } else {
             if expected_tree_size > 0 {
                 anyhow::bail!(
@@ -292,7 +333,7 @@ async fn main() -> anyhow::Result<()> {
     };
 
     // Spawn integration worker (with optional vindex)
-    tokio::spawn(worker::run_integration_worker(
+    let integration_handle = tokio::spawn(worker::run_integration_worker(
         db.clone(),
         storage.clone(),
         worker_config.clone(),
@@ -301,7 +342,7 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     // Spawn checkpoint worker
-    tokio::spawn(worker::run_checkpoint_worker(
+    let checkpoint_handle = tokio::spawn(worker::run_checkpoint_worker(
         db.clone(),
         storage.clone(),
         signer.clone(),
@@ -324,20 +365,38 @@ async fn main() -> anyhow::Result<()> {
     }
     let state = Arc::new(state);
 
-    // Configure rate limiting
+    // Configure rate limiting. SmartIpKeyExtractor keys on standard proxy
+    // headers (x-forwarded-for, x-real-ip, forwarded), else the peer address, so
+    // per-client limits survive a trusted reverse proxy; direct clients can spoof them.
+    let rate_limit_rps = rate_limit::rate_limit_per_second();
+    let rate_limit_burst = rate_limit::rate_limit_burst_size();
     let rate_limit_config = Arc::new(
         GovernorConfigBuilder::default()
-            .per_second(rate_limit::RATE_LIMIT_PER_SECOND)
-            .burst_size(rate_limit::RATE_LIMIT_BURST_SIZE)
+            // per_second(n) would mean "one request per n seconds"!
+            .per_nanosecond(rate_limit::replenish_interval_ns())
+            .burst_size(rate_limit_burst)
+            .key_extractor(SmartIpKeyExtractor)
             .finish()
             .expect("failed to create rate limit config"),
     );
+
+    // tower_governor keeps one bucket per client key; without periodic
+    // cleanup that map grows forever.
+    let governor_limiter = rate_limit_config.limiter().clone();
+    tokio::spawn(async move {
+        let mut interval = tokio::time::interval(Duration::from_secs(60));
+        loop {
+            interval.tick().await;
+            governor_limiter.retain_recent();
+        }
+    });
+
     let governor_layer =
         GovernorLayer::new(rate_limit_config).error_handler(rate_limit::rate_limit_error_handler);
     tracing::info!(
         "Rate limiting enabled: {} req/s per IP, burst {}",
-        rate_limit::RATE_LIMIT_PER_SECOND,
-        rate_limit::RATE_LIMIT_BURST_SIZE
+        rate_limit_rps,
+        rate_limit_burst
     );
 
     // Build router
@@ -374,6 +433,10 @@ async fn main() -> anyhow::Result<()> {
         .with_state(state)
         .layer(DefaultBodyLimit::max(handlers::MAX_ENTRY_SIZE))
         .layer(governor_layer)
+        .layer(tower_http::timeout::TimeoutLayer::with_status_code(
+            axum::http::StatusCode::REQUEST_TIMEOUT,
+            Duration::from_secs(30),
+        ))
         .layer(
             tower_http::trace::TraceLayer::new_for_http()
                 .make_span_with(
@@ -395,18 +458,44 @@ async fn main() -> anyhow::Result<()> {
         args.listen
     );
 
-    // Handle shutdown
+    // Handle shutdown (SIGINT and SIGTERM)
+    let shutting_down = Arc::new(std::sync::atomic::AtomicBool::new(false));
+    let shutdown_flag = shutting_down.clone();
     let shutdown_signal = async move {
-        tokio::signal::ctrl_c()
-            .await
-            .expect("failed to install Ctrl+C handler");
-        tracing::info!("Shutdown signal received");
+        siglog::shutdown::shutdown_signal().await;
+        shutdown_flag.store(true, std::sync::atomic::Ordering::SeqCst);
         let _ = shutdown_tx.send(true);
     };
 
-    axum::serve(listener, app)
-        .with_graceful_shutdown(shutdown_signal)
-        .await?;
+    // Supervise the background pipeline: if any worker dies (panic or
+    // unexpected return) outside of shutdown, exit so the orchestrator
+    // restarts the process, instead of accepting writes that are never
+    // integrated or published.
+    let supervisor_flag = shutting_down.clone();
+    tokio::spawn(async move {
+        let reason = tokio::select! {
+            r = sequencer_handle => format!("sequencer task exited: {:?}", r),
+            r = integration_handle => format!("integration worker exited: {:?}", r),
+            r = checkpoint_handle => format!("checkpoint worker exited: {:?}", r),
+        };
+        if !supervisor_flag.load(std::sync::atomic::Ordering::SeqCst) {
+            tracing::error!(
+                "{}; exiting so the orchestrator can restart the process",
+                reason
+            );
+            std::process::exit(1);
+        }
+    });
+
+    // ConnectInfo is required by the rate limiter's key extractor as the
+    // fallback when no proxy headers are present; without it every such request
+    // fails with "unable to extract rate limit key".
+    axum::serve(
+        listener,
+        app.into_make_service_with_connect_info::<SocketAddr>(),
+    )
+    .with_graceful_shutdown(shutdown_signal)
+    .await?;
 
     tracing::info!("Server stopped");
     Ok(())
diff --git a/src/merkle/proof.rs b/src/merkle/proof.rs
index 830a1c1..fb998bc 100644
--- a/src/merkle/proof.rs
+++ b/src/merkle/proof.rs
@@ -175,6 +175,13 @@ pub async fn compute_subtree_hash(
 
 /// Read a leaf hash from level 0 tiles.
 async fn read_leaf_hash(storage: &TileStorage, index: u64, tree_size: u64) -> Result<Sha256Hash> {
+    if index >= tree_size {
+        return Err(Error::NotFound(format!(
+            "leaf {} beyond tree size {}",
+            index, tree_size
+        )));
+    }
+
     let tile_index = index / TILE_WIDTH;
     let offset = (index % TILE_WIDTH) as usize;
 
diff --git a/src/monitor/mod.rs b/src/monitor/mod.rs
index 6d35a1e..70dfbf9 100644
--- a/src/monitor/mod.rs
+++ b/src/monitor/mod.rs
@@ -22,7 +22,6 @@ use crate::witness::{
     WitnessStateStore, WitnessedState,
 };
 use async_trait::async_trait;
-use ed25519_dalek::Signer;
 use sea_orm::DatabaseConnection;
 use std::sync::Arc;
 
@@ -143,7 +142,13 @@ impl<M: Monitor> MonitoringWitness<M> {
             conn: conn.clone(),
             state_store: WitnessStateStore::new(conn),
             logs,
-            http_client: reqwest::Client::new(),
+            // Without a timeout, a hung upstream log stalls add-checkpoint
+            // handlers indefinitely.
+            http_client: reqwest::Client::builder()
+                .timeout(std::time::Duration::from_secs(30))
+                .connect_timeout(std::time::Duration::from_secs(10))
+                .build()
+                .expect("failed to build HTTP client"),
         }
     }
 
@@ -278,14 +283,17 @@ impl<M: Monitor> MonitoringWitness<M> {
             }
         }
 
-        // 9. Create cosignature
-        let body = checkpoint.checkpoint.to_body();
-        let signature = self.signer.signing_key_ref().sign(body.as_bytes());
-        let cosig = CheckpointSignature {
-            name: self.signer.name().clone(),
-            key_id: self.signer.key_id().clone(),
-            signature,
-        };
+        // 9. Create cosignature/v1 (c2sp.org/tlog-cosignature)
+        let timestamp = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map_err(|e| {
+                MonitorError::Witness(WitnessError::Internal(format!(
+                    "system clock error: {}",
+                    e
+                )))
+            })?
+            .as_secs();
+        let cosig = self.signer.cosign_v1(&checkpoint.checkpoint, timestamp);
 
         // 10. Commit the validated entries to the monitor's index and database
         if new_size > state.size {
@@ -300,9 +308,17 @@ impl<M: Monitor> MonitoringWitness<M> {
                 })?;
         }
 
-        // 11. Update witness state
-        self.state_store
-            .update(origin, new_size, new_root, &request.checkpoint)
+        // 11. Update witness state (CAS against the state we verified).
+        let outcome = self
+            .state_store
+            .update(
+                origin,
+                state.size,
+                &state.root_hash,
+                new_size,
+                new_root,
+                &request.checkpoint,
+            )
             .await
             .map_err(|e| {
                 MonitorError::Witness(WitnessError::Internal(format!(
@@ -311,6 +327,10 @@ impl<M: Monitor> MonitoringWitness<M> {
                 )))
             })?;
 
+        if let crate::witness::UpdateOutcome::Conflict { current_size } = outcome {
+            return Err(MonitorError::Witness(WitnessError::Conflict(current_size)));
+        }
+
         Ok(cosig)
     }
 
diff --git a/src/shutdown.rs b/src/shutdown.rs
new file mode 100644
index 0000000..6bfe912
--- /dev/null
+++ b/src/shutdown.rs
@@ -0,0 +1,29 @@
+//! Shared shutdown signal handling for the server binaries.
+
+/// Wait for a shutdown signal: SIGINT (Ctrl+C) or, on Unix, SIGTERM.
+///
+/// Container orchestrators (Docker, Kubernetes, Fly.io) stop services with
+/// SIGTERM; handling only Ctrl+C would turn every deploy into a hard kill.
+pub async fn shutdown_signal() {
+    let ctrl_c = async {
+        tokio::signal::ctrl_c()
+            .await
+            .expect("failed to install Ctrl+C handler");
+    };
+
+    #[cfg(unix)]
+    let terminate = async {
+        tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
+            .expect("failed to install SIGTERM handler")
+            .recv()
+            .await;
+    };
+
+    #[cfg(not(unix))]
+    let terminate = std::future::pending::<()>();
+
+    tokio::select! {
+        _ = ctrl_c => tracing::info!("SIGINT received, shutting down"),
+        _ = terminate => tracing::info!("SIGTERM received, shutting down"),
+    }
+}
diff --git a/src/storage/database.rs b/src/storage/database.rs
index cfe68da..87b54d4 100644
--- a/src/storage/database.rs
+++ b/src/storage/database.rs
@@ -76,9 +76,15 @@ impl Database {
             .await?
             .ok_or_else(|| Error::Internal("log state not found".into()))?;
 
+        // A corrupted root hash must be a loud error: silently mapping it to
+        // None makes the checkpoint worker stop publishing with no logs.
         let root_hash = row
             .root_hash
-            .and_then(|bytes| Sha256Hash::try_from_slice(&bytes).ok());
+            .map(|bytes| {
+                Sha256Hash::try_from_slice(&bytes)
+                    .map_err(|e| Error::Internal(format!("corrupted root hash in log state: {}", e)))
+            })
+            .transpose()?;
 
         Ok(LogState {
             next_index: LogIndex::new(row.next_index as u64),
diff --git a/src/storage/opendal.rs b/src/storage/opendal.rs
index 57dd004..5f83a49 100644
--- a/src/storage/opendal.rs
+++ b/src/storage/opendal.rs
@@ -43,8 +43,15 @@ impl TileStorage {
     }
 
     /// Create a new tile storage with filesystem backend.
+    ///
+    /// Writes go through a temp directory and are renamed into place, so a
+    /// crash mid-write can never leave a torn tile or checkpoint, and a
+    /// concurrent `GET /checkpoint` never observes a partially-written file.
     pub fn new_fs(root: &str) -> Result<Self> {
-        let builder = Fs::default().root(root);
+        let atomic_dir = std::path::Path::new(root).join(".tmp");
+        let builder = Fs::default()
+            .root(root)
+            .atomic_write_dir(&atomic_dir.to_string_lossy());
 
         let op = Operator::new(builder)?.finish();
 
diff --git a/src/vindex/mod.rs b/src/vindex/mod.rs
index 748b4b0..e223891 100644
--- a/src/vindex/mod.rs
+++ b/src/vindex/mod.rs
@@ -15,15 +15,18 @@
 //! - PrefixTree: Merkle tree for verifiable proofs
 
 mod prefix_tree;
+mod snapshot;
 mod wal;
 
 use crate::error::{Error, Result};
 use crate::types::LogIndex;
 pub use prefix_tree::{LookupProof, PrefixTree, ProofNode};
 use sha2::{Digest, Sha256};
+pub use snapshot::snapshot_path;
 use std::collections::{HashMap, HashSet};
 use std::io::Write as _;
-use std::path::Path;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, RwLock};
 pub use wal::{
     validate_and_truncate_wal, BatchedBinaryWalWriter, BatchedWalWriter, BinaryWalWriter,
@@ -118,6 +121,9 @@ pub struct LookupResult {
     pub found: bool,
     /// The inclusion/exclusion proof from the prefix tree.
     pub proof: Vec<ProofNode>,
+    /// The prefix tree root hash the proof verifies against, captured under
+    /// the same lock as the proof so the pair is always consistent.
+    pub root_hash: IndexKey,
 }
 
 /// WAL writer variant (batched or unbatched).
@@ -140,6 +146,13 @@ impl WalWriterVariant {
             WalWriterVariant::Batched(w) => w.flush(),
         }
     }
+
+    fn truncate(&mut self) -> Result<()> {
+        match self {
+            WalWriterVariant::Unbatched(w) => w.truncate(),
+            WalWriterVariant::Batched(w) => w.truncate(),
+        }
+    }
 }
 
 /// The verifiable index maintains a mapping from keys to log indices.
@@ -148,6 +161,12 @@ pub struct VerifiableIndex {
     index: RwLock<HashMap<IndexKey, Vec<LogIndex>>>,
     /// WAL writer for persistence.
     wal_writer: Option<RwLock<WalWriterVariant>>,
+    /// Snapshot file path (when WAL persistence is enabled).
+    snapshot_file: Option<PathBuf>,
+    /// Entries indexed since the last snapshot (drives compaction).
+    entries_since_snapshot: AtomicU64,
+    /// Snapshot after this many new entries (0 = disabled).
+    snapshot_interval: u64,
     /// The map function for extracting keys.
     map_fn: Arc<dyn MapFn>,
     /// Current tree size (number of entries indexed).
@@ -170,6 +189,12 @@ impl VerifiableIndex {
     /// Maximum number of keys that can be represented for one entry in the WAL.
     const MAX_KEYS_PER_ENTRY: usize = u8::MAX as usize;
 
+    /// Default number of new entries between snapshots.
+    ///
+    /// Each snapshot durably captures the full index and truncates the WAL,
+    /// bounding WAL size and startup replay time.
+    const DEFAULT_SNAPSHOT_INTERVAL: u64 = 100_000;
+
     /// Read max_keys from environment or use default.
     fn get_max_keys() -> usize {
         std::env::var("VINDEX_MAX_KEYS")
@@ -178,6 +203,14 @@ impl VerifiableIndex {
             .unwrap_or(Self::DEFAULT_MAX_KEYS)
     }
 
+    /// Read the snapshot interval from the environment or use the default.
+    fn get_snapshot_interval() -> u64 {
+        std::env::var("VINDEX_SNAPSHOT_INTERVAL")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(Self::DEFAULT_SNAPSHOT_INTERVAL)
+    }
+
     /// Read max_indices_per_key from environment or use default.
     fn get_max_indices_per_key() -> usize {
         std::env::var("VINDEX_MAX_INDICES_PER_KEY")
@@ -209,6 +242,9 @@ impl VerifiableIndex {
         Self {
             index: RwLock::new(HashMap::new()),
             wal_writer: None,
+            snapshot_file: None,
+            entries_since_snapshot: AtomicU64::new(0),
+            snapshot_interval: 0,
             map_fn,
             tree_size: RwLock::new(0),
             prefix_tree: RwLock::new(PrefixTree::new()),
@@ -253,37 +289,69 @@ impl VerifiableIndex {
         batch_size: usize,
     ) -> Result<Self> {
         let wal_path = wal_path.as_ref();
+        let snapshot_file = snapshot::snapshot_path(wal_path);
+
+        // Load the last snapshot if present. An invalid snapshot is treated
+        // as absent (read_snapshot logs a warning); the coverage check below
+        // then fails and the caller can rebuild from log storage.
+        let (mut index, base_size) = match snapshot::read_snapshot(&snapshot_file) {
+            Some((size, idx)) => {
+                tracing::info!(
+                    "Vindex snapshot loaded: tree_size={}, {} keys",
+                    size,
+                    idx.len()
+                );
+                (idx, size)
+            }
+            None => (HashMap::new(), 0),
+        };
+
+        if base_size > expected_tree_size {
+            return Err(Error::Internal(format!(
+                "vindex snapshot is ahead of database state: snapshot tree_size={}, database \
+                 integrated_size={}. Rebuild the vindex before enabling it.",
+                base_size, expected_tree_size
+            )));
+        }
 
         // Validate and truncate WAL to match expected tree size
         // This is critical for crash recovery: if the WAL was flushed but the database
         // wasn't updated before a crash, we truncate the WAL to avoid duplicates
         let actual_wal_size = validate_and_truncate_wal(wal_path, expected_tree_size)?;
         tracing::info!(
-            "WAL validated: expected_tree_size={}, actual_wal_size={}",
+            "WAL validated: expected_tree_size={}, snapshot_size={}, actual_wal_size={}",
             expected_tree_size,
+            base_size,
             actual_wal_size
         );
 
-        if actual_wal_size < expected_tree_size {
+        let covered = base_size.max(actual_wal_size);
+        if covered < expected_tree_size {
             return Err(Error::Internal(format!(
-                "vindex WAL is behind database state: WAL tree_size={}, database integrated_size={}. \
-                 Rebuild the vindex before enabling it.",
-                actual_wal_size, expected_tree_size
+                "vindex state is behind the database: snapshot+WAL cover tree_size={}, database \
+                 integrated_size={}. Rebuild the vindex before enabling it.",
+                covered, expected_tree_size
             )));
         }
 
-        // Create or open WAL
-        let mut tree_size = 0u64;
-        let mut index: HashMap<IndexKey, Vec<LogIndex>> = HashMap::new();
+        let mut tree_size = base_size;
         let mut prefix_tree = PrefixTree::new();
 
-        // Track seen (idx, key) pairs to prevent duplicates (defense in depth)
+        // Track seen (idx, key) pairs to prevent duplicates (defense in depth).
+        // Only WAL entries after the snapshot are replayed, so this set is
+        // bounded by the snapshot interval, not the full history.
         let mut seen: HashSet<(u64, IndexKey)> = HashSet::new();
 
-        // Replay existing WAL if it exists
+        // Replay WAL entries on top of the snapshot
         if wal_path.exists() {
             let mut reader = WalReader::open(wal_path)?;
             while let Some((idx, keys)) = reader.next_entry()? {
+                // Entries whose index is below the snapshot's tree size are
+                // already in the snapshot (e.g. after a crash between snapshot
+                // write and WAL truncation).
+                if idx.value() < base_size {
+                    continue;
+                }
                 for key in keys {
                     // Deduplicate: only add if we haven't seen this (idx, key) pair
                     if seen.insert((idx.value(), key)) {
@@ -292,20 +360,20 @@ impl VerifiableIndex {
                 }
                 tree_size = tree_size.max(idx.value() + 1);
             }
+        }
 
-            // Rebuild prefix tree from index
-            for (key, indices) in &index {
-                let value_hash = compute_value_hash(indices);
-                prefix_tree.insert(key, value_hash);
-            }
-
-            tracing::info!(
-                "WAL replayed: {} unique keys, tree_size={}",
-                index.len(),
-                tree_size
-            );
+        // Rebuild prefix tree from the combined index
+        for (key, indices) in &index {
+            let value_hash = compute_value_hash(indices);
+            prefix_tree.insert(key, value_hash);
         }
 
+        tracing::info!(
+            "Vindex restored: {} unique keys, tree_size={}",
+            index.len(),
+            tree_size
+        );
+
         // Create writer (batched or unbatched based on batch_size)
         let wal_writer = if batch_size > 1 {
             tracing::info!("Using batched WAL writer with batch_size={}", batch_size);
@@ -316,16 +384,23 @@ impl VerifiableIndex {
 
         let max_keys = Self::get_max_keys();
         let max_indices_per_key = Self::get_max_indices_per_key();
+        let snapshot_interval = Self::get_snapshot_interval();
 
         tracing::info!(
-            "VerifiableIndex limits: max_keys={}, max_indices_per_key={}",
+            "VerifiableIndex limits: max_keys={}, max_indices_per_key={}, snapshot_interval={}",
             max_keys,
-            max_indices_per_key
+            max_indices_per_key,
+            snapshot_interval
         );
 
         Ok(Self {
             index: RwLock::new(index),
             wal_writer: Some(RwLock::new(wal_writer)),
+            snapshot_file: Some(snapshot_file),
+            // WAL entries not yet covered by a snapshot count toward the next
+            // snapshot trigger.
+            entries_since_snapshot: AtomicU64::new(tree_size - base_size),
+            snapshot_interval,
             map_fn,
             tree_size: RwLock::new(tree_size),
             prefix_tree: RwLock::new(prefix_tree),
@@ -337,7 +412,15 @@ impl VerifiableIndex {
     /// Index a new entry at the given log index.
     ///
     /// Extracts keys from the entry data and adds them to the index.
+    ///
+    /// Idempotent: entries below the current tree size were already indexed
+    /// and are skipped, so a retried integration cycle (e.g. after a
+    /// transient database error) cannot duplicate indices.
     pub fn index_entry(&self, idx: LogIndex, data: &[u8]) -> Result<()> {
+        if idx.value() < *self.tree_size.read().unwrap() {
+            return Ok(());
+        }
+
         let keys = self.map_fn.map(data);
 
         if keys.len() > Self::MAX_KEYS_PER_ENTRY {
@@ -356,8 +439,11 @@ impl VerifiableIndex {
             }
 
             // Still need to update tree size even if no keys
-            let mut tree_size = self.tree_size.write().unwrap();
-            *tree_size = (*tree_size).max(idx.value() + 1);
+            {
+                let mut tree_size = self.tree_size.write().unwrap();
+                *tree_size = (*tree_size).max(idx.value() + 1);
+            }
+            self.entries_since_snapshot.fetch_add(1, Ordering::Relaxed);
             return Ok(());
         }
 
@@ -419,6 +505,11 @@ impl VerifiableIndex {
 
             for key in &keys {
                 let indices = index.entry(*key).or_default();
+                // Defense in depth: never record the same index twice for a
+                // key (mirrors the dedup applied during WAL replay).
+                if indices.contains(&idx) {
+                    continue;
+                }
                 indices.push(idx);
 
                 // Update the prefix tree with the new value hash
@@ -432,6 +523,7 @@ impl VerifiableIndex {
             let mut tree_size = self.tree_size.write().unwrap();
             *tree_size = (*tree_size).max(idx.value() + 1);
         }
+        self.entries_since_snapshot.fetch_add(1, Ordering::Relaxed);
 
         Ok(())
     }
@@ -444,12 +536,14 @@ impl VerifiableIndex {
 
         let indices = index.get(key).cloned().unwrap_or_default();
         let lookup_proof = prefix_tree.lookup(key);
+        let root_hash = prefix_tree.root_hash();
 
         LookupResult {
             indices,
             tree_size,
             found: lookup_proof.found,
             proof: lookup_proof.proof,
+            root_hash,
         }
     }
 
@@ -479,6 +573,115 @@ impl VerifiableIndex {
         Ok(())
     }
 
+    /// Write a snapshot of the full index, then truncate the WAL.
+    ///
+    /// This bounds WAL growth and startup replay time; it is a no-op when
+    /// either the WAL writer or the snapshot file is absent. The WAL writer
+    /// lock is held for the whole operation so no entry can be appended between
+    /// the snapshot capture and the WAL truncation (it would be lost).
+    pub fn snapshot(&self) -> Result<()> {
+        let (Some(wal), Some(snapshot_file)) = (&self.wal_writer, &self.snapshot_file) else {
+            return Ok(());
+        };
+
+        let mut wal = wal.write().unwrap();
+        wal.flush()?;
+
+        {
+            let index = self.index.read().unwrap();
+            let tree_size = *self.tree_size.read().unwrap();
+            snapshot::write_snapshot(snapshot_file, tree_size, &index)?;
+            tracing::info!(
+                "Vindex snapshot written: tree_size={}, {} keys",
+                tree_size,
+                index.len()
+            );
+        }
+        // Reached only if the snapshot write succeeded (`?` above returns early).
+        wal.truncate()?;
+        self.entries_since_snapshot.store(0, Ordering::Relaxed);
+        Ok(())
+    }
+
+    /// Take a snapshot (compacting the WAL) once at least `snapshot_interval`
+    /// entries were counted since the previous one. An interval of 0 or a
+    /// missing WAL disables this. Returns `Ok(true)` iff a snapshot was written.
+    pub fn maybe_snapshot(&self) -> Result<bool> {
+        let enabled = self.snapshot_interval != 0 && self.wal_writer.is_some();
+        let pending = self.entries_since_snapshot.load(Ordering::Relaxed);
+        if !enabled || pending < self.snapshot_interval {
+            return Ok(false);
+        }
+        self.snapshot()?;
+        Ok(true)
+    }
+
+    /// Rebuild the index by replaying the log's entry bundles from tile storage.
+    ///
+    /// Used when the on-disk vindex state (snapshot + WAL) is missing,
+    /// corrupted, or behind the database; the log is the source of truth, so
+    /// the index can always be reconstructed. Existing snapshot/WAL files at
+    /// `wal_path` are removed first and fresh ones are written.
+    pub async fn rebuild_from_storage(
+        map_fn: Arc<dyn MapFn>,
+        wal_path: impl AsRef<Path>,
+        expected_tree_size: u64,
+        storage: &crate::storage::TileStorage,
+    ) -> Result<Self> {
+        use crate::api::paths::{partial_tile_size, ENTRY_BUNDLE_WIDTH};
+        use crate::types::{PartialSize, TileIndex};
+
+        let wal_path = wal_path.as_ref();
+
+        // Discard whatever state is on disk; removal failures are ignored.
+        let stale_snapshot = snapshot::snapshot_path(wal_path);
+        let _ = std::fs::remove_file(&stale_snapshot);
+        let _ = std::fs::remove_file(wal_path);
+
+        let vi = Self::with_wal(map_fn, wal_path, 0)?;
+        if expected_tree_size == 0 {
+            return Ok(vi);
+        }
+
+        tracing::info!(
+            "Rebuilding vindex from log storage ({} entries)...",
+            expected_tree_size
+        );
+
+        let last_bundle = (expected_tree_size - 1) / ENTRY_BUNDLE_WIDTH;
+        for bundle_idx in 0..=last_bundle {
+            let partial = partial_tile_size(0, bundle_idx, expected_tree_size);
+            let bundle = storage
+                .read_entry_bundle(TileIndex::new(bundle_idx), PartialSize::new(partial))
+                .await?
+                .ok_or_else(|| {
+                    Error::Internal(format!(
+                        "missing entry bundle {} during vindex rebuild",
+                        bundle_idx
+                    ))
+                })?;
+
+            // Pair each entry with its log index; the range bound drops any
+            // entries at or beyond `expected_tree_size`.
+            let first_idx = bundle_idx * ENTRY_BUNDLE_WIDTH;
+            for (idx, data) in (first_idx..expected_tree_size).zip(bundle.entries.iter()) {
+                vi.index_entry(LogIndex::new(idx), data.as_bytes())?;
+            }
+        }
+
+        // NOTE(review): a short bundle leaves tree_size < expected_tree_size; confirm callers check.
+        vi.flush()?;
+        vi.snapshot()?;
+
+        tracing::info!(
+            "Vindex rebuilt: {} keys from {} entries",
+            vi.key_count(),
+            vi.tree_size()
+        );
+
+        Ok(vi)
+    }
+
     /// Get the root hash of the prefix tree.
     ///
     /// This hash commits to the entire index state and can be used
@@ -704,6 +907,134 @@ mod tests {
         assert!(result.is_err());
     }
 
+    #[test]
+    fn test_snapshot_compacts_wal_and_restores() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path().join("vindex.wal");
+
+        let root_before;
+        {
+            let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+            let index = VerifiableIndex::with_wal(map_fn, &path, 0).unwrap();
+            index
+                .index_entry(LogIndex::new(0), br#"{"name": "foo"}"#)
+                .unwrap();
+            index
+                .index_entry(LogIndex::new(1), br#"{"name": "bar"}"#)
+                .unwrap();
+            index.flush().unwrap();
+
+            // Snapshot and compact.
+            index.snapshot().unwrap();
+            assert_eq!(std::fs::metadata(&path).unwrap().len(), 0, "WAL truncated");
+
+            // More entries after the snapshot land in the WAL only.
+            index
+                .index_entry(LogIndex::new(2), br#"{"name": "foo"}"#)
+                .unwrap();
+            index.flush().unwrap();
+            assert!(std::fs::metadata(&path).unwrap().len() > 0);
+
+            root_before = index.root_hash();
+        }
+
+        // Restart: state must be identical (snapshot + WAL replay).
+        let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+        let index = VerifiableIndex::with_wal(map_fn, &path, 3).unwrap();
+        assert_eq!(index.tree_size(), 3);
+        assert_eq!(index.key_count(), 2);
+        assert_eq!(index.root_hash(), root_before);
+
+        let result = index.lookup_string("foo");
+        assert_eq!(
+            result.indices.iter().map(|i| i.value()).collect::<Vec<_>>(),
+            vec![0, 2]
+        );
+    }
+
+    #[test]
+    fn test_crash_between_snapshot_and_truncate() {
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path().join("vindex.wal");
+
+        let root_before;
+        {
+            let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+            let index = VerifiableIndex::with_wal(map_fn, &path, 0).unwrap();
+            index
+                .index_entry(LogIndex::new(0), br#"{"name": "foo"}"#)
+                .unwrap();
+            index
+                .index_entry(LogIndex::new(1), br#"{"name": "bar"}"#)
+                .unwrap();
+            index.flush().unwrap();
+            root_before = index.root_hash();
+
+            // Simulate a crash between snapshot write and WAL truncation:
+            // write the snapshot directly, leaving the full WAL in place.
+            let idx_map = index.index.read().unwrap().clone();
+            snapshot::write_snapshot(&snapshot::snapshot_path(&path), 2, &idx_map).unwrap();
+        }
+        assert!(std::fs::metadata(&path).unwrap().len() > 0, "WAL not truncated");
+
+        // Restart: pre-snapshot WAL entries must not be double-applied.
+        let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+        let index = VerifiableIndex::with_wal(map_fn, &path, 2).unwrap();
+        assert_eq!(index.tree_size(), 2);
+        assert_eq!(index.key_count(), 2);
+        assert_eq!(index.root_hash(), root_before);
+        assert_eq!(index.lookup_string("foo").indices.len(), 1);
+    }
+
+    #[tokio::test]
+    async fn test_rebuild_from_storage() {
+        use crate::merkle::EntryBundle;
+        use crate::storage::TileStorage;
+        use crate::types::{EntryData, PartialSize, TileIndex};
+
+        let temp_dir = tempfile::tempdir().unwrap();
+        let path = temp_dir.path().join("vindex.wal");
+
+        // Log storage with one partial entry bundle of 3 entries.
+        let storage = TileStorage::new(
+            opendal::Operator::new(opendal::services::Memory::default())
+                .unwrap()
+                .finish(),
+        );
+        let entries = vec![
+            EntryData::from(r#"{"name": "foo"}"#),
+            EntryData::from(r#"{"name": "bar"}"#),
+            EntryData::from(r#"{"name": "foo"}"#),
+        ];
+        storage
+            .write_entry_bundle(
+                TileIndex::new(0),
+                PartialSize::new(3),
+                &EntryBundle::with_entries(entries),
+            )
+            .await
+            .unwrap();
+
+        // No WAL/snapshot exists: rebuild from storage.
+        let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+        let index = VerifiableIndex::rebuild_from_storage(map_fn, &path, 3, &storage)
+            .await
+            .unwrap();
+
+        assert_eq!(index.tree_size(), 3);
+        assert_eq!(index.key_count(), 2);
+        let result = index.lookup_string("foo");
+        assert_eq!(
+            result.indices.iter().map(|i| i.value()).collect::<Vec<_>>(),
+            vec![0, 2]
+        );
+
+        // The rebuild persisted a snapshot: a plain restart must now work.
+        let map_fn = Arc::new(JsonKeysMapFn::new("name"));
+        let restored = VerifiableIndex::with_wal(map_fn, &path, 3).unwrap();
+        assert_eq!(restored.root_hash(), index.root_hash());
+    }
+
     #[test]
     fn test_hash_key() {
         let key1 = hash_key("foo");
diff --git a/src/vindex/snapshot.rs b/src/vindex/snapshot.rs
new file mode 100644
index 0000000..7edbadb
--- /dev/null
+++ b/src/vindex/snapshot.rs
@@ -0,0 +1,254 @@
+//! Snapshot persistence for the verifiable index.
+//!
+//! A snapshot is a point-in-time serialization of the full key → indices
+//! map at a given tree size. Together with the WAL it bounds both WAL growth
+//! and startup replay time: after a snapshot at tree size `T` is durably
+//! written, the WAL is truncated and only needs to cover entries `>= T`.
+//!
+//! ## Format
+//!
+//! ```text
+//! magic    "VSNP"                (4 bytes)
+//! version  u8 = 1
+//! tree_size u64 BE
+//! key_count u64 BE
+//! per key:
+//!   key       32 bytes
+//!   idx_count u32 BE
+//!   indices   idx_count * u64 BE
+//! crc32     u32 LE over all preceding bytes
+//! ```
+//!
+//! Snapshots are written to a temp file, fsynced, and renamed into place, so
+//! a crash mid-write can never leave a torn snapshot at the final path. A
+//! snapshot that fails its CRC check is treated as absent (the caller falls
+//! back to rebuilding from log storage).
+
+use crate::error::{Error, Result};
+use crate::types::LogIndex;
+use std::collections::HashMap;
+use std::fs::{File, OpenOptions};
+use std::io::{Read, Write};
+use std::path::{Path, PathBuf};
+
+use super::IndexKey;
+
+const SNAPSHOT_MAGIC: &[u8; 4] = b"VSNP";
+const SNAPSHOT_VERSION: u8 = 1;
+
+/// Derive the snapshot file path by appending `.snapshot` to the WAL path.
+pub fn snapshot_path(wal_path: &Path) -> PathBuf {
+    let mut name = wal_path.as_os_str().to_owned();
+    name.push(".snapshot");
+    name.into()
+}
+
+/// Serialize `index` and durably write it to `path` (temp file + fsync + rename).
+pub fn write_snapshot(
+    path: &Path,
+    tree_size: u64,
+    index: &HashMap<IndexKey, Vec<LogIndex>>,
+) -> Result<()> {
+    // NOTE(review): capacity hint only; the fixed header is 21 bytes (+4 CRC), not 17.
+    let mut buf = Vec::with_capacity(17 + index.len() * 48);
+    buf.extend_from_slice(SNAPSHOT_MAGIC);
+    buf.push(SNAPSHOT_VERSION);
+    buf.extend_from_slice(&tree_size.to_be_bytes());
+    buf.extend_from_slice(&(index.len() as u64).to_be_bytes());
+    for (key, indices) in index {
+        buf.extend_from_slice(key);
+        buf.extend_from_slice(&(indices.len() as u32).to_be_bytes());
+        for idx in indices {
+            buf.extend_from_slice(&idx.value().to_be_bytes());
+        }
+    }
+
+    let crc = crc32fast::hash(&buf);
+    buf.extend_from_slice(&crc.to_le_bytes());
+
+    let tmp_path = {
+        let mut os = path.as_os_str().to_os_string();
+        os.push(".tmp");
+        PathBuf::from(os)
+    };
+    // Scoped so the temp file handle is dropped before the rename below.
+    {
+        let mut tmp = OpenOptions::new()
+            .create(true)
+            .write(true)
+            .truncate(true)
+            .open(&tmp_path)
+            .map_err(|e| Error::Internal(format!("failed to create snapshot temp file: {}", e)))?;
+        tmp.write_all(&buf)
+            .map_err(|e| Error::Internal(format!("failed to write snapshot: {}", e)))?;
+        tmp.sync_all()
+            .map_err(|e| Error::Internal(format!("failed to sync snapshot: {}", e)))?;
+    }
+
+    std::fs::rename(&tmp_path, path)
+        .map_err(|e| Error::Internal(format!("failed to rename snapshot into place: {}", e)))?;
+
+    // Best-effort fsync of the parent directory so the rename itself is durable.
+    if let Some(parent) = path.parent() {
+        let dir = if parent.as_os_str().is_empty() {
+            Path::new(".")
+        } else {
+            parent
+        };
+        if let Ok(dir_file) = File::open(dir) {
+            let _ = dir_file.sync_all();
+        }
+    }
+
+    Ok(())
+}
+
+/// Read a snapshot. Returns `None` if the file does not exist, cannot be
+/// read, or fails validation (magic, version, structure, CRC) — a bad
+/// snapshot is treated as absent so the caller can fall back to rebuilding.
+pub fn read_snapshot(path: &Path) -> Option<(u64, HashMap<IndexKey, Vec<LogIndex>>)> {
+    if !path.exists() {
+        return None;
+    }
+
+    let mut data = Vec::new();
+    match File::open(path).and_then(|mut f| f.read_to_end(&mut data)) {
+        Ok(_) => {}
+        Err(e) => {
+            tracing::warn!("Failed to read vindex snapshot {}: {}", path.display(), e);
+            return None;
+        }
+    }
+
+    parse_snapshot(&data).map_err(|e| {
+        tracing::warn!(
+            "Ignoring invalid vindex snapshot {}: {}",
+            path.display(),
+            e
+        );
+    })
+    .ok()
+}
+
+fn parse_snapshot(data: &[u8]) -> Result<(u64, HashMap<IndexKey, Vec<LogIndex>>)> {
+    // magic(4) + version(1) + tree_size(8) + key_count(8) + crc(4)
+    if data.len() < 25 {
+        return Err(Error::Internal("snapshot too short".into()));
+    }
+
+    let (body, crc_bytes) = data.split_at(data.len() - 4);
+    let stored_crc = u32::from_le_bytes(crc_bytes.try_into().unwrap());
+    if crc32fast::hash(body) != stored_crc {
+        return Err(Error::Internal("snapshot checksum mismatch".into()));
+    }
+
+    if &body[0..4] != SNAPSHOT_MAGIC {
+        return Err(Error::Internal("bad snapshot magic".into()));
+    }
+    if body[4] != SNAPSHOT_VERSION {
+        return Err(Error::Internal(format!(
+            "unsupported snapshot version {}",
+            body[4]
+        )));
+    }
+
+    let tree_size = u64::from_be_bytes(body[5..13].try_into().unwrap());
+    let key_count = u64::from_be_bytes(body[13..21].try_into().unwrap()) as usize;
+    let mut pos = 21usize;
+    // Clamp the preallocation: a bogus key_count must fail parsing, not panic or OOM.
+    let mut index = HashMap::with_capacity(key_count.min((body.len() - pos) / 36));
+    for _ in 0..key_count {
+        if body.len() < pos + 36 {
+            return Err(Error::Internal("snapshot truncated in key record".into()));
+        }
+        let key: IndexKey = body[pos..pos + 32].try_into().unwrap();
+        let idx_count = u32::from_be_bytes(body[pos + 32..pos + 36].try_into().unwrap()) as usize;
+        pos += 36;
+        // Divide instead of multiplying so the bound check cannot overflow usize.
+        if (body.len() - pos) / 8 < idx_count {
+            return Err(Error::Internal("snapshot truncated in index list".into()));
+        }
+        let mut indices = Vec::with_capacity(idx_count);
+        for i in 0..idx_count {
+            let v = u64::from_be_bytes(body[pos + i * 8..pos + i * 8 + 8].try_into().unwrap());
+            indices.push(LogIndex::new(v));
+        }
+        pos += idx_count * 8;
+        index.insert(key, indices);
+    }
+
+    if pos != body.len() {
+        return Err(Error::Internal("snapshot has trailing bytes".into()));
+    }
+
+    Ok((tree_size, index))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn sample_index() -> HashMap<IndexKey, Vec<LogIndex>> {
+        let mut index = HashMap::new();
+        index.insert([1u8; 32], vec![LogIndex::new(0), LogIndex::new(5)]);
+        index.insert([2u8; 32], vec![LogIndex::new(3)]);
+        index.insert([3u8; 32], vec![]);
+        index
+    }
+
+    #[test]
+    fn test_snapshot_roundtrip() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("test.wal.snapshot");
+
+        let index = sample_index();
+        write_snapshot(&path, 6, &index).unwrap();
+
+        let (tree_size, loaded) = read_snapshot(&path).unwrap();
+        assert_eq!(tree_size, 6);
+        assert_eq!(loaded, index);
+    }
+
+    #[test]
+    fn test_missing_snapshot_is_none() {
+        let dir = tempfile::tempdir().unwrap();
+        assert!(read_snapshot(&dir.path().join("nope.snapshot")).is_none());
+    }
+
+    #[test]
+    fn test_corrupt_snapshot_is_ignored() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("test.wal.snapshot");
+
+        write_snapshot(&path, 6, &sample_index()).unwrap();
+
+        // Flip a byte in the middle.
+        let mut data = std::fs::read(&path).unwrap();
+        let mid = data.len() / 2;
+        data[mid] ^= 0xFF;
+        std::fs::write(&path, &data).unwrap();
+
+        assert!(read_snapshot(&path).is_none());
+    }
+
+    #[test]
+    fn test_truncated_snapshot_is_ignored() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("test.wal.snapshot");
+
+        write_snapshot(&path, 6, &sample_index()).unwrap();
+
+        let data = std::fs::read(&path).unwrap();
+        std::fs::write(&path, &data[..data.len() - 10]).unwrap();
+
+        assert!(read_snapshot(&path).is_none());
+    }
+
+    #[test]
+    fn test_snapshot_path_suffix() {
+        assert_eq!(
+            snapshot_path(Path::new("/data/vindex.wal")),
+            PathBuf::from("/data/vindex.wal.snapshot")
+        );
+    }
+}
diff --git a/src/vindex/wal.rs b/src/vindex/wal.rs
index e189315..4e3267a 100644
--- a/src/vindex/wal.rs
+++ b/src/vindex/wal.rs
@@ -1,37 +1,51 @@
 //! Write Ahead Log (WAL) for verifiable index persistence.
 //!
-//! The WAL supports two formats:
-//!
-//! ## Text format (legacy):
+//! ## Binary format v3 (current, checksummed):
 //! ```text
-//! <index> <hex_key1> <hex_key2> ...
+//! [u8 version=3][u64 index][u8 key_count][32*key_count bytes of keys][u32 crc32 LE]
 //! ```
-//! Cost: ~336 bytes for entry with 5 keys
+//! The CRC32 covers everything from the version byte through the last key
+//! byte, so bit rot and torn writes are detected instead of being replayed
+//! as wrong keys.
 //!
-//! ## Binary format (v2):
+//! ## Binary format v2 (legacy, read-only):
 //! ```text
 //! [u8 version=2][u64 index][u8 key_count][32*key_count bytes of keys]
 //! ```
-//! Cost: ~169 bytes for entry with 5 keys (50% savings)
-//!
-//! The reader auto-detects format by checking the first byte:
-//! - ASCII digit (0-9) → text format
-//! - 0x02 → binary format v2
 //!
-//! On startup, the WAL is validated and truncated to match the expected
-//! tree size from the database. This prevents duplicate entries after a crash.
+//! On startup, the WAL is validated and truncated:
+//! - Entries with `index >= expected_tree_size` (from the database) are
+//!   truncated — the WAL ran ahead of the database before a crash.
+//! - A torn or corrupted tail (crash mid-write, bit rot) is truncated at the
+//!   last fully-valid entry instead of failing startup.
 
 use crate::error::{Error, Result};
 use crate::types::LogIndex;
 use std::fs::{File, OpenOptions};
-use std::io::{BufReader, BufWriter, Read, Write};
+use std::io::{BufReader, Read, Write};
 use std::path::Path;
 
 use super::IndexKey;
 
-/// WAL format version (binary only).
+/// Legacy binary WAL format version (no checksum, read-only support).
 const WAL_VERSION_BINARY: u8 = 2;
 
+/// Checksummed binary WAL format version (current write format).
+const WAL_VERSION_CRC: u8 = 3;
+
+/// Append one WAL entry to `buf` in the checksummed v3 layout: version byte,
+/// big-endian index, key count, raw keys, then a little-endian CRC32.
+fn encode_entry(buf: &mut Vec<u8>, idx: LogIndex, keys: &[IndexKey]) {
+    let entry_start = buf.len();
+    buf.push(WAL_VERSION_CRC);
+    buf.extend_from_slice(&idx.value().to_be_bytes());
+    buf.push(keys.len() as u8);
+    keys.iter().for_each(|key| buf.extend_from_slice(key));
+    // Checksum only this entry's bytes, not anything already in `buf`.
+    let checksum = crc32fast::hash(&buf[entry_start..]);
+    buf.extend_from_slice(&checksum.to_le_bytes());
+}
+
 /// Binary WAL writer (now the default and only format).
 ///
 /// Format per entry: [version=0x02][u64 index][u8 key_count][32*key_count bytes]
@@ -51,19 +65,13 @@ pub type WalWriter = BinaryWalWriter;
 /// Expected improvement: 5-10x throughput over unbatched text format
 pub type BatchedWalWriter = BatchedBinaryWalWriter;
 
-/// Binary WAL writer for efficient storage.
+/// Binary WAL writer for efficient storage (v3 checksummed format).
 ///
-/// Format per entry:
-/// ```text
-/// [u8 version=2][u64 index][u8 key_count][32*key_count bytes]
-/// ```
-///
-/// Benefits over text format:
-/// - 50% space savings (169 bytes vs 336 bytes for 5 keys)
-/// - 20-30% faster I/O (no hex encoding/decoding)
-/// - Simpler parsing (no string allocation)
+/// Each entry is serialized to a buffer and written with a single
+/// `write_all` call, so a failed in-process write cannot leave a partial
+/// entry interleaved with later entries.
 pub struct BinaryWalWriter {
-    writer: BufWriter<File>,
+    file: File,
 }
 
 impl BinaryWalWriter {
@@ -75,14 +83,10 @@ impl BinaryWalWriter {
             .open(path.as_ref())
             .map_err(|e| Error::Internal(format!("failed to open WAL: {}", e)))?;
 
-        Ok(Self {
-            writer: BufWriter::new(file),
-        })
+        Ok(Self { file })
     }
 
     /// Append an entry to the WAL in binary format.
-    ///
-    /// Format: [u8 version][u64 index][u8 key_count][keys...]
     pub fn append(&mut self, idx: LogIndex, keys: &[IndexKey]) -> Result<()> {
         if keys.len() > u8::MAX as usize {
             return Err(Error::InvalidEntry(format!(
@@ -93,45 +97,34 @@ impl BinaryWalWriter {
             )));
         }
 
-        // Version marker
-        self.writer
-            .write_all(&[WAL_VERSION_BINARY])
-            .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-
-        // Index (8 bytes, big-endian for readability in hex dumps)
-        self.writer
-            .write_all(&idx.value().to_be_bytes())
+        let mut buf = Vec::with_capacity(14 + keys.len() * 32);
+        encode_entry(&mut buf, idx, keys);
+        self.file
+            .write_all(&buf)
             .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
 
-        // Key count (1 byte, limiting to 255 keys per entry)
-        let key_count = keys.len() as u8;
-        self.writer
-            .write_all(&[key_count])
-            .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-
-        // Keys (32 bytes each)
-        for key in keys {
-            self.writer
-                .write_all(key)
-                .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-        }
-
         Ok(())
     }
 
     /// Flush the WAL to disk with fsync for durability.
     pub fn flush(&mut self) -> Result<()> {
-        self.writer
-            .flush()
-            .map_err(|e| Error::Internal(format!("failed to flush WAL: {}", e)))?;
-
-        self.writer
-            .get_ref()
+        self.file
             .sync_data()
             .map_err(|e| Error::Internal(format!("failed to sync WAL to disk: {}", e)))?;
 
         Ok(())
     }
+
+    /// Truncate the WAL to zero length (after a snapshot has been written).
+    pub fn truncate(&mut self) -> Result<()> {
+        self.file
+            .set_len(0)
+            .map_err(|e| Error::Internal(format!("failed to truncate WAL: {}", e)))?;
+        self.file
+            .sync_all()
+            .map_err(|e| Error::Internal(format!("failed to sync truncated WAL: {}", e)))?;
+        Ok(())
+    }
 }
 
 /// Batched binary WAL writer combining batching with binary format.
@@ -144,7 +137,7 @@ impl BinaryWalWriter {
 pub struct BatchedBinaryWalWriter {
     buffer: Vec<(LogIndex, Vec<IndexKey>)>,
     batch_size: usize,
-    writer: BufWriter<File>,
+    file: File,
 }
 
 impl BatchedBinaryWalWriter {
@@ -159,7 +152,7 @@ impl BatchedBinaryWalWriter {
         Ok(Self {
             buffer: Vec::with_capacity(batch_size),
             batch_size,
-            writer: BufWriter::new(file),
+            file,
         })
     }
 
@@ -192,6 +185,12 @@ impl BatchedBinaryWalWriter {
     }
 
     /// Internal method to flush the current batch in binary format.
+    ///
+    /// The whole batch is serialized and written with a single `write_all`
+    /// and a single fsync. On error the buffer is retained, so a retry
+    /// rewrites the full batch; duplicated entries are harmless because
+    /// replay deduplicates (idx, key) pairs, and torn fragments are removed
+    /// by CRC-validated truncation on startup.
     fn flush_batch(&mut self) -> Result<()> {
         if self.buffer.is_empty() {
             return Ok(());
@@ -199,40 +198,17 @@ impl BatchedBinaryWalWriter {
 
         tracing::debug!("Flushing binary WAL batch of {} entries", self.buffer.len());
 
-        // Write all buffered entries in binary format
+        let mut buf = Vec::with_capacity(self.buffer.iter().map(|(_, k)| 14 + k.len() * 32).sum());
         for (idx, keys) in &self.buffer {
-            // Version marker
-            self.writer
-                .write_all(&[WAL_VERSION_BINARY])
-                .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-
-            // Index
-            self.writer
-                .write_all(&idx.value().to_be_bytes())
-                .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-
-            // Key count
-            let key_count = keys.len() as u8;
-            self.writer
-                .write_all(&[key_count])
-                .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-
-            // Keys
-            for key in keys {
-                self.writer
-                    .write_all(key)
-                    .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
-            }
+            encode_entry(&mut buf, *idx, keys);
         }
 
-        // Flush buffer to OS
-        self.writer
-            .flush()
-            .map_err(|e| Error::Internal(format!("failed to flush WAL: {}", e)))?;
+        self.file
+            .write_all(&buf)
+            .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?;
 
         // Single fsync for entire batch
-        self.writer
-            .get_ref()
+        self.file
             .sync_data()
             .map_err(|e| Error::Internal(format!("failed to sync WAL to disk: {}", e)))?;
 
@@ -246,13 +222,32 @@ impl BatchedBinaryWalWriter {
     pub fn buffered_count(&self) -> usize {
         self.buffer.len()
     }
+
+    /// Truncate the WAL to zero length (after a snapshot has been written).
+    ///
+    /// The in-memory buffer must be empty (call [`flush`](Self::flush) first).
+    pub fn truncate(&mut self) -> Result<()> {
+        if !self.buffer.is_empty() {
+            return Err(Error::Internal(
+                "cannot truncate WAL with buffered entries; flush first".into(),
+            ));
+        }
+        self.file
+            .set_len(0)
+            .map_err(|e| Error::Internal(format!("failed to truncate WAL: {}", e)))?;
+        self.file
+            .sync_all()
+            .map_err(|e| Error::Internal(format!("failed to sync truncated WAL: {}", e)))?;
+        Ok(())
+    }
 }
 
-/// WAL reader for replaying entries in binary format.
-///
-/// Format per entry: [version=0x02][u64 index][u8 key_count][32*key_count bytes]
+/// WAL reader for replaying entries in binary format (v2 legacy and v3
+/// checksummed).
 pub struct WalReader {
     reader: BufReader<File>,
+    /// Byte offset just past the last successfully-parsed entry.
+    valid_pos: u64,
 }
 
 impl WalReader {
@@ -263,12 +258,21 @@ impl WalReader {
 
         Ok(Self {
             reader: BufReader::new(file),
+            valid_pos: 0,
         })
     }
 
-    /// Read the next entry from the WAL in binary format.
+    /// Byte offset just past the last entry successfully returned by
+    /// [`next_entry`](Self::next_entry). Used for truncating a corrupted tail.
+    pub fn valid_pos(&self) -> u64 {
+        self.valid_pos
+    }
+
+    /// Read the next entry from the WAL.
     ///
-    /// Returns `Ok(None)` when EOF is reached.
+    /// Returns `Ok(None)` on clean EOF. A torn tail or corrupted entry
+    /// (bad version byte, short read, checksum mismatch) returns an error;
+    /// callers recovering from a crash should truncate at [`valid_pos`](Self::valid_pos).
     pub fn next_entry(&mut self) -> Result<Option<(LogIndex, Vec<IndexKey>)>> {
         // Read version byte
         let mut version = [0u8; 1];
@@ -278,10 +282,10 @@ impl WalReader {
             Err(e) => return Err(Error::Internal(format!("failed to read from WAL: {}", e))),
         }
 
-        if version[0] != WAL_VERSION_BINARY {
+        if version[0] != WAL_VERSION_BINARY && version[0] != WAL_VERSION_CRC {
             return Err(Error::Internal(format!(
-                "invalid WAL version: expected 0x{:02x}, got 0x{:02x}",
-                WAL_VERSION_BINARY, version[0]
+                "invalid WAL version: expected 0x{:02x} or 0x{:02x}, got 0x{:02x}",
+                WAL_VERSION_BINARY, WAL_VERSION_CRC, version[0]
             )));
         }
 
@@ -309,18 +313,49 @@ impl WalReader {
             keys.push(key);
         }
 
+        let mut entry_size = 1 + 8 + 1 + key_count as u64 * 32;
+
+        // v3: verify the trailing CRC32 over version..keys.
+        if version[0] == WAL_VERSION_CRC {
+            let mut crc_bytes = [0u8; 4];
+            self.reader
+                .read_exact(&mut crc_bytes)
+                .map_err(|e| Error::Internal(format!("failed to read checksum from WAL: {}", e)))?;
+            let stored_crc = u32::from_le_bytes(crc_bytes);
+
+            let mut hasher = crc32fast::Hasher::new();
+            hasher.update(&version);
+            hasher.update(&idx_bytes);
+            hasher.update(&count_byte);
+            for key in &keys {
+                hasher.update(key);
+            }
+            if hasher.finalize() != stored_crc {
+                return Err(Error::Internal(format!(
+                    "WAL checksum mismatch for entry {}",
+                    idx
+                )));
+            }
+            entry_size += 4;
+        }
+
+        self.valid_pos += entry_size;
         Ok(Some((LogIndex::new(idx), keys)))
     }
 }
 
 /// Validate and truncate the binary WAL file to match the expected tree size.
 ///
-/// This function reads the WAL to find all entries and truncates any entries
-/// with index >= expected_tree_size. This is critical for crash recovery: if
-/// the WAL was flushed but the database wasn't updated before a crash, we need
-/// to truncate the WAL to match the database state to avoid duplicate entries.
+/// Two kinds of tail are removed:
+/// - Entries with `index >= expected_tree_size`: the WAL was flushed but the
+///   database wasn't updated before a crash, so the WAL ran ahead. (The
+///   worker always writes the WAL before marking entries integrated, so the
+///   WAL can only ever be ahead of — never behind — the database.)
+/// - A torn or corrupted tail (crash mid-write, checksum mismatch): the scan
+///   stops at the last fully-valid entry and everything after is truncated.
 ///
-/// Returns the actual tree size found in the WAL (may be less than expected if WAL is behind).
+/// Returns the actual tree size found in the WAL (may be less than expected
+/// if the WAL is behind; callers treat that as fatal).
 pub fn validate_and_truncate_wal(path: impl AsRef<Path>, expected_tree_size: u64) -> Result<u64> {
     let path = path.as_ref();
 
@@ -333,27 +368,42 @@ pub fn validate_and_truncate_wal(path: impl AsRef<Path>, expected_tree_size: u64
     let mut last_valid_pos: u64 = 0;
     let mut max_valid_idx: Option<u64> = None;
 
-    // Calculate entry sizes as we read to find truncation point
-    while let Some((idx, keys)) = reader.next_entry()? {
-        let idx_val = idx.value();
-
-        if expected_tree_size == 0 || idx_val < expected_tree_size {
-            // This entry is within bounds
-            // Binary format: 1 byte version + 8 bytes index + 1 byte count + 32*count bytes keys
-            let entry_size = 1 + 8 + 1 + (keys.len() as u64 * 32);
-            last_valid_pos += entry_size;
-            max_valid_idx = Some(match max_valid_idx {
-                Some(prev) => prev.max(idx_val),
-                None => idx_val,
-            });
-        } else {
-            // Entry is beyond expected tree size - stop here
-            tracing::warn!(
-                "WAL entry {} >= expected tree size {}, truncating",
-                idx_val,
-                expected_tree_size
-            );
-            break;
+    loop {
+        match reader.next_entry() {
+            Ok(Some((idx, _keys))) => {
+                let idx_val = idx.value();
+
+                if idx_val < expected_tree_size {
+                    // This entry is within bounds
+                    last_valid_pos = reader.valid_pos();
+                    max_valid_idx = Some(match max_valid_idx {
+                        Some(prev) => prev.max(idx_val),
+                        None => idx_val,
+                    });
+                } else {
+                    // Entry is beyond expected tree size - stop here
+                    tracing::warn!(
+                        "WAL entry {} >= expected tree size {}, truncating",
+                        idx_val,
+                        expected_tree_size
+                    );
+                    break;
+                }
+            }
+            Ok(None) => break,
+            Err(e) => {
+                // Torn write or bit rot in the tail. Truncate at the last
+                // valid entry instead of refusing to start; this is exactly
+                // the crash the WAL exists to survive. Anything the WAL
+                // loses here was, by write ordering, never marked integrated
+                // in the database (or the caller fails the behind-check).
+                tracing::warn!(
+                    "WAL corrupted at byte {}: {}. Truncating corrupted tail.",
+                    last_valid_pos,
+                    e
+                );
+                break;
+            }
         }
     }
 
@@ -450,12 +500,9 @@ mod tests {
         // Check file size
         let file_size = std::fs::metadata(path).unwrap().len();
 
-        // Binary format: (1 version + 8 index + 1 count + 5*32 keys) * 100 entries
-        // = (1 + 8 + 1 + 160) * 100 = 170 * 100 = 17,000 bytes
-        let expected_size = 170 * 100;
-
-        // Text format would be: ~(5 + 5*65 + 1) * 100 = ~33,100 bytes
-        // Binary saves: ~48% space
+        // Binary v3 format: (1 version + 8 index + 1 count + 5*32 keys + 4 crc) * 100 entries
+        // = (1 + 8 + 1 + 160 + 4) * 100 = 174 * 100 = 17,400 bytes
+        let expected_size = 174 * 100;
 
         println!("Binary format file size: {} bytes", file_size);
         println!("Expected size: {} bytes", expected_size);
@@ -559,6 +606,132 @@ mod tests {
         assert_eq!(actual_size, 3);
     }
 
+    #[test]
+    fn test_torn_tail_is_truncated() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let path = temp_file.path();
+
+        // Write 3 complete entries
+        {
+            let mut writer = WalWriter::open(path).unwrap();
+            let key = [1u8; 32];
+            for i in 0..3 {
+                writer.append(LogIndex::new(i), &[key]).unwrap();
+            }
+            writer.flush().unwrap();
+        }
+
+        // Simulate a crash mid-write: append a partial v3 entry (version,
+        // index and key count, but none of the keys and no checksum).
+        {
+            use std::io::Write;
+            let mut file = OpenOptions::new().append(true).open(path).unwrap();
+            file.write_all(&[3u8]).unwrap(); // format version 3
+            file.write_all(&3u64.to_be_bytes()).unwrap(); // entry index 3
+            file.write_all(&[5u8]).unwrap(); // claims 5 keys, none follow
+            file.sync_all().unwrap();
+        }
+
+        // Recovery must truncate the torn tail and keep the 3 good entries.
+        let actual_size = validate_and_truncate_wal(path, 3).unwrap();
+        assert_eq!(actual_size, 3);
+
+        let mut reader = WalReader::open(path).unwrap();
+        for i in 0..3 {
+            let (idx, _) = reader.next_entry().unwrap().unwrap();
+            assert_eq!(idx.value(), i);
+        }
+        assert!(reader.next_entry().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_corrupted_entry_is_truncated() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let path = temp_file.path();
+
+        {
+            let mut writer = WalWriter::open(path).unwrap();
+            let key = [1u8; 32];
+            for i in 0..3 {
+                writer.append(LogIndex::new(i), &[key]).unwrap();
+            }
+            writer.flush().unwrap();
+        }
+
+        // Corrupt the last entry by overwriting its final key byte with 0xFF
+        // (offset from end: 4 crc + 1 key byte). CRC validation must catch this.
+        {
+            use std::io::{Seek, SeekFrom, Write};
+            let mut file = OpenOptions::new().read(true).write(true).open(path).unwrap();
+            file.seek(SeekFrom::End(-5)).unwrap();
+            file.write_all(&[0xFF]).unwrap();
+            file.sync_all().unwrap();
+        }
+
+        // The corrupted third entry must be truncated; first two survive.
+        let actual_size = validate_and_truncate_wal(path, 3).unwrap();
+        assert_eq!(actual_size, 2);
+
+        let mut reader = WalReader::open(path).unwrap();
+        for i in 0..2 {
+            let (idx, _) = reader.next_entry().unwrap().unwrap();
+            assert_eq!(idx.value(), i);
+        }
+        assert!(reader.next_entry().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_stale_wal_with_zero_expected_size_is_truncated() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let path = temp_file.path();
+
+        // A WAL holding 5 entries, left over from a previous deployment...
+        {
+            let mut writer = WalWriter::open(path).unwrap();
+            let key = [1u8; 32];
+            for i in 0..5 {
+                writer.append(LogIndex::new(i), &[key]).unwrap();
+            }
+            writer.flush().unwrap();
+        }
+
+        // ...must be truncated to an empty file when the database says the log
+        // is empty, instead of replaying entries the log doesn't contain.
+        let actual_size = validate_and_truncate_wal(path, 0).unwrap();
+        assert_eq!(actual_size, 0);
+        assert_eq!(std::fs::metadata(path).unwrap().len(), 0);
+    }
+
+    #[test]
+    fn test_legacy_v2_entries_are_readable() {
+        let temp_file = NamedTempFile::new().unwrap();
+        let path = temp_file.path();
+
+        // Hand-write v2 entries (version byte 2, no trailing CRC) as an old binary would.
+        {
+            use std::io::Write;
+            let mut file = OpenOptions::new().append(true).open(path).unwrap();
+            for i in 0..3u64 {
+                file.write_all(&[2u8]).unwrap(); // format version 2
+                file.write_all(&i.to_be_bytes()).unwrap(); // big-endian entry index
+                file.write_all(&[1u8]).unwrap(); // key count
+                file.write_all(&[7u8; 32]).unwrap(); // one 32-byte key
+            }
+            file.sync_all().unwrap();
+        }
+
+        let actual_size = validate_and_truncate_wal(path, 3).unwrap();
+        assert_eq!(actual_size, 3);
+
+        let mut reader = WalReader::open(path).unwrap();
+        for i in 0..3 {
+            let (idx, keys) = reader.next_entry().unwrap().unwrap();
+            assert_eq!(idx.value(), i);
+            assert_eq!(keys, vec![[7u8; 32]]);
+        }
+        assert!(reader.next_entry().unwrap().is_none());
+    }
+
     #[test]
     fn test_batched_wal_writer_basic() {
         let temp_file = NamedTempFile::new().unwrap();
diff --git a/src/witness/mod.rs b/src/witness/mod.rs
index c7ee4e5..5197693 100644
--- a/src/witness/mod.rs
+++ b/src/witness/mod.rs
@@ -15,12 +15,11 @@ mod verifier;
 mod litewitness_test;
 
 pub use proof::{verify_consistency, ConsistencyProof};
-pub use state::WitnessStateStore;
-pub use verifier::{CheckpointVerifier, LogConfig};
+pub use state::{UpdateOutcome, WitnessStateStore};
+pub use verifier::{parse_vkey, CheckpointVerifier, LogConfig};
 
 use crate::checkpoint::{CheckpointSignature, CheckpointSigner, CosignedCheckpoint};
 use crate::error::{Error, Result};
-use ed25519_dalek::Signer;
 use sea_orm::DatabaseConnection;
 use sigstore_types::Sha256Hash;
 use std::sync::Arc;
@@ -89,6 +88,15 @@ impl Witness {
         let new_size = checkpoint.checkpoint.size.value();
         let new_root = checkpoint.checkpoint.root_hash;
 
+        // Sizes are persisted as i64; reject values that would wrap negative
+        // and defeat the rollback protection.
+        if new_size > i64::MAX as u64 {
+            return Err(WitnessError::BadRequest(format!(
+                "checkpoint size {} exceeds supported maximum",
+                new_size
+            )));
+        }
+
         // 4. Validate old_size constraints
         if request.old_size > new_size {
             return Err(WitnessError::BadRequest(format!(
@@ -143,21 +151,35 @@ impl Witness {
             ));
         }
 
-        // 8. Create cosignature
-        let body = checkpoint.checkpoint.to_body();
-        let signature = self.signer.signing_key_ref().sign(body.as_bytes());
-        let cosig = CheckpointSignature {
-            name: self.signer.name().clone(),
-            key_id: self.signer.key_id().clone(),
-            signature,
-        };
-
-        // 9. Update state
-        self.state_store
-            .update(origin, new_size, new_root, &request.checkpoint)
+        // 8. Persist state with a compare-and-swap against the state we
+        // verified the proof for. If a concurrent request advanced the state
+        // in the meantime, we must NOT cosign: the proof we verified may
+        // extend a different view of the tree than the one now persisted.
+        let outcome = self
+            .state_store
+            .update(
+                origin,
+                state.size,
+                &state.root_hash,
+                new_size,
+                new_root,
+                &request.checkpoint,
+            )
             .await
             .map_err(|e| WitnessError::Internal(format!("failed to update state: {}", e)))?;
 
+        if let UpdateOutcome::Conflict { current_size } = outcome {
+            return Err(WitnessError::Conflict(current_size));
+        }
+
+        // 9. Create the cosignature/v1 only after the state is durably
+        // updated (c2sp.org/tlog-cosignature: timestamped, alg-0x04 key ID).
+        let timestamp = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .map_err(|e| WitnessError::Internal(format!("system clock error: {}", e)))?
+            .as_secs();
+        let cosig = self.signer.cosign_v1(&checkpoint.checkpoint, timestamp);
+
         Ok(cosig)
     }
 
diff --git a/src/witness/state.rs b/src/witness/state.rs
index f86cab6..03b6016 100644
--- a/src/witness/state.rs
+++ b/src/witness/state.rs
@@ -87,16 +87,42 @@ impl WitnessStateStore {
         })
     }
 
-    /// Update the witnessed state for a log.
+    /// Update the witnessed state for a log, compare-and-swap style.
     ///
-    /// This is called after successfully verifying a consistency proof.
+    /// This is called after successfully verifying a consistency proof. The
+    /// verification happened against a state read earlier (`expected_size`,
+    /// `expected_root`); this method re-checks under a row lock that the
+    /// persisted state still matches. Without this check, two concurrent
+    /// requests could each verify a proof against the same old state and the
+    /// witness would end up cosigning two conflicting roots at the same size
+    /// (a split view).
+    ///
+    /// Returns `UpdateOutcome::Conflict` if the persisted state no longer
+    /// matches the expected state; the caller must re-read and re-verify.
     pub async fn update(
         &self,
         origin: &str,
+        expected_size: u64,
+        expected_root: &Sha256Hash,
         size: u64,
         root_hash: Sha256Hash,
         checkpoint: &str,
-    ) -> Result<()> {
+    ) -> Result<UpdateOutcome> {
+        // Sizes are stored as i64; reject values that would wrap negative and
+        // corrupt the monotonicity comparison.
+        if size > i64::MAX as u64 || expected_size > i64::MAX as u64 {
+            return Err(Error::InvalidEntry(format!(
+                "tree size {} exceeds supported maximum",
+                size.max(expected_size)
+            )));
+        }
+        if size < expected_size {
+            return Err(Error::InvalidEntry(format!(
+                "size rollback not allowed: current size {} > new size {}",
+                expected_size, size
+            )));
+        }
+
         let txn = self.conn.begin().await?;
 
         // Lock and get current state
@@ -107,12 +133,15 @@ impl WitnessStateStore {
 
         match current {
             Some(model) => {
-                // Prevent size rollback: new size must be >= current size
-                if (size as i64) < model.size {
-                    return Err(Error::InvalidEntry(format!(
-                        "size rollback not allowed: current size {} > new size {}",
-                        model.size, size
-                    )));
+                // CAS check: the state must not have moved since the caller
+                // verified the consistency proof.
+                if model.size as u64 != expected_size
+                    || model.root_hash != expected_root.as_bytes().to_vec()
+                {
+                    txn.rollback().await?;
+                    return Ok(UpdateOutcome::Conflict {
+                        current_size: model.size as u64,
+                    });
                 }
                 // Update existing
                 witness_state::Entity::update(witness_state::ActiveModel {
@@ -126,6 +155,12 @@ impl WitnessStateStore {
                 .await?;
             }
             None => {
+                // Callers always go through get_or_init first, so an absent
+                // row means the expected state is the initial empty state.
+                if expected_size != 0 {
+                    txn.rollback().await?;
+                    return Ok(UpdateOutcome::Conflict { current_size: 0 });
+                }
                 // Insert new
                 witness_state::Entity::insert(witness_state::ActiveModel {
                     origin: ActiveValue::Set(origin.to_string()),
@@ -140,7 +175,7 @@ impl WitnessStateStore {
         }
 
         txn.commit().await?;
-        Ok(())
+        Ok(UpdateOutcome::Updated)
     }
 
     /// List all witnessed logs.
@@ -161,6 +196,18 @@ impl WitnessStateStore {
     }
 }
 
+/// Outcome of the compare-and-swap performed by [`WitnessStateStore::update`].
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum UpdateOutcome {
+    /// The persisted state matched the expected state and the new state was committed.
+    Updated,
+    /// The persisted state did not match the caller's expected state; nothing was written.
+    Conflict {
+        /// The size currently persisted for this log (0 if no row exists yet).
+        current_size: u64,
+    },
+}
+
 /// RFC 6962 empty tree root hash.
 fn empty_root_hash() -> Sha256Hash {
     Sha256Hash::from_bytes([
diff --git a/src/witness/verifier.rs b/src/witness/verifier.rs
index d7aa6da..e2799a9 100644
--- a/src/witness/verifier.rs
+++ b/src/witness/verifier.rs
@@ -9,6 +9,9 @@ use sha2::{Digest, Sha256};
 /// Ed25519 algorithm identifier for note format.
 const ALG_ED25519: u8 = 0x01;
 
+/// Ed25519 cosignature/v1 algorithm identifier (c2sp.org/tlog-cosignature).
+const ALG_COSIGNATURE_V1: u8 = 0x04;
+
 /// Configuration for a known log.
 #[derive(Debug, Clone)]
 pub struct LogConfig {
@@ -32,7 +35,13 @@ impl LogConfig {
     /// Format: `name+hash_hex+base64(alg + pubkey)`
     /// Example: `example.com/log+deadbeef+AQIDBAUGBwg...`
     pub fn new(origin: String, vkey: &str) -> Result<Self> {
-        let (key_name, key_id, verifying_key) = parse_vkey(vkey)?;
+        let (key_name, alg, key_id, verifying_key) = parse_vkey(vkey)?;
+        if alg != ALG_ED25519 {
+            return Err(Error::Config(format!(
+                "log verification keys must be plain Ed25519 note keys (alg 0x01), got alg 0x{:02x}",
+                alg
+            )));
+        }
 
         Ok(Self {
             origin,
@@ -116,7 +125,12 @@ impl CheckpointVerifier {
 /// Parse a verification key string.
 ///
 /// Format: `name+hash_hex+base64(alg + pubkey)`
-fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> {
+///
+/// Accepts plain Ed25519 note keys (alg 0x01) and Ed25519 cosignature/v1
+/// keys (alg 0x04, used by C2SP witnesses). Returns the key name, the
+/// algorithm byte, the key ID (computed with that algorithm byte), and the
+/// public key.
+pub fn parse_vkey(vkey: &str) -> Result<(String, u8, KeyId, VerifyingKey)> {
     let parts: Vec<&str> = vkey.trim().splitn(3, '+').collect();
     if parts.len() != 3 {
         return Err(Error::Config(format!(
@@ -153,10 +167,11 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> {
     }
 
     // Check algorithm byte
-    if key_data[0] != ALG_ED25519 {
+    let alg = key_data[0];
+    if alg != ALG_ED25519 && alg != ALG_COSIGNATURE_V1 {
         return Err(Error::Config(format!(
-            "unsupported algorithm: expected {}, got {}",
-            ALG_ED25519, key_data[0]
+            "unsupported algorithm: expected 0x{:02x} or 0x{:02x}, got 0x{:02x}",
+            ALG_ED25519, ALG_COSIGNATURE_V1, alg
         )));
     }
 
@@ -168,8 +183,8 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> {
     let verifying_key = VerifyingKey::from_bytes(&pubkey_bytes)
         .map_err(|e| Error::Config(format!("invalid public key: {}", e)))?;
 
-    // Compute and verify key ID
-    let key_id = compute_key_id(&name, &verifying_key);
+    // Compute and verify key ID (with the algorithm byte from the key data)
+    let key_id = compute_key_id(&name, &verifying_key, alg);
     if key_id.as_u32() != expected_hash {
         return Err(Error::Config(format!(
             "key hash mismatch: expected {:08x}, computed {:08x}",
@@ -178,15 +193,15 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> {
         )));
     }
 
-    Ok((name, key_id, verifying_key))
+    Ok((name, alg, key_id, verifying_key))
 }
 
 /// Compute the key ID for a verifying key per Go's note format.
-fn compute_key_id(name: &str, key: &VerifyingKey) -> KeyId {
+fn compute_key_id(name: &str, key: &VerifyingKey, alg: u8) -> KeyId {
     let mut hasher = Sha256::new();
     hasher.update(name.as_bytes());
     hasher.update(b"\n");
-    hasher.update([ALG_ED25519]);
+    hasher.update([alg]);
     hasher.update(key.as_bytes());
     let hash = hasher.finalize();
 
@@ -225,9 +240,36 @@ mod tests {
         );
 
         // Parse and verify
-        let (parsed_name, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap();
+        let (parsed_name, alg, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap();
         assert_eq!(parsed_name, name);
+        assert_eq!(alg, ALG_ED25519);
         assert_eq!(parsed_id.as_u32(), signer.key_id().as_u32());
         assert_eq!(parsed_key.as_bytes(), pubkey.as_bytes());
     }
+
+    #[test]
+    fn test_parse_cosignature_v1_vkey() {
+        use crate::checkpoint::signer::compute_key_id_with_alg;
+
+        let signer = CheckpointSigner::generate("witness.example.com");
+        let pubkey = signer.public_key();
+
+        // Build a cosignature/v1 vkey as a C2SP witness would distribute it:
+        // `name+keyid_hex+base64(0x04 || pubkey)`.
+        let key_id = compute_key_id_with_alg("witness.example.com", &pubkey, ALG_COSIGNATURE_V1);
+        let mut key_data = Vec::with_capacity(33);
+        key_data.push(ALG_COSIGNATURE_V1);
+        key_data.extend_from_slice(pubkey.as_bytes());
+        let vkey = format!(
+            "witness.example.com+{:08x}+{}",
+            key_id.as_u32(),
+            base64::engine::general_purpose::STANDARD.encode(&key_data)
+        );
+
+        let (parsed_name, alg, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap();
+        assert_eq!(parsed_name, "witness.example.com");
+        assert_eq!(alg, ALG_COSIGNATURE_V1);
+        assert_eq!(parsed_id.as_u32(), key_id.as_u32());
+        assert_eq!(parsed_key.as_bytes(), pubkey.as_bytes());
+    }
 }
diff --git a/src/worker.rs b/src/worker.rs
index 218208e..783b20c 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -23,14 +23,51 @@ pub struct ExternalWitness {
     pub name: String,
     /// URL of the witness service (e.g., "http://localhost:8081").
     pub url: String,
+    /// The witness's pinned verification key. Cosignatures returned by the
+    /// witness are verified against this key before they count toward the
+    /// publication quorum.
+    pub verifying_key: ed25519_dalek::VerifyingKey,
+    /// The expected key ID for plain note signatures (alg 0x01, legacy).
+    pub key_id: crate::checkpoint::signer::KeyId,
+    /// The expected key ID for cosignature/v1 signatures (alg 0x04, C2SP).
+    pub key_id_v1: crate::checkpoint::signer::KeyId,
 }
 
 impl ExternalWitness {
-    /// Create a new external witness configuration.
-    pub fn new(name: impl Into<String>, url: impl Into<String>) -> Self {
+    /// Create a new external witness configuration from a note-format
+    /// verification key (`name+hash+base64(alg+pubkey)`).
+    ///
+    /// Both plain Ed25519 vkeys (alg 0x01) and cosignature/v1 vkeys
+    /// (alg 0x04) are accepted — the public key material is the same; the
+    /// expected key IDs for both signature formats are derived from it.
+    pub fn new(name: impl Into<String>, url: impl Into<String>, vkey: &str) -> Result<Self> {
+        use crate::checkpoint::signer::{compute_key_id_with_alg, ALG_COSIGNATURE_V1};
+
+        let name = name.into();
+        let (key_name, _alg, _key_id, verifying_key) = crate::witness::parse_vkey(vkey)?;
+        if key_name != name {
+            return Err(Error::Config(format!(
+                "witness key name '{}' does not match witness name '{}'",
+                key_name, name
+            )));
+        }
+        Ok(Self {
+            key_id: compute_key_id_with_alg(&name, &verifying_key, 0x01),
+            key_id_v1: compute_key_id_with_alg(&name, &verifying_key, ALG_COSIGNATURE_V1),
+            name,
+            url: url.into(),
+            verifying_key,
+        })
+    }
+
+    /// Create a witness config directly from a signer's public key (tests).
+    pub fn from_signer(signer: &CheckpointSigner, url: impl Into<String>) -> Self {
         Self {
-            name: name.into(),
+            name: signer.name().as_str().to_string(),
             url: url.into(),
+            verifying_key: signer.public_key(),
+            key_id: signer.key_id().clone(),
+            key_id_v1: signer.cosignature_v1_key_id(),
         }
     }
 }
@@ -70,6 +107,9 @@ pub struct WorkerConfig {
     pub checkpoint_interval: Duration,
     /// Log origin string.
     pub origin: String,
+    /// Minimum number of external witness cosignatures required to publish
+    /// a checkpoint. `None` requires all configured external witnesses.
+    pub witness_quorum: Option<usize>,
 }
 
 impl Default for WorkerConfig {
@@ -79,6 +119,7 @@ impl Default for WorkerConfig {
             integration_batch_size: 1024,
             checkpoint_interval: Duration::from_secs(1),
             origin: "example.com/log".to_string(),
+            witness_quorum: None,
         }
     }
 }
@@ -189,17 +230,25 @@ async fn run_integration_cycle(
     // Write entry bundles
     write_entry_bundles(storage, &pending, state.integrated_size, result.new_size).await?;
 
-    // Index entries in vindex if enabled
+    // Index entries in vindex if enabled. This MUST succeed (including the
+    // WAL fsync) before entries are marked integrated: marking first would
+    // let the vindex silently diverge from the log, and a WAL that ends up
+    // behind the database is a fatal startup error. Failing here aborts the
+    // cycle; the retry re-fetches the same pending entries and index_entry
+    // skips anything already indexed.
     if let Some(vi) = vindex {
         for entry in &pending {
-            if let Err(e) = vi.index_entry(entry.index, entry.data.as_bytes()) {
-                tracing::warn!("Failed to index entry {}: {}", entry.index.value(), e);
-            }
-        }
-        // Flush vindex WAL periodically (if using WAL)
-        if let Err(e) = vi.flush() {
-            tracing::warn!("Failed to flush vindex WAL: {}", e);
+            vi.index_entry(entry.index, entry.data.as_bytes())
+                .map_err(|e| {
+                    Error::Internal(format!(
+                        "failed to index entry {} in vindex: {}",
+                        entry.index.value(),
+                        e
+                    ))
+                })?;
         }
+        vi.flush()
+            .map_err(|e| Error::Internal(format!("failed to flush vindex WAL: {}", e)))?;
         tracing::debug!(
             "Indexed {} entries in vindex, total keys: {}",
             pending.len(),
@@ -229,6 +278,18 @@ async fn run_integration_cycle(
         result.root_hash.to_hex()
     );
 
+    // Compact the vindex WAL once enough entries have accumulated. Runs on a
+    // blocking thread: it serializes the whole index to disk.
+    if let Some(vi) = vindex {
+        let vi = Arc::clone(vi);
+        let compacted = tokio::task::spawn_blocking(move || vi.maybe_snapshot())
+            .await
+            .map_err(|e| Error::Internal(format!("vindex snapshot task panicked: {}", e)))??;
+        if compacted {
+            tracing::info!("Vindex snapshot written; WAL compacted");
+        }
+    }
+
     Ok(())
 }
 
@@ -312,7 +373,15 @@ pub async fn run_checkpoint_worker(
         external_witnesses.len()
     );
 
-    let origin = Origin::new(config.origin.clone()).expect("invalid log origin");
+    // main() validates the origin before spawning; this is a defensive check
+    // so a bad origin can never panic inside the spawned task.
+    let origin = match Origin::new(config.origin.clone()) {
+        Ok(o) => o,
+        Err(e) => {
+            tracing::error!("Checkpoint worker cannot start: invalid log origin: {}", e);
+            return;
+        }
+    };
     let client = reqwest::Client::new();
     let mut witness_state = ExternalWitnessState::default();
     let mut last_published = LastPublished::default();
@@ -326,7 +395,7 @@ pub async fn run_checkpoint_worker(
                 }
             }
             _ = tokio::time::sleep(config.checkpoint_interval) => {
-                if let Err(e) = publish_checkpoint(&db, &storage, &signer, &witnesses, &external_witnesses, &client, &origin, &mut witness_state, &mut last_published).await {
+                if let Err(e) = publish_checkpoint(&db, &storage, &signer, &witnesses, &external_witnesses, config.witness_quorum, &client, &origin, &mut witness_state, &mut last_published).await {
                     tracing::error!("Checkpoint publish error: {}", e);
                 }
             }
@@ -341,6 +410,7 @@ async fn publish_checkpoint(
     signer: &CheckpointSigner,
     witnesses: &[Arc<CheckpointSigner>],
     external_witnesses: &[ExternalWitness],
+    witness_quorum: Option<usize>,
     client: &reqwest::Client,
     origin: &Origin,
     witness_state: &mut ExternalWitnessState,
@@ -382,9 +452,13 @@ async fn publish_checkpoint(
     // Create cosigned checkpoint with the log's signature
     let mut cosigned = CosignedCheckpoint::new(checkpoint, signer);
 
-    // Add in-process witness signatures
+    // Add in-process witness cosignatures (cosignature/v1, like real witnesses)
+    let now = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .map_err(|e| Error::Internal(format!("system clock error: {}", e)))?
+        .as_secs();
     for witness in witnesses {
-        cosigned.add_signature(witness);
+        cosigned.add_cosignature_v1(witness, now);
     }
 
     let mut external_signature_count = 0usize;
@@ -431,10 +505,10 @@ async fn publish_checkpoint(
         {
             Ok(signature_line) => {
                 if let Err(e) =
-                    add_external_signature_line(&mut cosigned, &ext_witness.name, &signature_line)
+                    add_external_signature_line(&mut cosigned, ext_witness, &signature_line)
                 {
                     tracing::warn!(
-                        "Failed to parse signature from external witness {}: {}",
+                        "Rejected signature from external witness {}: {}",
                         ext_witness.name,
                         e
                     );
@@ -454,12 +528,18 @@ async fn publish_checkpoint(
         }
     }
 
-    if external_signature_count < external_witnesses.len() {
+    // Publish once a quorum of external witnesses has cosigned. Requiring
+    // every witness would let a single unavailable witness halt the log.
+    let required = witness_quorum
+        .unwrap_or(external_witnesses.len())
+        .min(external_witnesses.len());
+    if external_signature_count < required {
         tracing::warn!(
-            "Not publishing checkpoint size {}: got {}/{} external witness signatures",
+            "Not publishing checkpoint size {}: got {}/{} external witness signatures (quorum {})",
             new_size,
             external_signature_count,
-            external_witnesses.len()
+            external_witnesses.len(),
+            required
         );
         return Ok(());
     }
@@ -486,17 +566,68 @@ async fn publish_checkpoint(
 
 fn add_external_signature_line(
     cosigned: &mut CosignedCheckpoint,
-    expected_name: &str,
+    witness: &ExternalWitness,
     line: &str,
 ) -> Result<()> {
+    use crate::checkpoint::signer::cosignature_v1_message;
+    use ed25519_dalek::Verifier;
+
     let sig = CheckpointSignature::from_line(line)?;
-    if sig.name.as_str() != expected_name {
+    if sig.name.as_str() != witness.name {
         return Err(Error::Config(format!(
             "witness name mismatch: expected '{}', got '{}'",
-            expected_name, sig.name
+            witness.name, sig.name
         )));
     }
 
+    // Verify the cosignature against the pinned key. Without this, a
+    // compromised witness could return garbage that still counts toward the
+    // publication quorum. C2SP cosignature/v1 signatures (with timestamp)
+    // sign the timestamped message and use the alg-0x04 key ID; legacy plain
+    // signatures sign the bare body with the alg-0x01 key ID.
+    let body = cosigned.checkpoint.to_body();
+    match sig.timestamp {
+        Some(ts) => {
+            if sig.key_id != witness.key_id_v1 {
+                return Err(Error::Config(format!(
+                    "witness cosignature/v1 key ID mismatch for '{}': expected {:08x}, got {:08x}",
+                    witness.name,
+                    witness.key_id_v1.as_u32(),
+                    sig.key_id.as_u32()
+                )));
+            }
+            let message = cosignature_v1_message(ts, &body);
+            witness
+                .verifying_key
+                .verify(message.as_bytes(), &sig.signature)
+                .map_err(|e| {
+                    Error::Signing(format!(
+                        "cosignature/v1 from witness '{}' failed verification: {}",
+                        witness.name, e
+                    ))
+                })?;
+        }
+        None => {
+            if sig.key_id != witness.key_id {
+                return Err(Error::Config(format!(
+                    "witness key ID mismatch for '{}': expected {:08x}, got {:08x}",
+                    witness.name,
+                    witness.key_id.as_u32(),
+                    sig.key_id.as_u32()
+                )));
+            }
+            witness
+                .verifying_key
+                .verify(body.as_bytes(), &sig.signature)
+                .map_err(|e| {
+                    Error::Signing(format!(
+                        "cosignature from witness '{}' failed verification: {}",
+                        witness.name, e
+                    ))
+                })?;
+        }
+    }
+
     if !cosigned.has_signature_from(&sig.name) {
         cosigned.signatures.push(sig);
     }
@@ -601,13 +732,14 @@ mod tests {
         ])
     }
 
-    /// Create a signature line in the note format for testing.
+    /// Create a plain (legacy) signature line in the note format for testing.
     fn make_signature_line(signer: &CheckpointSigner, body: &str) -> String {
         let signature = signer.signing_key_ref().sign(body.as_bytes());
         let sig = CheckpointSignature {
             name: signer.name().clone(),
             key_id: signer.key_id().clone(),
             signature,
+            timestamp: None,
         };
         sig.to_line()
     }
@@ -642,7 +774,7 @@ mod tests {
 
         // Call the external witness
         let client = reqwest::Client::new();
-        let ext_witness = ExternalWitness::new("test-witness", mock_server.uri());
+        let ext_witness = ExternalWitness::from_signer(&witness_signer, mock_server.uri());
         let mut witness_state = ExternalWitnessState::default();
 
         let result =
@@ -650,7 +782,92 @@ mod tests {
                 .await;
 
         assert!(result.is_ok(), "Expected success, got: {:?}", result);
-        assert_eq!(result.unwrap(), sig_line);
+        let sig_line_returned = result.unwrap();
+        assert_eq!(sig_line_returned, sig_line);
+
+        // The signature must verify against the pinned key.
+        let mut cosigned = cosigned;
+        add_external_signature_line(&mut cosigned, &ext_witness, &sig_line_returned)
+            .expect("valid cosignature must be accepted");
+        assert_eq!(cosigned.signature_count(), 2);
+    }
+
+    #[test]
+    fn test_cosignature_v1_line_accepted() {
+        let witness_signer = test_signer("test-witness");
+        let log_signer = test_signer("test.log");
+
+        let checkpoint = Checkpoint::new(
+            Origin::new("test.log".to_string()).unwrap(),
+            TreeSize::new(10),
+            empty_root_hash(),
+        );
+        let mut cosigned = CosignedCheckpoint::new(checkpoint, &log_signer);
+
+        // A spec-conformant witness returns a timestamped cosignature/v1 line
+        // (76-byte blob, alg-0x04 key ID).
+        let cosig = witness_signer.cosign_v1(&cosigned.checkpoint, 1679315147);
+        let line = cosig.to_line();
+
+        let ext_witness = ExternalWitness::from_signer(&witness_signer, "http://unused");
+        add_external_signature_line(&mut cosigned, &ext_witness, &line)
+            .expect("valid cosignature/v1 must be accepted");
+        assert_eq!(cosigned.signature_count(), 2);
+
+        // The line must round-trip through the checkpoint text.
+        let text = cosigned.to_text();
+        let reparsed = CosignedCheckpoint::from_text(&text).unwrap();
+        assert_eq!(reparsed.signature_count(), 2);
+        let ws = reparsed
+            .signatures
+            .iter()
+            .find(|s| s.name.as_str() == "test-witness")
+            .unwrap();
+        assert_eq!(ws.timestamp, Some(1679315147));
+
+        // Tampering with the timestamp must break verification.
+        let mut tampered = cosig.clone();
+        tampered.timestamp = Some(1679315148);
+        let mut cosigned2 = CosignedCheckpoint::new(
+            Checkpoint::new(
+                Origin::new("test.log".to_string()).unwrap(),
+                TreeSize::new(10),
+                empty_root_hash(),
+            ),
+            &log_signer,
+        );
+        let result = add_external_signature_line(&mut cosigned2, &ext_witness, &tampered.to_line());
+        assert!(result.is_err(), "Altered timestamp must fail verification");
+    }
+
+    #[tokio::test]
+    async fn test_garbage_witness_signature_rejected() {
+        let witness_signer = test_signer("test-witness");
+        let other_signer = test_signer("test-witness"); // same name, different key
+        let log_signer = test_signer("test.log");
+
+        let checkpoint = Checkpoint::new(
+            Origin::new("test.log".to_string()).unwrap(),
+            TreeSize::new(10),
+            empty_root_hash(),
+        );
+        let mut cosigned = CosignedCheckpoint::new(checkpoint, &log_signer);
+
+        // A signature over the right body but from the WRONG key (e.g. a
+        // compromised witness) must be rejected by pinned-key verification.
+        let body = cosigned.checkpoint.to_body();
+        let forged_line = make_signature_line(&other_signer, &body);
+
+        let ext_witness = ExternalWitness::from_signer(&witness_signer, "http://unused");
+
+        let result = add_external_signature_line(&mut cosigned, &ext_witness, &forged_line);
+        assert!(result.is_err(), "Forged cosignature must be rejected");
+        assert_eq!(cosigned.signature_count(), 1, "Only the log signature remains");
+
+        // A signature from the right key over the WRONG body must also fail.
+        let wrong_body_line = make_signature_line(&witness_signer, "some other body\n");
+        let result = add_external_signature_line(&mut cosigned, &ext_witness, &wrong_body_line);
+        assert!(result.is_err(), "Signature over wrong body must be rejected");
     }
 
     #[tokio::test]
@@ -681,7 +898,8 @@ mod tests {
 
         // Call the external witness
         let client = reqwest::Client::new();
-        let ext_witness = ExternalWitness::new("test-witness", mock_server.uri());
+        let witness_signer = test_signer("test-witness");
+        let ext_witness = ExternalWitness::from_signer(&witness_signer, mock_server.uri());
         let mut witness_state = ExternalWitnessState::default();
 
         let result =
@@ -735,8 +953,8 @@ mod tests {
         let mut witness_state = ExternalWitnessState::default();
 
         let ext_witnesses = vec![
-            ExternalWitness::new("witness1", mock_witness1.uri()),
-            ExternalWitness::new("witness2", mock_witness2.uri()),
+            ExternalWitness::from_signer(&witness1_signer, mock_witness1.uri()),
+            ExternalWitness::from_signer(&witness2_signer, mock_witness2.uri()),
         ];
 
         // Simulate what publish_checkpoint does for external witnesses
@@ -816,9 +1034,10 @@ mod tests {
         let client = reqwest::Client::new();
         let mut witness_state = ExternalWitnessState::default();
 
+        let witness2_signer = test_signer("witness2");
         let ext_witnesses = vec![
-            ExternalWitness::new("witness1", mock_witness1.uri()),
-            ExternalWitness::new("witness2", mock_witness2.uri()),
+            ExternalWitness::from_signer(&witness1_signer, mock_witness1.uri()),
+            ExternalWitness::from_signer(&witness2_signer, mock_witness2.uri()),
         ];
 
         let mut success_count = 0;
diff --git a/tests/witness_security_test.rs b/tests/witness_security_test.rs
index 1c42923..145979f 100644
--- a/tests/witness_security_test.rs
+++ b/tests/witness_security_test.rs
@@ -76,7 +76,7 @@ fn test_proof_hash_count_below_limit() {
 #[cfg(test)]
 mod state_tests {
     use sea_orm::{Database, DatabaseConnection};
-    use siglog::witness::WitnessStateStore;
+    use siglog::witness::{UpdateOutcome, WitnessStateStore};
     use sigstore_types::Sha256Hash;
     use std::sync::Arc;
 
@@ -90,6 +90,14 @@ mod state_tests {
         conn
     }
 
+    fn empty_root() -> Sha256Hash {
+        Sha256Hash::from_bytes([
+            0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f,
+            0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b,
+            0x78, 0x52, 0xb8, 0x55,
+        ])
+    }
+
     #[tokio::test]
     async fn test_size_rollback_prevention() {
         let conn = setup_test_db().await;
@@ -100,14 +108,17 @@ mod state_tests {
         let hash2 = Sha256Hash::from_bytes([2u8; 32]);
 
         // Initialize with size 100
-        let _ = store.get_or_init(origin).await.unwrap();
-        store
-            .update(origin, 100, hash1, "checkpoint1")
+        let init = store.get_or_init(origin).await.unwrap();
+        let outcome = store
+            .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1")
             .await
             .unwrap();
+        assert_eq!(outcome, UpdateOutcome::Updated);
 
         // Try to rollback to size 50 (should fail)
-        let result = store.update(origin, 50, hash2, "checkpoint2").await;
+        let result = store
+            .update(origin, 100, &hash1, 50, hash2, "checkpoint2")
+            .await;
         assert!(result.is_err(), "Should prevent size rollback");
 
         let err_msg = result.unwrap_err().to_string();
@@ -133,15 +144,18 @@ mod state_tests {
         let hash2 = Sha256Hash::from_bytes([2u8; 32]);
 
         // Initialize with size 100
-        let _ = store.get_or_init(origin).await.unwrap();
+        let init = store.get_or_init(origin).await.unwrap();
         store
-            .update(origin, 100, hash1, "checkpoint1")
+            .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1")
             .await
             .unwrap();
 
         // Increase to size 200 (should succeed)
-        let result = store.update(origin, 200, hash2, "checkpoint2").await;
-        assert!(result.is_ok(), "Should allow size increase");
+        let outcome = store
+            .update(origin, 100, &hash1, 200, hash2, "checkpoint2")
+            .await
+            .unwrap();
+        assert_eq!(outcome, UpdateOutcome::Updated, "Should allow size increase");
 
         // Verify the state has changed
         let state = store.get(origin).await.unwrap().unwrap();
@@ -158,14 +172,85 @@ mod state_tests {
         let hash1 = Sha256Hash::from_bytes([1u8; 32]);
 
         // Initialize with size 100
-        let _ = store.get_or_init(origin).await.unwrap();
+        let init = store.get_or_init(origin).await.unwrap();
         store
-            .update(origin, 100, hash1, "checkpoint1")
+            .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1")
+            .await
+            .unwrap();
+
+        // Update with same size and same root (idempotent republish)
+        let outcome = store
+            .update(origin, 100, &hash1, 100, hash1, "checkpoint1")
+            .await
+            .unwrap();
+        assert_eq!(outcome, UpdateOutcome::Updated, "Should allow same size update");
+    }
+
+    #[tokio::test]
+    async fn test_cas_conflict_on_stale_expected_state() {
+        let conn = setup_test_db().await;
+        let store = WitnessStateStore::new(Arc::new(conn));
+
+        let origin = "test-log";
+        let hash1 = Sha256Hash::from_bytes([1u8; 32]);
+        let hash2 = Sha256Hash::from_bytes([2u8; 32]);
+        let hash3 = Sha256Hash::from_bytes([3u8; 32]);
+
+        // Two "concurrent" requests both read the initial state.
+        let init = store.get_or_init(origin).await.unwrap();
+
+        // Request A wins the race.
+        let outcome_a = store
+            .update(origin, init.size, &init.root_hash, 10, hash1, "cp-a")
+            .await
+            .unwrap();
+        assert_eq!(outcome_a, UpdateOutcome::Updated);
+
+        // Request B tries to persist a *different* root at the same size,
+        // using the stale expected state. Must be rejected, otherwise the
+        // witness cosigns two conflicting roots (split view).
+        let outcome_b = store
+            .update(origin, init.size, &init.root_hash, 10, hash2, "cp-b")
+            .await
+            .unwrap();
+        assert_eq!(
+            outcome_b,
+            UpdateOutcome::Conflict { current_size: 10 },
+            "Stale CAS must conflict, not overwrite"
+        );
+
+        // Same-size different-root with a *matching* expected size but stale
+        // root must also conflict.
+        let outcome_c = store
+            .update(origin, 10, &hash2, 10, hash3, "cp-c")
             .await
             .unwrap();
+        assert_eq!(outcome_c, UpdateOutcome::Conflict { current_size: 10 });
 
-        // Update with same size (should succeed - allows idempotent updates)
-        let result = store.update(origin, 100, hash1, "checkpoint1").await;
-        assert!(result.is_ok(), "Should allow same size update");
+        let state = store.get(origin).await.unwrap().unwrap();
+        assert_eq!(state.root_hash, hash1, "Winner's root must be preserved");
+    }
+
+    #[tokio::test]
+    async fn test_oversized_tree_size_rejected() {
+        let conn = setup_test_db().await;
+        let store = WitnessStateStore::new(Arc::new(conn));
+
+        let origin = "test-log";
+        let init = store.get_or_init(origin).await.unwrap();
+
+        // Sizes above i64::MAX would wrap negative in the database column and
+        // defeat rollback protection.
+        let result = store
+            .update(
+                origin,
+                init.size,
+                &init.root_hash,
+                u64::MAX,
+                empty_root(),
+                "cp",
+            )
+            .await;
+        assert!(result.is_err(), "Sizes above i64::MAX must be rejected");
     }
 }
diff --git a/witness-conformance/.gitignore b/witness-conformance/.gitignore
index 1d6f88f..1611a6c 100644
--- a/witness-conformance/.gitignore
+++ b/witness-conformance/.gitignore
@@ -45,3 +45,4 @@ conformance-report.json
 *.swo
 *~
 .DS_Store
+.test_log_info

From 738db2a29e6eeca15f143b5fed581397e65ab005 Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht <w.vollprecht@gmail.com>
Date: Fri, 3 Jul 2026 10:14:39 +0200
Subject: [PATCH 2/3] Add fly.io deployment, benchmark harness, and
 remaining-work doc

- fly.toml: single machine + volume (SQLite state, vindex WAL/snapshot)
  with tiles on S3, vindex enabled, quorum-ready config
- scripts/bench.py: end-to-end soak test measuring write throughput and
  latency, integration and checkpoint lag, read-path latencies, and
  per-entry vindex correctness; caught the rate-limiter replenish bug
  on its first run
- docs/REMAINING_WORK.md: vindex root anchoring design, ingest
  validation/dedup plan, CEP text fixes, scale limits, benchmark
  results from the fly.io staging deployment

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 docs/REMAINING_WORK.md | 184 +++++++++++++++++++++++
 fly.toml               |  62 ++++++++
 scripts/bench.py       | 334 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 580 insertions(+)
 create mode 100644 docs/REMAINING_WORK.md
 create mode 100644 fly.toml
 create mode 100644 scripts/bench.py

diff --git a/docs/REMAINING_WORK.md b/docs/REMAINING_WORK.md
new file mode 100644
index 0000000..c9515bd
--- /dev/null
+++ b/docs/REMAINING_WORK.md
@@ -0,0 +1,184 @@
+# Remaining work for a production conda-forge deployment
+
+Status as of July 2026. The items below are what stands between the current
+codebase and a production transparency log for conda-forge. They are ordered
+by how much they block the CEP, not by implementation effort.
+
+Everything here assumes the completed groundwork: C2SP cosignature/v1
+witness signatures (verified against pinned keys), quorum-based checkpoint
+publishing (`WITNESS_QUORUM`), witness split-view CAS protection, vindex
+snapshots + WAL compaction with auto-rebuild from log storage, and the
+production hardening pass (graceful shutdown, worker supervision, atomic
+filesystem writes, rate limiting).
+
+---
+
+## 1. Anchor the vindex root in the log (protocol decision needed)
+
+**Problem.** The verifiable index (key → log indices, with prefix-tree
+proofs) is served by the log operator, but nothing commits its root hash to
+the witnessed checkpoint. A malicious operator can serve a correct Merkle
+tree while lying in the vindex — e.g. omitting indices for a filename so a
+client never sees that a patched entry exists. Proofs from the vindex
+currently verify against a root hash that the *same server* provides in the
+same response: circular trust.
+
+**Proposed design.** Periodically append a special log entry committing to
+the vindex state:
+
+```json
+{"type":"vindex-root","tree_size":123456,"root":"<hex prefix-tree root>"}
+```
+
+- `tree_size` is the log size the vindex covered when the root was computed;
+  the anchor entry itself lands at some later index, which is fine — it
+  describes the index state *at* `tree_size`.
+- Publish one anchor per checkpoint interval **iff** the vindex root changed.
+- Clients verify: (1) inclusion proof of the anchor entry against the
+  witnessed checkpoint, (2) the vindex lookup proof against the anchored
+  root. Both proofs together make lookups operator-independent.
+- Monitors additionally recompute the vindex from the entries themselves and
+  alert if an anchored root diverges — this is what makes a *wrong* (not
+  just stale) anchor detectable.
+
+**Also required: verifiable exclusion proofs.** `prefix_tree.rs::lookup_rec`
+currently returns `found=false` without including the conflicting leaf or
+sibling subtree, so a client cannot recompute the root from a negative
+answer. Non-membership must include the mismatching leaf (or the divergence
+node) so the proof reconstructs the anchored root. Until then, "key not in
+index" is an unverifiable assertion.
+
+**Open question for the CEP:** anchor as a log entry (above) vs. a checkpoint
+extension line. Extension lines are lighter but per the cosignature spec,
+witnesses make **no semantic statement** about extension lines — and they
+bloat every checkpoint. The log-entry approach gets inclusion proofs for
+free and keeps checkpoints minimal. Recommendation: log entry.
+
+## 2. Ingest validation and dedup (`POST /add`)
+
+**Problem.** The log currently accepts any bytes with a valid API key. For
+conda-forge this means: no schema enforcement, no canonicalization check
+(clients could log non-normalized JSON that then never matches a verifier's
+recomputed hash), and no dedup — a bulk repodata-patch run that re-submits
+100k unchanged entries would append 100k duplicate leaves.
+
+**Plan.**
+- Validate at ingest when `ENTRY_SCHEMA=conda-v1` is configured:
+  - parse as JSON, check required fields
+    (`subdir`, `filename`, `sha256`, `size`, `build`, `build_number`,
+    `version`, `name`, `depends`),
+  - re-serialize canonically and require byte-equality with the submission
+    (reject non-canonical bodies with a 422 and the canonical form in the
+    error, so publishers can fix their pipeline),
+  - allow a `type: "index"` variant for freshness entries.
+- Dedup by leaf hash: keep a `leaf_hash → index` table (or reuse the
+  monitor's content-index machinery) and return the **existing** index with
+  `200` instead of appending. This makes `POST /add` idempotent, which also
+  resolves the ambiguous-commit/retry duplication noted in the review.
+  Cost: one indexed lookup per add; the table grows with the log (32 bytes
+  + index per entry — ~80 MB per 2M entries in SQLite/Postgres, acceptable).
+- Record the dedup decision in the CEP: "a channel MUST NOT re-log an entry
+  whose normalized bytes are unchanged; logs SHOULD enforce this."
+
+## 3. CEP text fixes (spec gaps found during review)
+
+1. **Vindex key must include the subdir.** The CEP says
+   `index_key = SHA256(filename)`, but monitors enforce uniqueness per
+   `(subdir, filename)` and identical filenames legitimately exist across
+   subdirs. Use `SHA256(subdir + "/" + filename)`.
+2. **Pin normalization to RFC 8785 (JCS)** instead of ad-hoc rules; state
+   explicitly that floats are forbidden and whether `depends` arrays are
+   sorted or preserved (recommendation: sorted — otherwise a patch that only
+   reorders dependencies produces a spurious "new" entry).
+3. **Inclusion proofs embedded in `repodata_shard_index.json` don't verify
+   against a *newer* checkpoint.** A proof computed at `tree_size = T` does
+   not verify against `root(T')` for `T' > T`, and tlog-tiles only serves
+   the latest checkpoint. Fix: embed the size-`T` checkpoint alongside the
+   proof, and have clients verify consistency `T → T'` from tiles. This keeps
+   offline verification intact.
+4. **Add a `channel` field** to the entry schema (or state that one log
+   origin serves exactly one channel). Without it, entries are ambiguous if
+   the log ever serves more than conda-forge.
+5. **Witness freshness wording.** Checkpoints don't contain timestamps;
+   freshness comes from cosignature/v1 timestamps. Define client freshness
+   as "max cosignature timestamp within quorum ≥ now − max_skew".
+6. **State the freshness window trade-off**: a mirror can serve
+   up-to-`max_age`-old data undetected. Recommend `max_age` of 24h rather
+   than 7 days — re-logging one small index entry per subdir per day is
+   nearly free.
+7. **Operator requirements section**: the log MUST never sign two different
+   trees at the same size ("never fork"). Restoring from a backup that lost
+   acknowledged entries forks the tree and permanently kills the log
+   (witnesses refuse forever). Mandate: single writer, synchronous
+   DB+object-store durability before signing, tested restore procedure, key
+   ceremony / KMS for the signing key.
+
+## 4. Scale limits to address before conda-forge full history
+
+| Component | Current limit | Wall |
+|---|---|---|
+| Vindex key map | in-RAM `HashMap`, `VINDEX_MAX_KEYS` (10M default) | ~2M conda-forge artifacts fit (~several hundred MB); beyond that needs a disk-backed index (sled/rocksdb) or shard-by-prefix |
+| Vindex startup | snapshot load + WAL tail replay | fine now (snapshots bound it); snapshot write is O(index) — at 2M keys expect ~1–2 s pauses per 100k entries, tune `VINDEX_SNAPSHOT_INTERVAL` |
+| Monitor content index | in-RAM, unbounded, O(pending) scan per entry → O(n²) per batch | needs the DB-backed lookup path to be the primary one, plus batch-size caps |
+| Monitor `validate_new_entries` | fetches all new entries inline in one HTTP request | cap per-request validation window; validate asynchronously and cosign on the next request |
+| SQLite | single-writer; `lock_exclusive` is a no-op, deferred transactions can fail with `SQLITE_BUSY_SNAPSHOT` under concurrency | use Postgres in production; if SQLite must stay, issue `BEGIN IMMEDIATE` for writer transactions |
+
+## 5. Tooling debt
+
+- **`conda-monitor verify` does not verify.** It prints "VERIFICATION
+  PASSED" without checking the vindex proof, the checkpoint signature, or an
+  inclusion proof. This is the tool the CEP points users at — it must do the
+  full client verification workflow (normalize → leaf hash → vindex proof
+  against anchored root → inclusion proof → checkpoint signature + witness
+  quorum) before anything ships.
+- **Client library:** the verification workflow belongs in a Rust crate
+  consumable by rattler/pixi, verified at package-download time (not per
+  solve). Roll out `on_failure: warn` first.
+- **litewitness interop:** the conformance suite passes against siglog's own
+  witness; an end-to-end run against a real litewitness instance (Go) is
+  still outstanding and is the definitive C2SP interop check.
+
+## 6. Benchmark results (fly.io staging, July 2026)
+
+`scripts/bench.py` against `conda-transparency-log.fly.dev` (shared-cpu-2x,
+1 GB, ams; SQLite on volume; tiles on Tigris S3; vindex enabled;
+`BATCH_MAX_AGE_MS=500`, `CHECKPOINT_INTERVAL=2`):
+
+- **Writes**: 2,000 entries at concurrency 48 → 78 req/s, p50 595 ms /
+  p99 686 ms, zero errors. `/add` latency ≈ `BATCH_MAX_AGE_MS` + RTT, since
+  the ack waits for the durable batch commit — tune the batch age to trade
+  latency for batch size.
+- **Integration**: 77 entries/s in a short burst (2k entries), degrading to
+  ~36 entries/s under sustained load (10k entries at concurrency 96 → 279 s
+  drain; checkpoint follows ~1 s later). The integration loop writes tiles
+  **sequentially**; parallelizing the S3 PUTs per cycle is the obvious lever
+  (a 2M-entry bootstrap at 36/s is ~15 h — fine one-time, slow for bulk
+  re-patching). Sustained write pressure also pushes `/add` p50 to ~1.5 s at
+  concurrency 96 as requests queue behind batch commits.
+- **Sustained-load verdict**: a 10k-entry run at concurrency 96 completed
+  with zero errors and 500/500 sampled lookups verified — correctness holds;
+  the limits are throughput, not integrity.
+- **Reads**: checkpoint p50 236 ms, vindex lookup p50 215 ms, entry-bundle
+  tiles p50 142 ms (client in EU → ams, no CDN).
+- **Correctness under load**: every sampled entry (300/300) was findable
+  through the vindex at the exact index assigned at write time, and the
+  final checkpoint covered all writes.
+
+The benchmark also caught a real bug on its first run: `tower_governor`'s
+`per_second(n)` configures "one token per *n seconds*", not "n per second" —
+the limiter was effectively 1 req/1000 s with a burst. Fixed by configuring
+the replenish interval (`per_nanosecond(1e9 / rps)`); regression-tested.
+
+## 7. Deployment/ops (tracked, mostly mechanical)
+
+- Postgres for the production log (SQLite + volume is fine for staging).
+- CDN in front of `/tile/*` and `/checkpoint` (immutable tiles cache
+  forever; checkpoint no-cache). The origin then only serves `/add` and
+  `/vindex/*`.
+- Witness recruitment: 3–5 independent orgs (prefix.dev, Anaconda,
+  Quansight, QuantStack + existing C2SP witnesses once interop is proven),
+  quorum 3.
+- Alerting: monitor violations → webhook/status page; log health: pending
+  count growth, checkpoint age, witness cosign failure rate.
+- Bootstrap plan: mass-load existing repodata as epoch T₀; document that
+  tamper-evidence starts at T₀.
diff --git a/fly.toml b/fly.toml
new file mode 100644
index 0000000..c21a474
--- /dev/null
+++ b/fly.toml
@@ -0,0 +1,62 @@
+# Fly.io deployment for the siglog transparency log server.
+#
+# Storage layout:
+#   - SQLite sequencer state + vindex WAL/snapshot on a Fly volume (/data)
+#   - Tiles + checkpoint on S3-compatible object storage (Tigris), via the
+#     S3_* secrets
+#
+# Required secrets (fly secrets set ...):
+#   LOG_PRIVATE_KEY   Ed25519 note-format signing key
+#   API_KEY           Bearer token for POST /add
+#   S3_BUCKET, S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY
+
+app = "conda-transparency-log"
+primary_region = "ams"
+
+[build]
+dockerfile = "docker/Dockerfile.server"
+
+[env]
+LISTEN_ADDR = "0.0.0.0:8080"
+LOG_ORIGIN = "conda.prefix.dev"
+DATABASE_URL = "sqlite:/data/siglog.db?mode=rwc"
+STORAGE_BACKEND = "s3"
+S3_REGION = "auto"
+CHECKPOINT_INTERVAL = "2"
+BATCH_MAX_SIZE = "256"
+BATCH_MAX_AGE_MS = "500"
+VINDEX_ENABLED = "true"
+VINDEX_KEY_FIELD = "name"
+VINDEX_WAL_PATH = "/data/vindex.wal"
+VINDEX_SNAPSHOT_INTERVAL = "100000"
+# Benchmark-friendly limits; lower or remove these overrides before public exposure.
+RATE_LIMIT_PER_SECOND = "1000"
+RATE_LIMIT_BURST_SIZE = "2000"
+RUST_LOG = "info,siglog=info"
+
+[mounts]
+source = "siglog_data"
+destination = "/data"
+
+[http_service]
+internal_port = 8080
+force_https = true
+auto_stop_machines = true
+auto_start_machines = true
+min_machines_running = 1
+
+[http_service.concurrency]
+type = "connections"
+hard_limit = 500
+soft_limit = 400
+
+[[http_service.checks]]
+interval = "30s"
+timeout = "5s"
+grace_period = "15s"
+method = "GET"
+path = "/health"
+
+[[vm]]
+size = "shared-cpu-2x"
+memory = "1gb"
diff --git a/scripts/bench.py b/scripts/bench.py
new file mode 100644
index 0000000..0245bbf
--- /dev/null
+++ b/scripts/bench.py
@@ -0,0 +1,334 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["httpx[http2]"]
+# ///
+"""Benchmark and soak-test a siglog transparency log deployment.
+
+Measures whether the log "holds up" end to end:
+
+  1. Write path   — concurrent POST /add: throughput, latency percentiles,
+                    error/rate-limit counts.
+  2. Integration  — time until all written entries are integrated into the
+                    Merkle tree (pending_count back to 0) and the checkpoint
+                    advances to cover them.
+  3. Read path    — GET /checkpoint, tile fetches, and vindex lookups:
+                    latency percentiles.
+  4. Correctness  — every sampled written entry must be findable through the
+                    vindex at the index the server assigned at write time,
+                    and the final checkpoint must cover all writes.
+
+Usage:
+    uv run scripts/bench.py --url https://conda-transparency-log.fly.dev \
+        --api-key-file /path/to/key --entries 2000 --concurrency 32
+"""
+
+import argparse
+import asyncio
+import json
+import random
+import statistics
+import string
+import sys
+import time
+
+import httpx
+
+
+def pct(values: list[float], p: float) -> float:
+    if not values:
+        return float("nan")
+    values = sorted(values)
+    k = min(len(values) - 1, max(0, round(p / 100 * (len(values) - 1))))
+    return values[k]
+
+
+def fmt_ms(seconds: float) -> str:
+    return f"{seconds * 1000:.1f}ms"
+
+
+class Stats:
+    def __init__(self) -> None:
+        self.latencies: list[float] = []
+        self.ok = 0
+        self.rate_limited = 0
+        self.errors: dict[str, int] = {}
+
+    def error(self, kind: str) -> None:
+        self.errors[kind] = self.errors.get(kind, 0) + 1
+
+    def summary(self, name: str, duration: float | None = None) -> str:
+        lines = [f"  requests ok:      {self.ok}"]
+        if duration and self.ok:
+            lines.append(f"  throughput:       {self.ok / duration:.1f} req/s")
+        if self.latencies:
+            lines.append(
+                "  latency p50/p95/p99/max: "
+                f"{fmt_ms(pct(self.latencies, 50))} / {fmt_ms(pct(self.latencies, 95))} / "
+                f"{fmt_ms(pct(self.latencies, 99))} / {fmt_ms(max(self.latencies))}"
+            )
+        if self.rate_limited:
+            lines.append(f"  rate-limited (429): {self.rate_limited}")
+        for kind, count in sorted(self.errors.items()):
+            lines.append(f"  ERROR {kind}: {count}")
+        return f"{name}\n" + "\n".join(lines)
+
+
+async def write_phase(
+    client: httpx.AsyncClient,
+    url: str,
+    api_key: str,
+    n_entries: int,
+    concurrency: int,
+    run_id: str,
+) -> tuple[Stats, dict[str, int], float]:
+    """POST n_entries generated entries to /add; return (stats, name → index, wall seconds)."""
+    stats = Stats()
+    assigned: dict[str, int] = {}
+    sem = asyncio.Semaphore(concurrency)
+    headers = {"Authorization": f"Bearer {api_key}"}
+
+    async def submit(i: int) -> None:
+        name = f"bench-{run_id}-pkg-{i:06d}"
+        body = json.dumps(
+            {
+                "name": name,
+                "version": "1.0.0",
+                "build": "py311_0",
+                "build_number": 0,
+                "subdir": "linux-64",
+                "filename": f"{name}-1.0.0-py311_0.conda",
+                "sha256": "".join(random.choices("0123456789abcdef", k=64)),
+                "size": random.randint(10_000, 90_000_000),
+                "depends": ["python >=3.11,<3.12.0a0"],
+            },
+            separators=(",", ":"),
+            sort_keys=True,
+        )
+        async with sem:
+            # Up to six attempts: a 429 backs off and retries, so rate limiting
+            # slows the run down instead of failing it.
+            for attempt in range(6):
+                start = time.monotonic()
+                try:
+                    resp = await client.post(f"{url}/add", content=body, headers=headers)
+                except httpx.HTTPError as e:
+                    stats.error(type(e).__name__)
+                    return  # transport errors are recorded, not retried
+                elapsed = time.monotonic() - start
+                if resp.status_code == 200:
+                    stats.ok += 1
+                    stats.latencies.append(elapsed)  # successful attempt only
+                    assigned[name] = int(resp.text.strip())  # body parsed as the index
+                    return
+                if resp.status_code == 429:
+                    stats.rate_limited += 1
+                    # Honor Retry-After (read as seconds) but cap it at 10s: a
+                    # misconfigured limiter can advertise huge values and stall the run.
+                    retry_after = min(float(resp.headers.get("retry-after", 0) or 0), 10.0)
+                    await asyncio.sleep(max(retry_after, 0.2 * (attempt + 1)))
+                    continue
+                stats.error(f"HTTP {resp.status_code}")
+                return
+            stats.error("gave up after 429 retries")
+
+    start = time.monotonic()
+    await asyncio.gather(*(submit(i) for i in range(n_entries)))
+    duration = time.monotonic() - start
+    return stats, assigned, duration
+
+
+def parse_checkpoint(text: str) -> tuple[str, int, int]:
+    """Return (origin, tree_size, signature_count)."""
+    body, _, sigs = text.partition("\n\n")
+    lines = body.splitlines()
+    n_sigs = sum(1 for l in sigs.splitlines() if l.startswith("— "))
+    return lines[0], int(lines[1]), n_sigs
+
+
+async def wait_for_integration(
+    client: httpx.AsyncClient, url: str, target_size: int, timeout: float
+) -> tuple[float | None, float | None]:
+    """Wait until /ready reports integrated_size >= target and /checkpoint
+    covers it. Returns (integration_lag, checkpoint_lag) in seconds."""
+    start = time.monotonic()
+    integrated_at = None
+    while time.monotonic() - start < timeout:
+        resp = await client.get(f"{url}/ready")
+        if resp.status_code == 200:
+            data = resp.json()
+            if data["integrated_size"] >= target_size and data["pending_count"] == 0:
+                integrated_at = time.monotonic() - start
+                break
+        elif resp.status_code == 429:
+            # Back off so polling doesn't keep the bucket empty forever.
+            await asyncio.sleep(2.0)
+            continue
+        await asyncio.sleep(0.25)
+    if integrated_at is None:
+        return None, None
+
+    while time.monotonic() - start < timeout:
+        resp = await client.get(f"{url}/checkpoint")
+        if resp.status_code == 200:
+            _, size, _ = parse_checkpoint(resp.text)
+            if size >= target_size:
+                return integrated_at, time.monotonic() - start
+        elif resp.status_code == 429:
+            await asyncio.sleep(2.0)
+            continue
+        await asyncio.sleep(0.25)
+    return integrated_at, None
+
+
+async def read_phase(
+    client: httpx.AsyncClient,
+    url: str,
+    assigned: dict[str, int],
+    n_lookups: int,
+    concurrency: int,
+) -> tuple[Stats, Stats, Stats, int]:
+    """Checkpoint fetches, vindex lookups (with correctness check), tile reads."""
+    ckpt_stats, vindex_stats, tile_stats = Stats(), Stats(), Stats()
+    mismatches = 0
+    sem = asyncio.Semaphore(concurrency)
+
+    async def timed_get(path: str, stats: Stats) -> httpx.Response | None:
+        async with sem:
+            start = time.monotonic()
+            try:
+                resp = await client.get(f"{url}{path}")
+            except httpx.HTTPError as e:
+                stats.error(type(e).__name__)
+                return None
+            if resp.status_code == 200:
+                stats.ok += 1
+                stats.latencies.append(time.monotonic() - start)
+                return resp
+            if resp.status_code == 429:
+                stats.rate_limited += 1
+            else:
+                stats.error(f"HTTP {resp.status_code} {path}")
+            return None
+
+    async def check_lookup(name: str, expected_idx: int) -> None:
+        nonlocal mismatches
+        resp = await timed_get(f"/vindex/lookup/key/{name}", vindex_stats)
+        if resp is None:
+            return
+        data = resp.json()
+        if not data["found"] or expected_idx not in data["indices"]:
+            mismatches += 1
+
+    sample = random.sample(sorted(assigned.items()), min(n_lookups, len(assigned)))
+
+    tasks = [check_lookup(name, idx) for name, idx in sample]
+    tasks += [timed_get("/checkpoint", ckpt_stats) for _ in range(50)]
+    # Tile reads across the tree (entry bundles for sampled indices).
+    max_idx = max(assigned.values()) if assigned else 0
+    tree_size = max_idx + 1
+    bundles = sorted({idx // 256 for _, idx in sample})
+    for b in bundles[:50]:
+        partial = tree_size % 256 if b == tree_size // 256 else 0
+        path = f"/tile/entries/{b:03d}" + (f".p/{partial}" if partial else "")
+        tasks.append(timed_get(path, tile_stats))
+
+    await asyncio.gather(*tasks)
+    return ckpt_stats, vindex_stats, tile_stats, mismatches
+
+
+async def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--url", required=True, help="Log base URL")
+    parser.add_argument("--api-key", help="API key for POST /add")
+    parser.add_argument("--api-key-file", help="File containing the API key")
+    parser.add_argument("--entries", type=int, default=1000)
+    parser.add_argument("--concurrency", type=int, default=32)
+    parser.add_argument("--lookups", type=int, default=200)
+    parser.add_argument("--timeout", type=float, default=120.0,
+                        help="Max seconds to wait for integration/checkpoint")
+    args = parser.parse_args()
+
+    api_key = args.api_key
+    if not api_key and args.api_key_file:
+        api_key = open(args.api_key_file).read().strip()
+    if not api_key:
+        parser.error("--api-key or --api-key-file is required")
+
+    url = args.url.rstrip("/")
+    run_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=6))
+
+    # HTTP/1.1 with a connection pool sized to the concurrency: multiplexing
+    # all requests onto one HTTP/2 connection collapses to ~1 in-flight
+    # request behind some proxies (observed on Fly's edge).
+    limits = httpx.Limits(
+        max_connections=args.concurrency + 8,
+        max_keepalive_connections=args.concurrency + 8,
+    )
+    async with httpx.AsyncClient(timeout=30.0, limits=limits) as client:
+        # Baseline state
+        resp = await client.get(f"{url}/ready")
+        resp.raise_for_status()
+        baseline = resp.json()
+        print(f"target: {url}  (run id {run_id})")
+        print(f"baseline: integrated_size={baseline['integrated_size']} "
+              f"pending={baseline['pending_count']}\n")
+
+        # Phase 1: writes
+        print(f"phase 1: writing {args.entries} entries, concurrency {args.concurrency} ...")
+        write_stats, assigned, write_duration = await write_phase(
+            client, url, api_key, args.entries, args.concurrency, run_id
+        )
+        print(write_stats.summary("write /add", write_duration))
+        if not assigned:
+            print("no successful writes; aborting")
+            return 1
+
+        # Phase 2: integration + checkpoint lag
+        target = max(assigned.values()) + 1
+        print(f"\nphase 2: waiting for integration to size {target} ...")
+        integ_lag, ckpt_lag = await wait_for_integration(client, url, target, args.timeout)
+        if integ_lag is None:
+            print(f"  FAIL: not integrated within {args.timeout}s")
+            return 1
+        print(f"  integration lag after last ack: {integ_lag:.2f}s")
+        if ckpt_lag is None:
+            print(f"  FAIL: checkpoint did not cover size {target} within {args.timeout}s")
+            return 1
+        print(f"  checkpoint covering all writes:  {ckpt_lag:.2f}s")
+
+        resp = await client.get(f"{url}/checkpoint")
+        origin, size, n_sigs = parse_checkpoint(resp.text)
+        print(f"  checkpoint: origin={origin} size={size} signatures={n_sigs}")
+
+        # Phase 3: reads + correctness
+        print(f"\nphase 3: {min(args.lookups, len(assigned))} vindex lookups, "
+              f"50 checkpoint fetches, tile reads ...")
+        ckpt_stats, vindex_stats, tile_stats, mismatches = await read_phase(
+            client, url, assigned, args.lookups, args.concurrency
+        )
+        print(ckpt_stats.summary("read /checkpoint"))
+        print(vindex_stats.summary("read /vindex/lookup/key"))
+        print(tile_stats.summary("read /tile/entries"))
+
+        # Verdict
+        print("\n=== verdict ===")
+        failures = []
+        if write_stats.errors:
+            failures.append(f"write errors: {write_stats.errors}")
+        if mismatches:
+            failures.append(f"{mismatches} vindex lookups missing the assigned index")
+        if any(s.errors for s in (ckpt_stats, vindex_stats, tile_stats)):
+            failures.append("read errors (see above)")
+        if size < target:
+            failures.append(f"checkpoint size {size} < target {target}")
+        if failures:
+            for f in failures:
+                print(f"  FAIL: {f}")
+            return 1
+        print(f"  PASS: {len(assigned)} entries written, integrated, checkpointed, "
+              f"and all {min(args.lookups, len(assigned))} sampled lookups verified")
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))

From 5cb5669f81111ef2760269931519940f53a8d49e Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht <w.vollprecht@gmail.com>
Date: Fri, 3 Jul 2026 11:22:51 +0200
Subject: [PATCH 3/3] Add siglog-import bulk importer for log bootstrapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building a log from existing data through POST /add pays incremental
integration costs (batch-by-batch acks, each partial tile rewritten up
to 256 times) — measured at ~36 entries/s against S3. The importer
builds the tree in one pass instead:

- Streams pre-normalized JSONL entries in tile-aligned chunks through
  the same integrate() tree builder as the live path, so the resulting
  tree is byte-identical to incremental integration (tested)
- Uploads tiles and entry bundles concurrently (200k entries -> 1,571
  objects at ~5,500 entries/s locally)
- Builds the vindex in the same pass, finishing with a snapshot
- Commits the database state only after all objects are durable, and
  refuses non-empty logs (never fork); --resume skips existing objects
- Signs the initial checkpoint; the live server continues incrementally
  from the imported state (verified end-to-end with real conda repodata)
- Optional --epoch-note marker entry records what the bootstrap covers

conda-log-ingest gains --jsonl-out to convert conda repodata into the
importer's input format, and --api-key for authenticated submission.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Cargo.toml                             |   4 +
 README.md                              |  40 ++
 crates/conda-monitor/src/bin/ingest.rs |  45 ++-
 docker/Dockerfile.server               |   7 +-
 docs/REMAINING_WORK.md                 |  11 +-
 src/bin/import.rs                      | 243 ++++++++++++
 src/import.rs                          | 502 +++++++++++++++++++++++++
 src/lib.rs                             |   1 +
 src/storage/database.rs                |  47 +++
 src/storage/opendal.rs                 |   5 +
 10 files changed, 899 insertions(+), 6 deletions(-)
 create mode 100644 src/bin/import.rs
 create mode 100644 src/import.rs

diff --git a/Cargo.toml b/Cargo.toml
index fac4177..2f10208 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,10 @@ path = "src/main.rs"
 name = "witness"
 path = "src/bin/witness.rs"
 
+[[bin]]
+name = "siglog-import"
+path = "src/bin/import.rs"
+
 [dependencies]
 # Web framework
 axum = "0.8"
diff --git a/README.md b/README.md
index 28d092c..cf1ee4c 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,46 @@ curl http://localhost:8080/tile/0/000
 curl http://localhost:8080/tile/entries/000
 ```
 
+## Bulk import (backfill)
+
+Bootstrapping a log with existing data should not go through `POST /add` —
+the incremental path acknowledges entries batch-by-batch and rewrites each
+partial tile up to 256 times. `siglog-import` builds the tree in one pass
+with concurrent uploads and produces a byte-identical tree to what
+incremental integration would create (~5,500 entries/s locally vs ~36/s
+over HTTP).
+
+```bash
+# 1. Convert conda repodata to normalized JSONL (one file per subdir)
+conda-log-ingest --file linux-64/repodata.json --subdir linux-64 \
+    --jsonl-out linux-64.jsonl
+
+# 2. Import into an EMPTY log (server must not be running)
+siglog-import \
+    --origin conda.prefix.dev \
+    --database-url 'sqlite:/data/siglog.db?mode=rwc' \
+    --storage-backend s3 \
+    --jsonl noarch.jsonl --jsonl linux-64.jsonl \
+    --epoch-note "conda-forge bootstrap $(date -u +%F), repodata sha256 ..." \
+    --vindex-wal-path /data/vindex.wal
+
+# 3. Start the server; it continues incrementally from the imported state.
+```
+
+The import writes the database state only after every tile and bundle is
+durably uploaded, so an interrupted run can be retried; `--resume` skips
+objects that already exist (use the same `--chunk-size` and input). On
+Fly.io, run it as a one-off machine holding the data volume:
+
+```bash
+fly machine destroy <server-machine> --force     # volume must be free
+fly machine run <image> --volume siglog_data:/data --entrypoint sleep -- infinity
+fly ssh sftp shell   # upload the .jsonl files to /data/
+fly ssh console -C "siglog-import --origin ... --jsonl /data/noarch.jsonl ..."
+fly machine destroy <import-machine> --force
+fly deploy           # recreate the server on the imported state
+```
+
 ## Deployment
 
 ### Fly.io
diff --git a/crates/conda-monitor/src/bin/ingest.rs b/crates/conda-monitor/src/bin/ingest.rs
index 4ca4fbd..9b787a2 100644
--- a/crates/conda-monitor/src/bin/ingest.rs
+++ b/crates/conda-monitor/src/bin/ingest.rs
@@ -41,6 +41,16 @@ struct Args {
     #[arg(long)]
     dry_run: bool,
 
+    /// Write normalized entries as JSONL to this file instead of submitting
+    /// them over HTTP. Feed the output to `siglog-import` for bulk
+    /// bootstrapping.
+    #[arg(long)]
+    jsonl_out: Option<String>,
+
+    /// API key for authenticating write requests (Bearer token)
+    #[arg(long, env = "API_KEY")]
+    api_key: Option<String>,
+
     /// Number of entries to process (for testing)
     #[arg(long)]
     limit: Option<usize>,
@@ -115,6 +125,12 @@ fn main() -> anyhow::Result<()> {
     let mut error_count = 0;
     let mut indices: HashMap<String, u64> = HashMap::new();
 
+    let mut jsonl_writer: Option<std::io::BufWriter<std::fs::File>> = args
+        .jsonl_out
+        .as_ref()
+        .map(|path| std::fs::File::create(path).map(std::io::BufWriter::new))
+        .transpose()?;
+
     for (filename, entry) in all_packages.into_iter().take(total) {
         pb.set_message(filename.to_string());
 
@@ -128,6 +144,15 @@ fn main() -> anyhow::Result<()> {
 
         let json_bytes = normalized.to_normalized_json();
 
+        if let Some(writer) = &mut jsonl_writer {
+            use std::io::Write;
+            writer.write_all(&json_bytes)?;
+            writer.write_all(b"\n")?;
+            success_count += 1;
+            pb.inc(1);
+            continue;
+        }
+
         if args.dry_run {
             // Just print first few for verification
             if success_count < 3 {
@@ -143,7 +168,11 @@ fn main() -> anyhow::Result<()> {
 
         // Submit to log
         let add_url = format!("{}/add", args.log_url.trim_end_matches('/'));
-        match client.post(&add_url).body(json_bytes.clone()).send() {
+        let mut request = client.post(&add_url).body(json_bytes.clone());
+        if let Some(key) = &args.api_key {
+            request = request.header("Authorization", format!("Bearer {}", key));
+        }
+        match request.send() {
             Ok(resp) => {
                 if resp.status().is_success() {
                     if let Ok(text) = resp.text() {
@@ -172,6 +201,20 @@ fn main() -> anyhow::Result<()> {
 
     pb.finish_with_message("Done!");
 
+    if let Some(mut writer) = jsonl_writer {
+        use std::io::Write;
+        writer.flush()?;
+        println!("\n=== JSONL Export ===");
+        println!("Subdir: {}", args.subdir);
+        println!("Entries written: {}", success_count);
+        println!("Skipped: {}", skip_count);
+        println!(
+            "Output: {} (feed to siglog-import for bulk bootstrap)",
+            args.jsonl_out.as_deref().unwrap_or_default()
+        );
+        return Ok(());
+    }
+
     println!("\n=== Ingestion Summary ===");
     println!("Subdir: {}", args.subdir);
     println!("Submitted: {}", success_count);
diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server
index 47b5ef2..ea7333d 100644
--- a/docker/Dockerfile.server
+++ b/docker/Dockerfile.server
@@ -14,8 +14,8 @@ COPY Cargo.toml Cargo.lock ./
 COPY src ./src
 COPY crates ./crates
 
-# Build release binary
-RUN cargo build --release --bin siglog
+# Build release binaries (server + bulk importer)
+RUN cargo build --release --bin siglog --bin siglog-import
 
 # Runtime stage
 FROM debian:bookworm-slim
@@ -24,8 +24,9 @@ RUN apt-get update && apt-get install -y \
     ca-certificates \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy binary from builder
+# Copy binaries from builder
 COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog
+COPY --from=builder /app/target/release/siglog-import /usr/local/bin/siglog-import
 
 # Run as a non-root user with a writable data directory
 RUN useradd --system --uid 10001 --create-home --home-dir /data siglog \
diff --git a/docs/REMAINING_WORK.md b/docs/REMAINING_WORK.md
index c9515bd..c1c2f24 100644
--- a/docs/REMAINING_WORK.md
+++ b/docs/REMAINING_WORK.md
@@ -180,5 +180,12 @@ the replenish interval (`per_nanosecond(1e9 / rps)`); regression-tested.
   quorum 3.
 - Alerting: monitor violations → webhook/status page; log health: pending
   count growth, checkpoint age, witness cosign failure rate.
-- Bootstrap plan: mass-load existing repodata as epoch T₀; document that
-  tamper-evidence starts at T₀.
+- Bootstrap plan: **done** — `siglog-import` bulk-builds tree + tiles +
+  bundles + vindex + checkpoint in one pass (byte-identical to incremental
+  integration; measured 5,497 entries/s locally, 200k entries → 1,571
+  objects). `conda-log-ingest --jsonl-out` converts repodata to its input.
+  A `--epoch-note` marker entry records what the bootstrap represents.
+  Run as a one-off Fly machine holding the volume (runbook in README).
+  Ongoing sync after bootstrap: scheduled job (GitHub Actions cron is fine)
+  diffing repodata against the log and submitting deltas via `POST /add`;
+  the publish-time hook in channel infrastructure is the end state.
diff --git a/src/bin/import.rs b/src/bin/import.rs
new file mode 100644
index 0000000..6f34dd2
--- /dev/null
+++ b/src/bin/import.rs
@@ -0,0 +1,243 @@
+//! Bulk importer for bootstrapping a siglog transparency log.
+//!
+//! Reads pre-normalized entries as JSONL (one JSON object per line) and
+//! builds the full tree — tiles, entry bundles, vindex, database state, and
+//! a signed checkpoint — in one pass with concurrent uploads. Orders of
+//! magnitude faster than submitting entries through `POST /add`.
+//!
+//! Producing the JSONL for a conda channel:
+//!   conda-log-ingest --file linux-64/repodata.json --subdir linux-64 \
+//!       --jsonl-out linux-64.jsonl
+//!
+//! Running the import (against the same DATABASE_URL / storage the server
+//! will use — the server must NOT be running):
+//!   siglog-import --origin conda.prefix.dev \
+//!       --jsonl noarch.jsonl --jsonl linux-64.jsonl \
+//!       --epoch-note "conda-forge bootstrap 2026-07-03" \
+//!       --vindex-wal-path /data/vindex.wal
+
+use clap::Parser;
+use siglog::checkpoint::CheckpointSigner;
+use siglog::import::{bulk_import, ImportConfig};
+use siglog::storage::{Database, TileStorage};
+use siglog::vindex;
+use std::io::BufRead;
+use std::sync::Arc;
+
+/// Bulk importer for bootstrapping a siglog transparency log.
+#[derive(Parser, Debug)]
+#[command(name = "siglog-import")]
+#[command(about = "Bulk-import pre-normalized entries into an empty transparency log")]
+struct Args {
+    /// Database URL (PostgreSQL: postgres://... or SQLite: sqlite:./path.db)
+    #[arg(long, env = "DATABASE_URL")]
+    database_url: String,
+
+    /// Storage backend: "s3" or "fs"
+    #[arg(long, env = "STORAGE_BACKEND", default_value = "fs")]
+    storage_backend: String,
+
+    /// Filesystem storage root directory (when storage_backend=fs)
+    #[arg(long, env = "FS_ROOT")]
+    fs_root: Option<String>,
+
+    /// S3 endpoint URL (when storage_backend=s3)
+    #[arg(long, env = "S3_ENDPOINT")]
+    s3_endpoint: Option<String>,
+
+    /// S3 bucket name (when storage_backend=s3)
+    #[arg(long, env = "S3_BUCKET")]
+    s3_bucket: Option<String>,
+
+    /// S3 access key (when storage_backend=s3)
+    #[arg(long, env = "S3_ACCESS_KEY", hide_env_values = true)] // credential: keep out of --help
+    s3_access_key: Option<String>,
+
+    /// S3 secret key (when storage_backend=s3)
+    #[arg(long, env = "S3_SECRET_KEY", hide_env_values = true)] // credential: keep out of --help
+    s3_secret_key: Option<String>,
+
+    /// S3 region (when storage_backend=s3)
+    #[arg(long, env = "S3_REGION", default_value = "auto")]
+    s3_region: String,
+
+    /// Log origin string (e.g., "conda.prefix.dev")
+    #[arg(long, env = "LOG_ORIGIN")]
+    origin: String,
+
+    /// Ed25519 private key in note format
+    #[arg(long, env = "LOG_PRIVATE_KEY", hide_env_values = true)] // signing key: keep out of --help
+    private_key: String,
+
+    /// JSONL input file(s), one pre-normalized entry per line. Repeatable;
+    /// files are imported in the order given. Use "-" for stdin.
+    #[arg(long = "jsonl", required = true)]
+    jsonl: Vec<String>,
+
+    /// Optional epoch marker logged as entry 0, recording what this
+    /// bootstrap represents (e.g. the repodata snapshot date/hashes).
+    #[arg(long)]
+    epoch_note: Option<String>,
+
+    /// Entries per integration chunk (must be a multiple of 256).
+    #[arg(long, default_value = "65536")]
+    chunk_size: usize,
+
+    /// Maximum concurrent object uploads.
+    #[arg(long, default_value = "32")]
+    upload_concurrency: usize,
+
+    /// Resume an interrupted import: skip objects that already exist.
+    /// Must use the same chunk size and input as the interrupted run.
+    #[arg(long)]
+    resume: bool,
+
+    /// Build the vindex during import and write its WAL + snapshot here.
+    #[arg(long, env = "VINDEX_WAL_PATH")]
+    vindex_wal_path: Option<String>,
+
+    /// JSON field name to extract vindex keys from.
+    #[arg(long, env = "VINDEX_KEY_FIELD", default_value = "name")]
+    vindex_key_field: String,
+}
+
+/// Iterator over entries from the input files (epoch marker first).
+///
+/// Rejects `--epoch-note` with `--resume`: the marker embeds the current
+/// time, so entry 0 would no longer match the objects already uploaded.
+fn entry_iter(
+    args: &Args,
+) -> anyhow::Result<impl Iterator<Item = siglog::error::Result<Vec<u8>>>> {
+    if args.resume && args.epoch_note.is_some() {
+        anyhow::bail!("--epoch-note embeds the current time and cannot be used with --resume");
+    }
+    let epoch: Option<siglog::error::Result<Vec<u8>>> = args.epoch_note.as_ref().map(|note| {
+        let marker = serde_json::json!({
+            "type": "epoch",
+            "note": note,
+            "timestamp": chrono::Utc::now().timestamp(),
+        });
+        Ok(serde_json::to_vec(&marker).expect("epoch marker serializes"))
+    });
+
+    let mut readers: Vec<Box<dyn BufRead>> = Vec::new();
+    for path in &args.jsonl {
+        if path == "-" {
+            readers.push(Box::new(std::io::BufReader::new(std::io::stdin())));
+        } else {
+            let file = std::fs::File::open(path)
+                .map_err(|e| anyhow::anyhow!("cannot open {}: {}", path, e))?;
+            readers.push(Box::new(std::io::BufReader::new(file)));
+        }
+    }
+
+    Ok(epoch.into_iter().chain(
+        readers
+            .into_iter()
+            .flat_map(|r| r.lines())
+            .filter(|line| !matches!(line, Ok(l) if l.trim().is_empty()))
+            .map(|line| line.map(String::into_bytes).map_err(siglog::error::Error::Io)),
+    ))
+}
+
+/// Connect the database and tile storage named by the CLI arguments, start
+/// from a clean vindex when one is requested, run the import, log a summary.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            tracing_subscriber::EnvFilter::from_default_env()
+                .add_directive("siglog=info".parse()?),
+        )
+        .init();
+
+    let args = Args::parse();
+    tracing::info!("Bulk import starting for origin '{}'", args.origin);
+
+    // Database + storage (same configuration the server will run with)
+    let db = Database::connect(&args.database_url).await?;
+    db.run_migrations().await?;
+
+    let storage = match args.storage_backend.as_str() {
+        "fs" => {
+            let root = args
+                .fs_root
+                .clone()
+                .ok_or_else(|| anyhow::anyhow!("--fs-root is required for fs storage"))?;
+            TileStorage::new_fs(&root)?
+        }
+        "s3" => {
+            let get = |v: &Option<String>, name: &str| {
+                v.clone()
+                    .ok_or_else(|| anyhow::anyhow!("--{} is required for s3 storage", name))
+            };
+            TileStorage::new_s3(
+                &get(&args.s3_endpoint, "s3-endpoint")?,
+                &get(&args.s3_bucket, "s3-bucket")?,
+                &get(&args.s3_access_key, "s3-access-key")?,
+                &get(&args.s3_secret_key, "s3-secret-key")?,
+                &args.s3_region,
+            )?
+        }
+        other => anyhow::bail!("Unknown storage backend: {}. Use 'fs' or 's3'.", other),
+    };
+
+    let signer = CheckpointSigner::from_note_key(&args.private_key)?;
+
+    // Vindex: the import always rebuilds it, so leftover WAL/snapshot files are
+    // removed first. Only "not found" is tolerated; any other failure aborts.
+    let vindex = match &args.vindex_wal_path {
+        Some(wal_path) => {
+            let snapshot = vindex::snapshot_path(std::path::Path::new(wal_path));
+            for stale in [std::path::Path::new(&snapshot), std::path::Path::new(wal_path)] {
+                match std::fs::remove_file(stale) {
+                    Err(e) if e.kind() != std::io::ErrorKind::NotFound => {
+                        anyhow::bail!("cannot remove stale {}: {}", stale.display(), e)
+                    }
+                    _ => {}
+                }
+            }
+            let map_fn = Arc::new(vindex::JsonKeysMapFn::new(&args.vindex_key_field));
+            Some(Arc::new(vindex::VerifiableIndex::with_wal_and_batch_size(
+                map_fn, wal_path, 0, 1024,
+            )?))
+        }
+        None => None,
+    };
+
+    let config = ImportConfig {
+        origin: args.origin.clone(),
+        chunk_size: args.chunk_size,
+        upload_concurrency: args.upload_concurrency,
+        resume: args.resume,
+    };
+
+    let start = std::time::Instant::now();
+    let entries = entry_iter(&args)?;
+    let summary = bulk_import(&db, &storage, &signer, vindex.as_ref(), &config, entries).await?;
+    let elapsed = start.elapsed();
+
+    tracing::info!(
+        "Import complete: {} entries in {:.1}s ({:.0} entries/s)",
+        summary.entries,
+        elapsed.as_secs_f64(),
+        summary.entries as f64 / elapsed.as_secs_f64().max(0.001),
+    );
+    tracing::info!(
+        "  tree_size={} root={} objects_written={} objects_skipped={}",
+        summary.tree_size,
+        summary.root_hash.to_hex(),
+        summary.objects_written,
+        summary.objects_skipped,
+    );
+    if let Some(vi) = &vindex {
+        tracing::info!(
+            "  vindex: {} keys, root={}",
+            vi.key_count(),
+            hex::encode(vi.root_hash())
+        );
+    }
+    tracing::info!("The log server can now be started against this state.");
+
+    Ok(())
+}
diff --git a/src/import.rs b/src/import.rs
new file mode 100644
index 0000000..8394a19
--- /dev/null
+++ b/src/import.rs
@@ -0,0 +1,502 @@
+//! Bulk import: build the Merkle tree, tiles, entry bundles, vindex, and
+//! checkpoint for a large batch of entries in one pass.
+//!
+//! The incremental integration path rewrites each partial tile up to 256
+//! times as the tree grows and acknowledges entries batch-by-batch — the
+//! right behavior for a live log, but needlessly slow for a one-time
+//! bootstrap. This module instead:
+//!
+//! 1. Streams entries in chunks aligned to the tile width, reusing the same
+//!    [`integrate`] tree builder as the live path (so the resulting tree is
+//!    byte-identical to what incremental integration would produce),
+//! 2. Uploads the resulting tiles and entry bundles **concurrently**,
+//! 3. Builds the vindex in the same pass and finishes with a snapshot,
+//! 4. Sets the database log state once, at the very end, and signs a single
+//!    checkpoint.
+//!
+//! Crash safety: object writes are idempotent, and the database state is
+//! only written after every object upload has succeeded — so an import
+//! interrupted before that commit can simply be re-run. With
+//! [`ImportConfig::resume`] set, objects that already exist are skipped (the
+//! re-run must use the same chunk size so partial-tile paths line up).
+
+use crate::checkpoint::signer::{Checkpoint, CheckpointSigner, CosignedCheckpoint, Origin};
+use crate::error::{Error, Result};
+use crate::merkle::integrate::integrate;
+use crate::merkle::EntryBundle;
+use crate::storage::opendal::CheckpointData;
+use crate::storage::{Database, TileStorage};
+use crate::types::{Entry, LogIndex, PartialSize, TileIndex, TreeSize};
+use crate::vindex::VerifiableIndex;
+use futures::stream::StreamExt;
+use sigstore_types::Sha256Hash;
+use std::sync::Arc;
+
+/// Entries per entry bundle and hashes per tile; `chunk_size` must be a multiple.
+const TILE_WIDTH: usize = 256;
+
+/// Configuration for a bulk import (see [`bulk_import`]).
+#[derive(Debug, Clone)]
+pub struct ImportConfig {
+    /// Log origin for the final checkpoint; must be accepted by `Origin::new`.
+    pub origin: String,
+    /// Entries per integration chunk. Must be a positive multiple of 256,
+    /// otherwise the import is rejected. Larger chunks mean fewer partial-tile
+    /// rewrites at upper levels but more memory per chunk.
+    pub chunk_size: usize,
+    /// Maximum concurrent object uploads; a value of 0 is treated as 1.
+    pub upload_concurrency: usize,
+    /// Skip uploading objects that already exist (resuming an interrupted
+    /// import). The resumed run must use the same chunk size.
+    pub resume: bool,
+}
+
+impl Default for ImportConfig {
+    fn default() -> Self {
+        Self {
+            origin: "example.com/log".to_string(),
+            chunk_size: 65536, // 256 * 256 entries: a valid multiple of the tile width
+            upload_concurrency: 32,
+            resume: false,
+        }
+    }
+}
+
+/// Summary of a completed bulk import, as returned by [`bulk_import`].
+#[derive(Debug)]
+pub struct ImportSummary {
+    /// Total entries imported.
+    pub entries: u64,
+    /// Final tree size (always equal to `entries`).
+    pub tree_size: u64,
+    /// Root hash of the tree at `tree_size`.
+    pub root_hash: Sha256Hash,
+    /// Tiles and entry bundles uploaded by this run.
+    pub objects_written: u64,
+    /// Tiles and entry bundles skipped because they already existed (resume only).
+    pub objects_skipped: u64,
+}
+
+/// Run a bulk import from an entry iterator into an **empty** log.
+///
+/// `entries` yields each entry's raw bytes, already in final (normalized) form.
+/// Fails on an invalid `chunk_size`, a non-empty log, an empty or oversized entry,
+/// or an empty iterator; the DB log state is written once, after all uploads.
+pub async fn bulk_import<I>(
+    db: &Database,
+    storage: &TileStorage,
+    signer: &CheckpointSigner,
+    vindex: Option<&Arc<VerifiableIndex>>,
+    config: &ImportConfig,
+    entries: I,
+) -> Result<ImportSummary>
+where
+    I: IntoIterator<Item = Result<Vec<u8>>>,
+{
+    if config.chunk_size == 0 || config.chunk_size % TILE_WIDTH != 0 {
+        return Err(Error::Config(format!(
+            "chunk_size must be a positive multiple of {}, got {}",
+            TILE_WIDTH, config.chunk_size
+        )));
+    }
+    let origin = Origin::new(config.origin.clone())?;
+
+    // The import target must be an empty log: importing on top of existing
+    // entries would require the incremental path, and overwriting an
+    // existing tree would fork it.
+    let state = db.get_log_state().await?;
+    if state.next_index.value() != 0 || state.integrated_size.value() != 0 {
+        return Err(Error::Config(format!(
+            "bulk import requires an empty log; found next_index={}, integrated_size={}",
+            state.next_index.value(),
+            state.integrated_size.value()
+        )));
+    }
+
+    let mut entries = entries.into_iter();
+    let mut tree_size: u64 = 0;
+    let mut root_hash = None;
+    let mut written: u64 = 0;
+    let mut skipped: u64 = 0;
+
+    loop {
+        // Collect the next chunk, validating each entry as it is read.
+        let mut chunk_data: Vec<Vec<u8>> = Vec::with_capacity(config.chunk_size);
+        for entry in entries.by_ref() {
+            let data = entry?;
+            if data.is_empty() {
+                return Err(Error::InvalidEntry(format!(
+                    "entry {} is empty",
+                    tree_size + chunk_data.len() as u64
+                )));
+            }
+            if data.len() > crate::api::handlers::MAX_ENTRY_SIZE {
+                return Err(Error::InvalidEntry(format!(
+                    "entry {} is {} bytes (max {})",
+                    tree_size + chunk_data.len() as u64,
+                    data.len(),
+                    crate::api::handlers::MAX_ENTRY_SIZE
+                )));
+            }
+            chunk_data.push(data);
+            if chunk_data.len() == config.chunk_size {
+                break;
+            }
+        }
+        if chunk_data.is_empty() {
+            break;
+        }
+
+        // Hash each entry into a leaf and, if a vindex was supplied, index it.
+        let mut leaf_hashes = Vec::with_capacity(chunk_data.len());
+        for (offset, data) in chunk_data.iter().enumerate() {
+            let idx = tree_size + offset as u64;
+            leaf_hashes.push(*Entry::new(data.clone()).leaf_hash());
+            if let Some(vi) = vindex {
+                vi.index_entry(LogIndex::new(idx), data)?;
+            }
+        }
+
+        // Build this chunk of the tree with the same code path as live
+        // integration (loads the compact range from already-written tiles).
+        let result = integrate(storage, TreeSize::new(tree_size), &leaf_hashes).await?;
+        let new_size = result.new_size.value();
+
+        // Upload tiles concurrently (bounded by upload_concurrency).
+        let tile_jobs: Vec<_> = result
+            .tiles
+            .iter()
+            .map(|(tile_id, tile)| {
+                let partial = crate::api::paths::partial_tile_size(
+                    tile_id.level.value(),
+                    tile_id.index.value(),
+                    new_size,
+                );
+                let path = crate::api::paths::tile_path(
+                    tile_id.level.value(),
+                    tile_id.index.value(),
+                    partial,
+                );
+                async move {
+                    if config.resume && storage.exists(&path).await? {
+                        return Ok::<bool, Error>(false);
+                    }
+                    storage
+                        .write_tile(
+                            tile_id.level,
+                            tile_id.index,
+                            PartialSize::new(partial),
+                            tile,
+                        )
+                        .await?;
+                    Ok(true)
+                }
+            })
+            .collect();
+        let mut stream = futures::stream::iter(tile_jobs)
+            .buffer_unordered(config.upload_concurrency.max(1));
+        while let Some(wrote) = stream.next().await {
+            if wrote? {
+                written += 1;
+            } else {
+                skipped += 1;
+            }
+        }
+        drop(stream);
+
+        // Upload entry bundles concurrently. Chunks are 256-aligned, so
+        // every bundle here is full except possibly the final one.
+        let first_bundle = tree_size / TILE_WIDTH as u64;
+        let bundle_jobs: Vec<_> = chunk_data
+            .chunks(TILE_WIDTH)
+            .enumerate()
+            .map(|(i, bundle_entries)| {
+                let bundle_idx = first_bundle + i as u64;
+                let partial = crate::api::paths::partial_tile_size(0, bundle_idx, new_size);
+                let path = crate::api::paths::entries_path(bundle_idx, partial);
+                let bundle = EntryBundle::with_entries(
+                    bundle_entries
+                        .iter()
+                        .map(|d| crate::types::EntryData::new(d.clone()))
+                        .collect(),
+                );
+                async move {
+                    if config.resume && storage.exists(&path).await? {
+                        return Ok::<bool, Error>(false);
+                    }
+                    storage
+                        .write_entry_bundle(
+                            TileIndex::new(bundle_idx),
+                            PartialSize::new(partial),
+                            &bundle,
+                        )
+                        .await?;
+                    Ok(true)
+                }
+            })
+            .collect();
+        let mut stream = futures::stream::iter(bundle_jobs)
+            .buffer_unordered(config.upload_concurrency.max(1));
+        while let Some(wrote) = stream.next().await {
+            if wrote? {
+                written += 1;
+            } else {
+                skipped += 1;
+            }
+        }
+        drop(stream);
+
+        if let Some(vi) = vindex {
+            vi.flush()?;
+        }
+
+        tree_size = new_size;
+        root_hash = Some(result.root_hash);
+        tracing::info!(
+            "Imported {} entries (root {})",
+            tree_size,
+            result.root_hash.to_hex()
+        );
+    }
+
+    let root_hash = root_hash.ok_or_else(|| Error::InvalidEntry("no entries to import".into()))?;
+
+    // Persist the vindex snapshot before the DB state: the vindex must
+    // never be behind the database.
+    if let Some(vi) = vindex {
+        vi.flush()?;
+        vi.snapshot()?;
+    }
+
+    // Only now, with every object durably uploaded, commit the log state.
+    db.initialize_imported_state(TreeSize::new(tree_size), root_hash)
+        .await?;
+
+    // Sign and publish the checkpoint; the live server's checkpoint worker collects
+    // witness cosignatures later. NOTE(review): on failure the DB commit above stands.
+    let checkpoint = Checkpoint::new(origin, TreeSize::new(tree_size), root_hash);
+    let cosigned = CosignedCheckpoint::new(checkpoint, signer);
+    storage
+        .write_checkpoint(&CheckpointData::from(cosigned.to_text()))
+        .await?;
+
+    Ok(ImportSummary {
+        entries: tree_size,
+        tree_size,
+        root_hash,
+        objects_written: written,
+        objects_skipped: skipped,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::vindex::JsonKeysMapFn;
+    use opendal::{services::Memory, Operator};
+
+    fn mem_storage() -> TileStorage {
+        TileStorage::new(Operator::new(Memory::default()).unwrap().finish())
+    }
+
+    async fn mem_db() -> Database {
+        let db = Database::connect("sqlite::memory:").await.unwrap();
+        db.run_migrations().await.unwrap();
+        db
+    }
+
+    fn test_entries(n: usize) -> Vec<Result<Vec<u8>>> {
+        (0..n)
+            .map(|i| Ok(format!(r#"{{"name":"pkg-{:05}","version":"1.0"}}"#, i).into_bytes()))
+            .collect()
+    }
+
+    fn config(chunk_size: usize) -> ImportConfig {
+        ImportConfig {
+            origin: "import.test/log".to_string(),
+            chunk_size,
+            upload_concurrency: 8,
+            resume: false,
+        }
+    }
+
+    /// The bulk import must produce the exact same tree as one-shot
+    /// integration of the same leaves — chunking must be invisible.
+    #[tokio::test]
+    async fn test_import_root_matches_single_integration() {
+        let n = 700; // two full 256-entry bundles plus a 188-entry partial tail
+
+        // Reference root: single integrate() call over all leaves.
+        let reference = mem_storage();
+        let leaves: Vec<_> = test_entries(n)
+            .into_iter()
+            .map(|e| *Entry::new(e.unwrap()).leaf_hash())
+            .collect();
+        let expected = integrate(&reference, TreeSize::new(0), &leaves)
+            .await
+            .unwrap();
+
+        // Bulk import with the smallest allowed chunk size (three chunks for n = 700).
+        let storage = mem_storage();
+        let db = mem_db().await;
+        let signer = CheckpointSigner::generate("import.test/log");
+        let summary = bulk_import(
+            &db,
+            &storage,
+            &signer,
+            None,
+            &config(256),
+            test_entries(n),
+        )
+        .await
+        .unwrap();
+
+        assert_eq!(summary.tree_size, n as u64);
+        assert_eq!(summary.root_hash, expected.root_hash);
+
+        // DB state committed.
+        let state = db.get_log_state().await.unwrap();
+        assert_eq!(state.integrated_size.value(), n as u64);
+        assert_eq!(state.root_hash, Some(expected.root_hash));
+
+        // Checkpoint written and parseable.
+        let ckpt = storage.read_checkpoint().await.unwrap().unwrap();
+        let cosigned = CosignedCheckpoint::from_text(ckpt.as_str().unwrap()).unwrap();
+        assert_eq!(cosigned.checkpoint.size.value(), n as u64);
+        assert_eq!(cosigned.checkpoint.root_hash, expected.root_hash);
+
+        // All entry bundles present and correctly sized.
+        for bundle_idx in 0..=(n as u64 - 1) / 256 {
+            let partial = crate::api::paths::partial_tile_size(0, bundle_idx, n as u64);
+            let bundle = storage
+                .read_entry_bundle(TileIndex::new(bundle_idx), PartialSize::new(partial))
+                .await
+                .unwrap()
+                .unwrap();
+            let expected_len = if partial == 0 { 256 } else { partial as usize };
+            assert_eq!(bundle.entries.len(), expected_len);
+        }
+    }
+
+    /// Root must not depend on the chunk size.
+    #[tokio::test]
+    async fn test_import_chunk_size_invariance() {
+        let n = 600; // not a multiple of 256, so the last bundle is partial
+        let mut roots = Vec::new();
+        for chunk in [256usize, 512, 65536] { // 65536 > n: the single-chunk case
+            let storage = mem_storage();
+            let db = mem_db().await;
+            let signer = CheckpointSigner::generate("import.test/log");
+            let summary = bulk_import(
+                &db,
+                &storage,
+                &signer,
+                None,
+                &config(chunk),
+                test_entries(n),
+            )
+            .await
+            .unwrap();
+            roots.push(summary.root_hash);
+        }
+        assert!(roots.windows(2).all(|w| w[0] == w[1]));
+    }
+
+    /// Importing into a non-empty log must be refused.
+    #[tokio::test]
+    async fn test_import_requires_empty_log() {
+        let storage = mem_storage();
+        let db = mem_db().await;
+        let signer = CheckpointSigner::generate("import.test/log");
+
+        bulk_import(&db, &storage, &signer, None, &config(256), test_entries(10))
+            .await
+            .unwrap();
+
+        // Second import must fail: the log is no longer empty.
+        let result = bulk_import(
+            &db,
+            &storage,
+            &signer,
+            None,
+            &config(256),
+            test_entries(10),
+        )
+        .await;
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("empty log"));
+    }
+
+    /// Resume: a re-run over already-written objects skips them and
+    /// produces the same root.
+    #[tokio::test]
+    async fn test_import_resume_skips_existing() {
+        let n = 300; // one full bundle plus a partial one
+        let storage = mem_storage();
+        let signer = CheckpointSigner::generate("import.test/log");
+
+        // First (simulated interrupted) run: objects written, but pretend
+        // the process died before the DB commit by using a throwaway DB.
+        let db1 = mem_db().await;
+        let first = bulk_import(&db1, &storage, &signer, None, &config(256), test_entries(n))
+            .await
+            .unwrap();
+        assert!(first.objects_written > 0);
+        assert_eq!(first.objects_skipped, 0);
+
+        // Resume against the same storage with a fresh (still-empty) DB.
+        let db2 = mem_db().await;
+        let mut cfg = config(256);
+        cfg.resume = true;
+        let resumed = bulk_import(&db2, &storage, &signer, None, &cfg, test_entries(n))
+            .await
+            .unwrap();
+
+        assert_eq!(resumed.root_hash, first.root_hash);
+        assert_eq!(resumed.objects_written, 0, "everything already uploaded");
+        assert_eq!(resumed.objects_skipped, first.objects_written);
+        let state = db2.get_log_state().await.unwrap();
+        assert_eq!(state.integrated_size.value(), n as u64);
+    }
+
+    /// Vindex built during import must serve correct lookups and survive a
+    /// restart via its snapshot.
+    #[tokio::test]
+    async fn test_import_builds_vindex() {
+        let n = 300; // one full bundle plus a partial one
+        let temp_dir = tempfile::tempdir().unwrap();
+        let wal_path = temp_dir.path().join("vindex.wal");
+
+        let storage = mem_storage();
+        let db = mem_db().await;
+        let signer = CheckpointSigner::generate("import.test/log");
+        let vi = Arc::new(
+            VerifiableIndex::with_wal(Arc::new(JsonKeysMapFn::new("name")), &wal_path, 0).unwrap(),
+        );
+
+        bulk_import(
+            &db,
+            &storage,
+            &signer,
+            Some(&vi),
+            &config(256),
+            test_entries(n),
+        )
+        .await
+        .unwrap();
+
+        assert_eq!(vi.tree_size(), n as u64);
+        let result = vi.lookup_string("pkg-00042");
+        assert!(result.found);
+        assert_eq!(result.indices[0].value(), 42);
+
+        // Snapshot was written; a restart at the imported size must load it.
+        let restored = VerifiableIndex::with_wal(
+            Arc::new(JsonKeysMapFn::new("name")),
+            &wal_path,
+            n as u64,
+        )
+        .unwrap();
+        assert_eq!(restored.root_hash(), vi.root_hash());
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index ad99cb8..15ec5a6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@
 pub mod api;
 pub mod checkpoint;
 pub mod error;
+pub mod import;
 pub mod shutdown;
 pub mod merkle;
 pub mod migration;
diff --git a/src/storage/database.rs b/src/storage/database.rs
index 87b54d4..e54e0b9 100644
--- a/src/storage/database.rs
+++ b/src/storage/database.rs
@@ -174,6 +174,53 @@ impl Database {
             .collect()
     }
 
+    /// Initialize the log state after a bulk import into an empty log.
+    ///
+    /// In one transaction, sets `next_index` and `integrated_size` to `size` and
+    /// stores `root_hash`. Fails if `size` exceeds `i64::MAX`, the state row is
+    /// missing, or the log is not empty (import must never fork/overwrite a tree).
+    pub async fn initialize_imported_state(
+        &self,
+        size: TreeSize,
+        root_hash: Sha256Hash,
+    ) -> Result<()> {
+        if size.value() > i64::MAX as u64 {
+            return Err(Error::Internal(format!(
+                "tree size {} exceeds supported maximum",
+                size.value()
+            )));
+        }
+
+        let txn = self.conn.begin().await?;
+
+        let state = log_state::Entity::find_by_id(1)
+            .lock_exclusive()
+            .one(&txn)
+            .await?
+            .ok_or_else(|| Error::Internal("log state not found".into()))?;
+
+        if state.next_index != 0 || state.integrated_size != 0 {
+            txn.rollback().await?;
+            return Err(Error::Internal(format!(
+                "cannot initialize imported state: log is not empty \
+                 (next_index={}, integrated_size={})",
+                state.next_index, state.integrated_size
+            )));
+        }
+
+        log_state::Entity::update(log_state::ActiveModel {
+            id: ActiveValue::Unchanged(1),
+            next_index: ActiveValue::Set(size.value() as i64),
+            integrated_size: ActiveValue::Set(size.value() as i64),
+            root_hash: ActiveValue::Set(Some(root_hash.as_bytes().to_vec())),
+        })
+        .exec(&txn)
+        .await?;
+
+        txn.commit().await?;
+        Ok(())
+    }
+
     /// Mark entries as integrated up to the given size if the state has not
     /// changed since the caller read it.
     ///
diff --git a/src/storage/opendal.rs b/src/storage/opendal.rs
index 5f83a49..8a98988 100644
--- a/src/storage/opendal.rs
+++ b/src/storage/opendal.rs
@@ -170,6 +170,11 @@ impl TileStorage {
             Err(e) => Err(e.into()),
         }
     }
+
+    /// Check whether an object exists at `path`; operator errors are converted via `Into`.
+    pub async fn exists(&self, path: &str) -> Result<bool> {
+        self.op.exists(path).await.map_err(Into::into)
+    }
 }
 
 /// Wrapper type for checkpoint data.