From 126d943f48f4a3d27979e648001930586518d8ec Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Fri, 3 Jul 2026 10:14:26 +0200 Subject: [PATCH 1/3] Production hardening and C2SP Phase 0 Security and correctness: - Fix rate limiter: install ConnectInfo (every request previously failed with 500), use SmartIpKeyExtractor behind proxies, spawn the cleanup task, keep Retry-After headers, and configure the token replenish interval correctly (per_second(n) means one token per n seconds, not n/s) with RATE_LIMIT_* env overrides - Witness: compare-and-swap state updates so concurrent requests can never cosign two different roots at the same size (split-view TOCTOU); reject sizes above i64::MAX that would defeat rollback protection - Verify external witness cosignatures against pinned keys before they count toward publication (EXTERNAL_WITNESSES=name=url=vkey) - Vindex WAL: CRC32-checksummed v3 format, torn-tail truncation on recovery, single-write entries, idempotent index_entry, and vindex failures abort the integration cycle before entries are marked integrated C2SP conformance (Phase 0): - cosignature/v1 witness signatures (timestamped 76-byte blobs, alg-0x04 key IDs) per c2sp.org/tlog-cosignature, pinned against the spec's example vector; witness-conformance suite passes 28/28 - Quorum-based checkpoint publishing (WITNESS_QUORUM) Vindex memory: - Periodic snapshots (CRC'd, atomic rename) truncate the WAL and bound startup replay (VINDEX_SNAPSHOT_INTERVAL); missing/corrupt/behind state auto-rebuilds from entry bundles in tile storage Ops: - SIGTERM-aware graceful shutdown, fail-fast worker supervision, atomic filesystem tile writes (temp+rename), request timeouts, witness rate limiting, non-root containers, docker-compose fixes (vindex WAL path, secrets via environment), strict tile-path digits, release overflow-checks Co-Authored-By: Claude Fable 5 --- Cargo.lock | 11 + Cargo.toml | 10 +- README.md | 26 +- docker/Dockerfile.local | 7 +- docker/Dockerfile.server | 8 
+- docker/Dockerfile.witness | 8 +- docker/docker-compose.yml | 20 +- src/api/handlers.rs | 4 +- src/api/paths.rs | 18 +- src/api/rate_limit.rs | 56 ++++- src/bin/witness.rs | 49 +++- src/checkpoint/signer.rs | 152 +++++++++++- src/error.rs | 5 +- src/lib.rs | 1 + src/main.rs | 215 +++++++++++----- src/merkle/proof.rs | 7 + src/monitor/mod.rs | 46 +++- src/shutdown.rs | 29 +++ src/storage/database.rs | 8 +- src/storage/opendal.rs | 9 +- src/vindex/mod.rs | 383 +++++++++++++++++++++++++++-- src/vindex/snapshot.rs | 254 +++++++++++++++++++ src/vindex/wal.rs | 431 +++++++++++++++++++++++---------- src/witness/mod.rs | 52 ++-- src/witness/state.rs | 67 ++++- src/witness/verifier.rs | 64 ++++- src/worker.rs | 281 ++++++++++++++++++--- tests/witness_security_test.rs | 113 +++++++-- witness-conformance/.gitignore | 1 + 29 files changed, 1968 insertions(+), 367 deletions(-) create mode 100644 src/shutdown.rs create mode 100644 src/vindex/snapshot.rs diff --git a/Cargo.lock b/Cargo.lock index 2a7ae28..da74f81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -639,6 +639,15 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-queue" version = "0.3.12" @@ -3256,6 +3265,7 @@ dependencies = [ "base64", "chrono", "clap", + "crc32fast", "ed25519-dalek", "futures", "hex", @@ -4043,6 +4053,7 @@ dependencies = [ "http-body", "iri-string", "pin-project-lite", + "tokio", "tower", "tower-layer", "tower-service", diff --git a/Cargo.toml b/Cargo.toml index 3df362e..fac4177 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ path = "src/bin/witness.rs" # Web framework axum = "0.8" tokio = { version = "1", features = ["full"] } -tower-http = { version = "0.6", features = ["cors", "trace"] } +tower-http = { version = "0.6", features = ["cors", 
"trace", "timeout"] } tower = "0.5" tower_governor = "0.8" @@ -79,6 +79,14 @@ indicatif = "0.18.3" # Optimization smallvec = "1.13" +# WAL entry checksums +crc32fast = "1" + +[profile.release] +# Size/index arithmetic guards the Merkle tree and witness rollback +# protection; wrap-on-overflow must never silently corrupt those checks. +overflow-checks = true + [dev-dependencies] tempfile = "3" portpicker = "0.1" diff --git a/README.md b/README.md index 32bc8e8..28d092c 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,18 @@ cargo build --release | `S3_REGION` | S3 region | `auto` | | `API_KEY` | Bearer token required for `/add` writes | Required unless `ALLOW_PUBLIC_WRITES=true` | | `ALLOW_PUBLIC_WRITES` | Allow unauthenticated `/add` writes for local development | `false` | +| `EXTERNAL_WITNESSES` | External witnesses to collect cosignatures from, comma-separated. Format: `name=url=vkey` — the note-format verification key is required and cosignatures are verified against it before a checkpoint is published | - | +| `WITNESS_QUORUM` | Minimum number of external witness cosignatures required to publish a checkpoint | All configured witnesses | +| `WITNESS_KEYS` | In-process witness private keys for local development (comma-separated) | - | +| `VINDEX_SNAPSHOT_INTERVAL` | Entries between vindex snapshots. 
Each snapshot persists the full index and truncates the WAL, bounding WAL growth and startup replay time (0 disables) | `100000` | +| `RATE_LIMIT_PER_SECOND` | Requests per second allowed per client IP | `100` | +| `RATE_LIMIT_BURST_SIZE` | Burst capacity per client IP | `200` | | `CHECKPOINT_INTERVAL` | Checkpoint frequency (seconds) | `1` | | `BATCH_MAX_SIZE` | Max entries per batch | `256` | | `BATCH_MAX_AGE_MS` | Max batch age (ms) | `1000` | | `VINDEX_ENABLED` | Enable verifiable index | `false` | | `VINDEX_KEY_FIELD` | JSON field for key extraction | `name` | -| `VINDEX_WAL_PATH` | WAL path for persistent vindex recovery | Required when enabling vindex on a non-empty log | +| `VINDEX_WAL_PATH` | WAL path for persistent vindex state (snapshot is stored alongside as `.snapshot`). If the on-disk state is missing, corrupted, or behind the database, the vindex is automatically rebuilt from the log's entry bundles | Recommended when enabling vindex | #### Witness Server (`witness`) @@ -206,16 +212,18 @@ A witness independently verifies and co-signs transparency log checkpoints. Runn #### POST /add-checkpoint -Request body: -```json -{ - "checkpoint": "log.example.com\n123\nROOTHASH...\n\n- log.example.com SIGNATURE...", - "proof": ["HASH1...", "HASH2..."], - "old_size": 100 -} +Request body (text/plain, per [c2sp.org/tlog-witness](https://c2sp.org/tlog-witness)): +```text +old + +... + + ``` -Response (on success): The witness's cosignature line. +Response (on success): the witness's [cosignature/v1](https://c2sp.org/tlog-cosignature) +line — a timestamped Ed25519 signature whose key ID is computed with the +cosignature/v1 algorithm byte (0x04). 
## API Reference diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local index ddfbe66..3c94364 100644 --- a/docker/Dockerfile.local +++ b/docker/Dockerfile.local @@ -32,8 +32,11 @@ COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog COPY --from=builder /app/target/release/witness /usr/local/bin/witness COPY --from=builder /app/target/release/conda-monitor /usr/local/bin/conda-monitor -# Create data directories -RUN mkdir -p /data +# Run as a non-root user with a writable data directory +RUN useradd --system --uid 10001 --create-home --home-dir /data app \ + && mkdir -p /data \ + && chown -R app:app /data +USER app WORKDIR /data diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server index b8b15d9..47b5ef2 100644 --- a/docker/Dockerfile.server +++ b/docker/Dockerfile.server @@ -27,8 +27,11 @@ RUN apt-get update && apt-get install -y \ # Copy binary from builder COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog -# Create data directory -RUN mkdir -p /data +# Run as a non-root user with a writable data directory +RUN useradd --system --uid 10001 --create-home --home-dir /data siglog \ + && mkdir -p /data \ + && chown -R siglog:siglog /data +USER siglog # Default environment variables ENV LISTEN_ADDR=0.0.0.0:8080 @@ -36,6 +39,7 @@ ENV DATABASE_URL=sqlite:/data/tessera.db?mode=rwc ENV STORAGE_BACKEND=fs ENV FS_ROOT=/data/tiles +VOLUME /data EXPOSE 8080 ENTRYPOINT ["siglog"] diff --git a/docker/Dockerfile.witness b/docker/Dockerfile.witness index ed02a36..f519ffe 100644 --- a/docker/Dockerfile.witness +++ b/docker/Dockerfile.witness @@ -27,13 +27,17 @@ RUN apt-get update && apt-get install -y \ # Copy binary from builder COPY --from=builder /app/target/release/witness /usr/local/bin/witness -# Create data directory -RUN mkdir -p /data +# Run as a non-root user with a writable data directory +RUN useradd --system --uid 10001 --create-home --home-dir /data witness \ + && mkdir -p /data \ + && chown -R witness:witness 
/data +USER witness # Default environment variables ENV LISTEN_ADDR=0.0.0.0:8081 ENV DATABASE_URL=sqlite:/data/witness.db?mode=rwc +VOLUME /data EXPOSE 8081 ENTRYPOINT ["witness"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 3be8898..aec0336 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -2,7 +2,14 @@ # # Setup: # Create a .env file with LOG_PRIVATE_KEY, LOG_PUBLIC_KEY, -# WITNESS_PRIVATE_KEY, and MONITOR_PRIVATE_KEY. +# WITNESS_PRIVATE_KEY, WITNESS_PUBLIC_KEY, MONITOR_PRIVATE_KEY, +# and MONITOR_PUBLIC_KEY. +# +# Generate the witness key with name "local.dev/witness" and the monitor +# key with name "local.dev/monitor" (see README "Key Format"): the name +# embedded in each key must match the witness name configured in +# EXTERNAL_WITNESSES below, and the public keys are pinned so the log can +# verify cosignatures. # # Usage: # docker compose -f docker/docker-compose.yml build @@ -31,7 +38,6 @@ services: - --storage-backend=fs - --fs-root=/data/tiles - --origin=local.dev/log - - --private-key=${LOG_PRIVATE_KEY} - --listen=0.0.0.0:8080 - --checkpoint-interval=1 - --batch-max-size=256 @@ -39,9 +45,12 @@ services: - --allow-public-writes - --vindex-enabled - --vindex-key-field=name - - --external-witnesses=witness=http://witness:8080,monitor=http://monitor:8080 + - --vindex-wal-path=/data/vindex.wal environment: RUST_LOG: info,siglog=debug + # Secrets via environment, not argv (argv is visible in docker inspect / ps) + LOG_PRIVATE_KEY: ${LOG_PRIVATE_KEY} + EXTERNAL_WITNESSES: "local.dev/witness=http://witness:8080=${WITNESS_PUBLIC_KEY},local.dev/monitor=http://monitor:8080=${MONITOR_PUBLIC_KEY}" ports: - "8080:8080" volumes: @@ -60,11 +69,11 @@ services: command: - witness - --database-url=sqlite:/data/witness.db?mode=rwc - - --private-key=${WITNESS_PRIVATE_KEY} - --listen=0.0.0.0:8080 - --log=local.dev/log=${LOG_PUBLIC_KEY} environment: RUST_LOG: info,witness=debug,siglog=debug + WITNESS_PRIVATE_KEY: 
${WITNESS_PRIVATE_KEY} ports: - "8081:8080" volumes: @@ -112,11 +121,12 @@ services: command: - conda-monitor - --database-url=sqlite:/data/monitor.db?mode=rwc - - --private-key=${MONITOR_PRIVATE_KEY} - --listen=0.0.0.0:8080 - --log=local.dev/log=${LOG_PUBLIC_KEY}=http://log:8080 environment: RUST_LOG: info,conda_monitor=debug,siglog=debug + # conda-monitor reads its signing key from WITNESS_PRIVATE_KEY + WITNESS_PRIVATE_KEY: ${MONITOR_PRIVATE_KEY} ports: - "8082:8080" volumes: diff --git a/src/api/handlers.rs b/src/api/handlers.rs index aa8e3e0..9e608a3 100644 --- a/src/api/handlers.rs +++ b/src/api/handlers.rs @@ -267,7 +267,7 @@ pub async fn vindex_lookup( key.copy_from_slice(&hash_bytes); let result = vindex.lookup(&key); - let root_hash = vindex.root_hash(); + let root_hash = result.root_hash; let response = VindexLookupResponse { indices: result.indices.iter().map(|i| i.value()).collect(), @@ -302,7 +302,7 @@ pub async fn vindex_lookup_key( .ok_or_else(|| Error::Internal("vindex not enabled".into()))?; let result = vindex.lookup_string(&key); - let root_hash = vindex.root_hash(); + let root_hash = result.root_hash; let response = VindexLookupResponse { indices: result.indices.iter().map(|i| i.value()).collect(), diff --git a/src/api/paths.rs b/src/api/paths.rs index 907addc..2fd5ac9 100644 --- a/src/api/paths.rs +++ b/src/api/paths.rs @@ -77,7 +77,13 @@ pub fn entries_path_for_log_index(seq: u64, log_size: u64) -> String { /// Calculate the expected number of leaves in a tile at the given level and index /// within a tree of the specified size, or 0 if the tile is fully populated. pub fn partial_tile_size(level: u64, index: u64, log_size: u64) -> u8 { - let size_at_level = log_size >> (level * TILE_HEIGHT); + // A shift of >= 64 bits is undefined; levels that high can never have + // partial tiles for any representable tree size. 
+ let shift = level.saturating_mul(TILE_HEIGHT); + if shift >= 64 { + return 0; + } + let size_at_level = log_size >> shift; let full_tiles = size_at_level / TILE_WIDTH; if index < full_tiles { @@ -91,6 +97,11 @@ pub fn partial_tile_size(level: u64, index: u64, log_size: u64) -> u8 { /// /// Validates that level is between 0 and 63. pub fn parse_tile_level(level: &str) -> Result { + // Strict digits only: u64::parse also accepts a leading '+', which would + // create alias URLs for the same immutable tile (CDN cache ambiguity). + if level.is_empty() || !level.bytes().all(|b| b.is_ascii_digit()) { + return Err(Error::InvalidPath("invalid tile level".into())); + } let l: u64 = level .parse() .map_err(|_| Error::InvalidPath("invalid tile level".into()))?; @@ -113,6 +124,9 @@ pub fn parse_tile_level(level: &str) -> Result { pub fn parse_tile_index(index_str: &str) -> Result<(u64, u8)> { let (index_part, partial) = if let Some(pos) = index_str.find(".p/") { let partial_str = &index_str[pos + 3..]; + if partial_str.is_empty() || !partial_str.bytes().all(|b| b.is_ascii_digit()) { + return Err(Error::InvalidPath("invalid partial size".into())); + } let partial: u64 = partial_str .parse() .map_err(|_| Error::InvalidPath("invalid partial size".into()))?; @@ -146,7 +160,7 @@ pub fn parse_tile_index(index_str: &str) -> Result<(u64, u8)> { for part in parts { let digits = part.strip_prefix('x').unwrap_or(part); - if digits.len() != 3 { + if digits.len() != 3 || !digits.bytes().all(|b| b.is_ascii_digit()) { return Err(Error::InvalidPath( "each index component must be 3 digits".into(), )); diff --git a/src/api/rate_limit.rs b/src/api/rate_limit.rs index 24aa8e3..33f6a6c 100644 --- a/src/api/rate_limit.rs +++ b/src/api/rate_limit.rs @@ -13,14 +13,50 @@ pub const RATE_LIMIT_PER_SECOND: u64 = 100; /// Default burst capacity: maximum requests allowed in a burst. pub const RATE_LIMIT_BURST_SIZE: u32 = 200; +/// Requests per second per client IP (`RATE_LIMIT_PER_SECOND` env override). 
+pub fn rate_limit_per_second() -> u64 { + std::env::var("RATE_LIMIT_PER_SECOND") + .ok() + .and_then(|s| s.parse().ok()) + .filter(|&v| v > 0) + .unwrap_or(RATE_LIMIT_PER_SECOND) +} + +/// Token replenish interval in nanoseconds for the configured rate. +/// +/// tower_governor's `per_second(n)` sets the interval to replenish ONE +/// token to `n` seconds — it does NOT mean "n requests per second". To +/// allow R requests per second, one token must replenish every 1e9/R +/// nanoseconds. +pub fn replenish_interval_ns() -> u64 { + (1_000_000_000 / rate_limit_per_second()).max(1) +} + +/// Burst capacity per client IP (`RATE_LIMIT_BURST_SIZE` env override). +pub fn rate_limit_burst_size() -> u32 { + std::env::var("RATE_LIMIT_BURST_SIZE") + .ok() + .and_then(|s| s.parse().ok()) + .filter(|&v| v > 0) + .unwrap_or(RATE_LIMIT_BURST_SIZE) +} + /// Convert governor errors to HTTP responses. pub fn rate_limit_error_handler(error: GovernorError) -> Response { match error { - GovernorError::TooManyRequests { .. } => ( - StatusCode::TOO_MANY_REQUESTS, - "Too many requests. Please slow down.", - ) - .into_response(), + GovernorError::TooManyRequests { headers, .. } => { + // Preserve Retry-After / x-ratelimit-* headers so clients can + // back off intelligently. + let mut response = ( + StatusCode::TOO_MANY_REQUESTS, + "Too many requests. 
Please slow down.", + ) + .into_response(); + if let Some(headers) = headers { + response.headers_mut().extend(headers); + } + response + } GovernorError::UnableToExtractKey => ( StatusCode::INTERNAL_SERVER_ERROR, "Unable to extract rate limit key", @@ -42,10 +78,18 @@ mod tests { #[test] fn test_rate_limit_constants() { let config = tower_governor::governor::GovernorConfigBuilder::default() - .per_second(RATE_LIMIT_PER_SECOND) + .per_nanosecond(replenish_interval_ns()) .burst_size(RATE_LIMIT_BURST_SIZE) .finish(); assert!(config.is_some()); } + + #[test] + fn test_replenish_interval_semantics() { + // 100 req/s must replenish one token every 10ms — NOT one token + // every 100s, which is what per_second(100) would configure. + std::env::remove_var("RATE_LIMIT_PER_SECOND"); + assert_eq!(replenish_interval_ns(), 10_000_000); + } } diff --git a/src/bin/witness.rs b/src/bin/witness.rs index 353cdd0..2952a08 100644 --- a/src/bin/witness.rs +++ b/src/bin/witness.rs @@ -7,9 +7,14 @@ use axum::extract::DefaultBodyLimit; use clap::Parser; use sea_orm::{ConnectOptions, ConnectionTrait, Database as SeaDatabase, DatabaseConnection}; use sea_orm_migration::MigratorTrait; +use siglog::api::rate_limit; use siglog::witness::{handlers, LogConfig, Witness}; +use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; +use tower_governor::{ + governor::GovernorConfigBuilder, key_extractor::SmartIpKeyExtractor, GovernorLayer, +}; /// Maximum allowed size for witness request bodies (1MB). /// This prevents DoS attacks from extremely large checkpoint submissions. @@ -92,6 +97,28 @@ async fn main() -> anyhow::Result<()> { let witness = Arc::new(Witness::new(signer, conn, args.logs)); tracing::info!("Witness name: {}", witness.name()); + // Rate limiting: /add-checkpoint does signature verification and proof + // hashing per request and is unauthenticated. 
+ let rate_limit_config = Arc::new( + GovernorConfigBuilder::default() + // per_second(n) would mean "one request per n seconds"! + .per_nanosecond(rate_limit::replenish_interval_ns()) + .burst_size(rate_limit::rate_limit_burst_size()) + .key_extractor(SmartIpKeyExtractor) + .finish() + .expect("failed to create rate limit config"), + ); + let governor_limiter = rate_limit_config.limiter().clone(); + tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(60)); + loop { + interval.tick().await; + governor_limiter.retain_recent(); + } + }); + let governor_layer = + GovernorLayer::new(rate_limit_config).error_handler(rate_limit::rate_limit_error_handler); + // Build router with body size limit let app = axum::Router::new() .route( @@ -102,6 +129,11 @@ async fn main() -> anyhow::Result<()> { .route("/ready", axum::routing::get(handlers::ready)) .with_state(witness) .layer(DefaultBodyLimit::max(MAX_BODY_SIZE)) + .layer(governor_layer) + .layer(tower_http::timeout::TimeoutLayer::with_status_code( + axum::http::StatusCode::REQUEST_TIMEOUT, + Duration::from_secs(30), + )) .layer( tower_http::trace::TraceLayer::new_for_http() .make_span_with( @@ -122,17 +154,12 @@ async fn main() -> anyhow::Result<()> { let listener = tokio::net::TcpListener::bind(&args.listen).await?; tracing::info!("Witness server listening on {}", args.listen); - // Handle shutdown - let shutdown_signal = async { - tokio::signal::ctrl_c() - .await - .expect("failed to install Ctrl+C handler"); - tracing::info!("Shutdown signal received"); - }; - - axum::serve(listener, app) - .with_graceful_shutdown(shutdown_signal) - .await?; + axum::serve( + listener, + app.into_make_service_with_connect_info::(), + ) + .with_graceful_shutdown(siglog::shutdown::shutdown_signal()) + .await?; tracing::info!("Witness server stopped"); Ok(()) diff --git a/src/checkpoint/signer.rs b/src/checkpoint/signer.rs index 3b3a81a..9158f09 100644 --- a/src/checkpoint/signer.rs +++ 
b/src/checkpoint/signer.rs @@ -28,6 +28,18 @@ pub struct CheckpointSigner { /// Ed25519 algorithm identifier for note format. const ALG_ED25519: u8 = 0x01; +/// Ed25519 cosignature/v1 algorithm identifier (c2sp.org/tlog-cosignature). +pub const ALG_COSIGNATURE_V1: u8 = 0x04; + +/// Build the message signed by an Ed25519 cosignature/v1 cosigner. +/// +/// Per c2sp.org/tlog-cosignature: a `cosignature/v1` header line, a +/// `time ` line, then the whole note body of the checkpoint +/// (including its final newline). +pub fn cosignature_v1_message(timestamp: u64, body: &str) -> String { + format!("cosignature/v1\ntime {}\n{}", timestamp, body) +} + impl CheckpointSigner { /// Create a new checkpoint signer from a note-format private key. /// @@ -175,15 +187,50 @@ impl CheckpointSigner { signature, } } + + /// The key ID this signer uses for cosignature/v1 cosignatures. + /// + /// Computed with the cosignature/v1 algorithm byte (0x04), so it differs + /// from the plain note-signature key ID. + pub fn cosignature_v1_key_id(&self) -> KeyId { + compute_key_id_with_alg( + self.name.as_str(), + &self.signing_key.verifying_key(), + ALG_COSIGNATURE_V1, + ) + } + + /// Produce a C2SP cosignature/v1 cosignature over a checkpoint. + /// + /// `timestamp` is the POSIX time (seconds) at which the cosignature is + /// generated; it is bound into the signed message and encoded in the + /// signature blob. + pub fn cosign_v1(&self, checkpoint: &Checkpoint, timestamp: u64) -> CheckpointSignature { + let message = cosignature_v1_message(timestamp, &checkpoint.to_body()); + let signature = self.signing_key.sign(message.as_bytes()); + + CheckpointSignature { + name: self.name.clone(), + key_id: self.cosignature_v1_key_id(), + signature, + timestamp: Some(timestamp), + } + } } /// Compute the key ID for a verifying key per Go's note format. 
/// Hash = SHA256(name + "\n" + alg_byte + public_key)[:4] fn compute_key_id(name: &str, key: &VerifyingKey) -> KeyId { + compute_key_id_with_alg(name, key, ALG_ED25519) +} + +/// Compute a key ID with an explicit algorithm byte. +/// Hash = SHA256(name + "\n" + alg_byte + public_key)[:4] +pub fn compute_key_id_with_alg(name: &str, key: &VerifyingKey, alg: u8) -> KeyId { let mut hasher = Sha256::new(); hasher.update(name.as_bytes()); hasher.update(b"\n"); - hasher.update([ALG_ED25519]); // Ed25519 algorithm identifier + hasher.update([alg]); hasher.update(key.as_bytes()); let hash = hasher.finalize(); @@ -313,19 +360,35 @@ pub struct CheckpointSignature { pub key_id: KeyId, /// The signature. pub signature: Signature, + /// The POSIX timestamp for cosignature/v1 signatures. + /// + /// `None` for plain note signatures (e.g. the log's own signature); + /// `Some` for C2SP cosignature/v1 witness cosignatures, where the + /// timestamp is bound into the signed message. + pub timestamp: Option, } impl CheckpointSignature { /// Format as a signature line. + /// + /// Plain signatures encode `key_id(4) || sig(64)`; cosignature/v1 + /// signatures encode `key_id(4) || timestamp(8, BE) || sig(64)` per + /// c2sp.org/tlog-cosignature. pub fn to_line(&self) -> String { - let mut sig_data = Vec::with_capacity(4 + 64); + let mut sig_data = Vec::with_capacity(4 + 8 + 64); sig_data.extend_from_slice(self.key_id.as_bytes()); + if let Some(ts) = self.timestamp { + sig_data.extend_from_slice(&ts.to_be_bytes()); + } sig_data.extend_from_slice(&self.signature.to_bytes()); format!("— {} {}", self.name.as_str(), STANDARD.encode(&sig_data)) } /// Parse a signature line. + /// + /// Accepts both plain note signatures (68-byte blob) and cosignature/v1 + /// signatures (76-byte blob with an embedded big-endian timestamp). 
pub fn from_line(line: &str) -> Result { let line = line.trim(); if !line.starts_with("— ") { @@ -343,16 +406,25 @@ impl CheckpointSignature { .decode(parts[1]) .map_err(|e| Error::Config(format!("invalid signature base64: {}", e)))?; - if sig_data.len() != 68 { - return Err(Error::Config(format!( - "invalid signature length: expected 68, got {}", - sig_data.len() - ))); - } + let (timestamp, sig_bytes): (Option, &[u8]) = match sig_data.len() { + 68 => (None, &sig_data[4..]), + 76 => { + let ts_bytes: [u8; 8] = sig_data[4..12] + .try_into() + .map_err(|_| Error::Config("invalid timestamp bytes".into()))?; + (Some(u64::from_be_bytes(ts_bytes)), &sig_data[12..]) + } + n => { + return Err(Error::Config(format!( + "invalid signature length: expected 68 (plain) or 76 (cosignature/v1), got {}", + n + ))); + } + }; let key_id = KeyId::new([sig_data[0], sig_data[1], sig_data[2], sig_data[3]]); let signature = Signature::from_bytes( - sig_data[4..] + sig_bytes .try_into() .map_err(|_| Error::Config("invalid signature bytes".into()))?, ); @@ -361,6 +433,7 @@ impl CheckpointSignature { name, key_id, signature, + timestamp, }) } } @@ -404,6 +477,7 @@ impl SignedCheckpoint { name: self.signer_name, key_id: self.key_id, signature: self.signature, + timestamp: None, }], } } @@ -430,11 +504,12 @@ impl CosignedCheckpoint { name: signer.name.clone(), key_id: signer.key_id.clone(), signature, + timestamp: None, }], } } - /// Add a cosignature from a witness. + /// Add a plain note cosignature from a witness (non-spec, legacy). pub fn add_signature(&mut self, signer: &CheckpointSigner) { let body = self.checkpoint.to_body(); let signature = signer.signing_key.sign(body.as_bytes()); @@ -443,9 +518,16 @@ impl CosignedCheckpoint { name: signer.name.clone(), key_id: signer.key_id.clone(), signature, + timestamp: None, }); } + /// Add a C2SP cosignature/v1 cosignature from a witness signer. 
+ pub fn add_cosignature_v1(&mut self, signer: &CheckpointSigner, timestamp: u64) { + let cosig = signer.cosign_v1(&self.checkpoint, timestamp); + self.signatures.push(cosig); + } + /// Parse a cosigned checkpoint from text. pub fn from_text(text: &str) -> Result { let text = text.trim(); @@ -653,6 +735,56 @@ mod tests { assert_eq!(cosigned1.signature_count(), 3); } + #[test] + fn test_parse_spec_example_cosignature_line() { + // Example cosignature line from c2sp.org/tlog-cosignature. The blob + // is keyid(4) || timestamp(8, BE) || sig(64); the spec's message + // example uses "time 1679315147". + let line = "— witness.example.com/w1 jWbPPwAAAABkGFDLEZMHwSRaJNiIDoe9DYn/zXcrtPHeolMI5OWXEhZCB9dlrDJsX3b2oyin1nPZqhf5nNo0xUe+mbIUBkBIfZ+qnA=="; + let sig = CheckpointSignature::from_line(line).unwrap(); + assert_eq!(sig.name.as_str(), "witness.example.com/w1"); + assert_eq!(sig.timestamp, Some(1679315147)); + assert_eq!(sig.key_id.as_bytes(), &[0x8d, 0x66, 0xcf, 0x3f]); + // Round-trip back to the identical line. + assert_eq!(sig.to_line(), line); + } + + #[test] + fn test_cosignature_v1_message_format() { + // Exact message layout from the spec. 
+ let body = "example.com/behind-the-sofa\n20852163\nCsUYapGGPo4dkMgIAUqom/Xajj7h2fB2MPA3j2jxq2I=\n"; + let msg = cosignature_v1_message(1679315147, body); + assert_eq!( + msg, + "cosignature/v1\ntime 1679315147\nexample.com/behind-the-sofa\n20852163\nCsUYapGGPo4dkMgIAUqom/Xajj7h2fB2MPA3j2jxq2I=\n" + ); + } + + #[test] + fn test_cosign_v1_roundtrip_verifies() { + use ed25519_dalek::Verifier; + + let signer = CheckpointSigner::generate("witness.example.com"); + let checkpoint = Checkpoint::new( + Origin::new("example.com/log".to_string()).unwrap(), + TreeSize::new(42), + Sha256Hash::from_bytes([7u8; 32]), + ); + + let cosig = signer.cosign_v1(&checkpoint, 1679315147); + assert_eq!(cosig.timestamp, Some(1679315147)); + assert_eq!(cosig.key_id, signer.cosignature_v1_key_id()); + // The v1 key ID must differ from the plain note key ID. + assert_ne!(&cosig.key_id, signer.key_id()); + + let parsed = CheckpointSignature::from_line(&cosig.to_line()).unwrap(); + let msg = cosignature_v1_message(1679315147, &checkpoint.to_body()); + signer + .public_key() + .verify(msg.as_bytes(), &parsed.signature) + .expect("cosignature/v1 must verify over the timestamped message"); + } + #[test] fn test_origin_validation() { // Valid origins diff --git a/src/error.rs b/src/error.rs index 83c7d66..7cf3352 100644 --- a/src/error.rs +++ b/src/error.rs @@ -73,7 +73,10 @@ impl IntoResponse for Error { } _ => { tracing::error!("Error: {}", self); - (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()) + ( + StatusCode::INTERNAL_SERVER_ERROR, + "Internal server error".to_string(), + ) } }; diff --git a/src/lib.rs b/src/lib.rs index 5b6726b..ad99cb8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ pub mod api; pub mod checkpoint; pub mod error; +pub mod shutdown; pub mod merkle; pub mod migration; pub mod monitor; diff --git a/src/main.rs b/src/main.rs index a09872a..67f5432 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,15 +4,19 @@ use axum::extract::DefaultBodyLimit; use 
clap::Parser; use siglog::api::handlers::{self, AppState}; use siglog::api::rate_limit; +use siglog::checkpoint::signer::Origin; use siglog::checkpoint::CheckpointSigner; use siglog::sequencer::{Sequencer, SequencerConfig}; use siglog::storage::{Database, TileStorage}; use siglog::vindex; use siglog::worker::{self, WorkerConfig}; +use std::net::SocketAddr; use std::sync::Arc; use std::time::Duration; use tokio::sync::watch; -use tower_governor::{governor::GovernorConfigBuilder, GovernorLayer}; +use tower_governor::{ + governor::GovernorConfigBuilder, key_extractor::SmartIpKeyExtractor, GovernorLayer, +}; /// Siglog - A minimal Tessera-compatible transparency log server. #[derive(Parser, Debug)] @@ -85,12 +89,19 @@ struct Args { #[arg(long, env = "WITNESS_KEYS")] witness_keys: Option, - /// External witness URLs (comma-separated). - /// Format: name=url,name2=url2 - /// Example: --external-witnesses "witness1=http://localhost:8081,monitor=http://localhost:8082" + /// External witnesses (comma-separated). + /// Format: name=url=vkey where vkey is the witness's note-format + /// verification key (name+hash+base64). Cosignatures are verified + /// against this pinned key before counting toward the quorum. + /// Example: --external-witnesses "w1=http://localhost:8081=w1+deadbeef+AQ..." #[arg(long, env = "EXTERNAL_WITNESSES")] external_witnesses: Option, + /// Minimum number of external witness cosignatures required to publish + /// a checkpoint. Defaults to all configured external witnesses. + #[arg(long, env = "WITNESS_QUORUM")] + witness_quorum: Option, + /// API key for authenticating write requests (optional). /// When set, the /add endpoint requires an Authorization: Bearer header. #[arg(long, env = "API_KEY")] @@ -139,6 +150,12 @@ async fn main() -> anyhow::Result<()> { ); } + // Validate the origin up front so a bad value fails startup instead of + // killing the checkpoint worker after the server is already accepting + // writes. 
+ Origin::new(args.origin.clone()) + .map_err(|e| anyhow::anyhow!("invalid LOG_ORIGIN '{}': {}", args.origin, e))?; + // Initialize database tracing::info!("Connecting to database..."); let db = Database::connect(&args.database_url).await?; @@ -186,50 +203,56 @@ async fn main() -> anyhow::Result<()> { tracing::info!("Checkpoint signer initialized: {}", signer.name()); // Initialize in-process witnesses (for testing/development) - let witnesses: Vec> = if let Some(witness_keys) = &args.witness_keys { - witness_keys - .split(',') - .filter(|k| !k.trim().is_empty()) - .map(|key| { - let signer = - CheckpointSigner::from_note_key(key.trim()).expect("invalid witness key"); - tracing::info!("In-process witness initialized: {}", signer.name()); - Arc::new(signer) - }) - .collect() - } else { - Vec::new() - }; + let mut witnesses: Vec> = Vec::new(); + if let Some(witness_keys) = &args.witness_keys { + for key in witness_keys.split(',').filter(|k| !k.trim().is_empty()) { + let signer = CheckpointSigner::from_note_key(key.trim()) + .map_err(|e| anyhow::anyhow!("invalid in-process witness key: {}", e))?; + tracing::info!("In-process witness initialized: {}", signer.name()); + witnesses.push(Arc::new(signer)); + } + } tracing::info!("{} in-process witnesses configured", witnesses.len()); - // Parse external witness URLs - let external_witnesses: Vec = - if let Some(ext_witnesses) = &args.external_witnesses { - ext_witnesses - .split(',') - .filter(|s| !s.trim().is_empty()) - .map(|s| { - let parts: Vec<&str> = s.trim().splitn(2, '=').collect(); - if parts.len() != 2 { - panic!( - "invalid external witness format: expected 'name=url', got '{}'", - s - ); - } - let witness = worker::ExternalWitness::new(parts[0], parts[1]); - tracing::info!( - "External witness configured: {} -> {}", - witness.name, - witness.url - ); - witness - }) - .collect() - } else { - Vec::new() - }; + // Parse external witnesses (name=url=vkey) + let mut external_witnesses: Vec = Vec::new(); + if let 
Some(ext_witnesses) = &args.external_witnesses { + for s in ext_witnesses.split(',').filter(|s| !s.trim().is_empty()) { + let parts: Vec<&str> = s.trim().splitn(3, '=').collect(); + if parts.len() != 3 { + anyhow::bail!( + "invalid external witness format: expected 'name=url=vkey', got '{}'. \ + The verification key is required so cosignatures can be verified.", + s + ); + } + let witness = worker::ExternalWitness::new(parts[0], parts[1], parts[2]) + .map_err(|e| anyhow::anyhow!("invalid external witness '{}': {}", parts[0], e))?; + tracing::info!( + "External witness configured: {} -> {}", + witness.name, + witness.url + ); + external_witnesses.push(witness); + } + } tracing::info!("{} external witnesses configured", external_witnesses.len()); + if let Some(q) = args.witness_quorum { + if q > external_witnesses.len() { + anyhow::bail!( + "WITNESS_QUORUM ({}) exceeds the number of configured external witnesses ({})", + q, + external_witnesses.len() + ); + } + tracing::info!( + "Checkpoint publication quorum: {}/{} external witnesses", + q, + external_witnesses.len() + ); + } + // Create shutdown channel let (shutdown_tx, shutdown_rx) = watch::channel(false); @@ -241,8 +264,10 @@ async fn main() -> anyhow::Result<()> { }; let (sequencer, sequencer_task) = Sequencer::new(db.clone(), sequencer_config); - // Spawn sequencer - tokio::spawn(sequencer_task); + // Spawn sequencer (supervised below: if it dies, the process exits so + // the orchestrator can restart it, instead of silently acking writes + // that never get sequenced). 
+ let sequencer_handle = tokio::spawn(sequencer_task); // Configure workers let worker_config = WorkerConfig { @@ -250,6 +275,7 @@ async fn main() -> anyhow::Result<()> { integration_batch_size: 1024, checkpoint_interval: Duration::from_secs(args.checkpoint_interval), origin: args.origin.clone(), + witness_quorum: args.witness_quorum, }; // Initialize vindex if enabled (before spawning workers) @@ -264,13 +290,28 @@ async fn main() -> anyhow::Result<()> { let expected_tree_size = log_state.integrated_size.value(); let vi = if let Some(wal_path) = &args.vindex_wal_path { - // Validate the WAL against the database state after a crash. + // Validate the snapshot + WAL against the database state after a + // crash. If the on-disk state is unusable (missing, corrupted, or + // behind the database), rebuild it from the log's entry bundles — + // the log itself is the source of truth for the index. tracing::info!( "Vindex WAL path: {}, expected tree size from DB: {}", wal_path, expected_tree_size ); - vindex::VerifiableIndex::with_wal(map_fn, wal_path, expected_tree_size)? + match vindex::VerifiableIndex::with_wal(map_fn.clone(), wal_path, expected_tree_size) { + Ok(vi) => vi, + Err(e) => { + tracing::warn!("Vindex state unusable ({}); rebuilding from log storage", e); + vindex::VerifiableIndex::rebuild_from_storage( + map_fn, + wal_path, + expected_tree_size, + &storage, + ) + .await? 
+ } + } } else { if expected_tree_size > 0 { anyhow::bail!( @@ -292,7 +333,7 @@ async fn main() -> anyhow::Result<()> { }; // Spawn integration worker (with optional vindex) - tokio::spawn(worker::run_integration_worker( + let integration_handle = tokio::spawn(worker::run_integration_worker( db.clone(), storage.clone(), worker_config.clone(), @@ -301,7 +342,7 @@ async fn main() -> anyhow::Result<()> { )); // Spawn checkpoint worker - tokio::spawn(worker::run_checkpoint_worker( + let checkpoint_handle = tokio::spawn(worker::run_checkpoint_worker( db.clone(), storage.clone(), signer.clone(), @@ -324,20 +365,38 @@ async fn main() -> anyhow::Result<()> { } let state = Arc::new(state); - // Configure rate limiting + // Configure rate limiting. SmartIpKeyExtractor prefers standard proxy + // headers (x-forwarded-for, x-real-ip, forwarded) and falls back to the + // peer address, so per-client limits survive a reverse proxy. + let rate_limit_rps = rate_limit::rate_limit_per_second(); + let rate_limit_burst = rate_limit::rate_limit_burst_size(); let rate_limit_config = Arc::new( GovernorConfigBuilder::default() - .per_second(rate_limit::RATE_LIMIT_PER_SECOND) - .burst_size(rate_limit::RATE_LIMIT_BURST_SIZE) + // per_second(n) would mean "one request per n seconds"! + .per_nanosecond(rate_limit::replenish_interval_ns()) + .burst_size(rate_limit_burst) + .key_extractor(SmartIpKeyExtractor) .finish() .expect("failed to create rate limit config"), ); + + // tower_governor keeps one bucket per client key; without periodic + // cleanup that map grows forever. 
+ let governor_limiter = rate_limit_config.limiter().clone(); + tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(60)); + loop { + interval.tick().await; + governor_limiter.retain_recent(); + } + }); + let governor_layer = GovernorLayer::new(rate_limit_config).error_handler(rate_limit::rate_limit_error_handler); tracing::info!( "Rate limiting enabled: {} req/s per IP, burst {}", - rate_limit::RATE_LIMIT_PER_SECOND, - rate_limit::RATE_LIMIT_BURST_SIZE + rate_limit_rps, + rate_limit_burst ); // Build router @@ -374,6 +433,10 @@ async fn main() -> anyhow::Result<()> { .with_state(state) .layer(DefaultBodyLimit::max(handlers::MAX_ENTRY_SIZE)) .layer(governor_layer) + .layer(tower_http::timeout::TimeoutLayer::with_status_code( + axum::http::StatusCode::REQUEST_TIMEOUT, + Duration::from_secs(30), + )) .layer( tower_http::trace::TraceLayer::new_for_http() .make_span_with( @@ -395,18 +458,44 @@ async fn main() -> anyhow::Result<()> { args.listen ); - // Handle shutdown + // Handle shutdown (SIGINT and SIGTERM) + let shutting_down = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let shutdown_flag = shutting_down.clone(); let shutdown_signal = async move { - tokio::signal::ctrl_c() - .await - .expect("failed to install Ctrl+C handler"); - tracing::info!("Shutdown signal received"); + siglog::shutdown::shutdown_signal().await; + shutdown_flag.store(true, std::sync::atomic::Ordering::SeqCst); let _ = shutdown_tx.send(true); }; - axum::serve(listener, app) - .with_graceful_shutdown(shutdown_signal) - .await?; + // Supervise the background pipeline: if any worker dies (panic or + // unexpected return) outside of shutdown, exit so the orchestrator + // restarts the process, instead of accepting writes that are never + // integrated or published. + let supervisor_flag = shutting_down.clone(); + tokio::spawn(async move { + let reason = tokio::select! 
{ + r = sequencer_handle => format!("sequencer task exited: {:?}", r), + r = integration_handle => format!("integration worker exited: {:?}", r), + r = checkpoint_handle => format!("checkpoint worker exited: {:?}", r), + }; + if !supervisor_flag.load(std::sync::atomic::Ordering::SeqCst) { + tracing::error!( + "{}; exiting so the orchestrator can restart the process", + reason + ); + std::process::exit(1); + } + }); + + // ConnectInfo is required by the rate limiter's key extractor as the + // fallback when no proxy headers are present; without it every request + // fails with "unable to extract rate limit key". + axum::serve( + listener, + app.into_make_service_with_connect_info::(), + ) + .with_graceful_shutdown(shutdown_signal) + .await?; tracing::info!("Server stopped"); Ok(()) diff --git a/src/merkle/proof.rs b/src/merkle/proof.rs index 830a1c1..fb998bc 100644 --- a/src/merkle/proof.rs +++ b/src/merkle/proof.rs @@ -175,6 +175,13 @@ pub async fn compute_subtree_hash( /// Read a leaf hash from level 0 tiles. async fn read_leaf_hash(storage: &TileStorage, index: u64, tree_size: u64) -> Result { + if index >= tree_size { + return Err(Error::NotFound(format!( + "leaf {} beyond tree size {}", + index, tree_size + ))); + } + let tile_index = index / TILE_WIDTH; let offset = (index % TILE_WIDTH) as usize; diff --git a/src/monitor/mod.rs b/src/monitor/mod.rs index 6d35a1e..70dfbf9 100644 --- a/src/monitor/mod.rs +++ b/src/monitor/mod.rs @@ -22,7 +22,6 @@ use crate::witness::{ WitnessStateStore, WitnessedState, }; use async_trait::async_trait; -use ed25519_dalek::Signer; use sea_orm::DatabaseConnection; use std::sync::Arc; @@ -143,7 +142,13 @@ impl MonitoringWitness { conn: conn.clone(), state_store: WitnessStateStore::new(conn), logs, - http_client: reqwest::Client::new(), + // Without a timeout, a hung upstream log stalls add-checkpoint + // handlers indefinitely. 
+ http_client: reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .connect_timeout(std::time::Duration::from_secs(10)) + .build() + .expect("failed to build HTTP client"), } } @@ -278,14 +283,17 @@ impl MonitoringWitness { } } - // 9. Create cosignature - let body = checkpoint.checkpoint.to_body(); - let signature = self.signer.signing_key_ref().sign(body.as_bytes()); - let cosig = CheckpointSignature { - name: self.signer.name().clone(), - key_id: self.signer.key_id().clone(), - signature, - }; + // 9. Create cosignature/v1 (c2sp.org/tlog-cosignature) + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(|e| { + MonitorError::Witness(WitnessError::Internal(format!( + "system clock error: {}", + e + ))) + })? + .as_secs(); + let cosig = self.signer.cosign_v1(&checkpoint.checkpoint, timestamp); // 10. Commit the validated entries to the monitor's index and database if new_size > state.size { @@ -300,9 +308,17 @@ impl MonitoringWitness { })?; } - // 11. Update witness state - self.state_store - .update(origin, new_size, new_root, &request.checkpoint) + // 11. Update witness state (CAS against the state we verified). + let outcome = self + .state_store + .update( + origin, + state.size, + &state.root_hash, + new_size, + new_root, + &request.checkpoint, + ) .await .map_err(|e| { MonitorError::Witness(WitnessError::Internal(format!( @@ -311,6 +327,10 @@ impl MonitoringWitness { ))) })?; + if let crate::witness::UpdateOutcome::Conflict { current_size } = outcome { + return Err(MonitorError::Witness(WitnessError::Conflict(current_size))); + } + Ok(cosig) } diff --git a/src/shutdown.rs b/src/shutdown.rs new file mode 100644 index 0000000..6bfe912 --- /dev/null +++ b/src/shutdown.rs @@ -0,0 +1,29 @@ +//! Shared shutdown signal handling for the server binaries. + +/// Wait for a shutdown signal: SIGINT (Ctrl+C) or, on Unix, SIGTERM. 
+/// +/// Container orchestrators (Docker, Kubernetes, Fly.io) stop services with +/// SIGTERM; handling only Ctrl+C would turn every deploy into a hard kill. +pub async fn shutdown_signal() { + let ctrl_c = async { + tokio::signal::ctrl_c() + .await + .expect("failed to install Ctrl+C handler"); + }; + + #[cfg(unix)] + let terminate = async { + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + .expect("failed to install SIGTERM handler") + .recv() + .await; + }; + + #[cfg(not(unix))] + let terminate = std::future::pending::<()>(); + + tokio::select! { + _ = ctrl_c => tracing::info!("SIGINT received, shutting down"), + _ = terminate => tracing::info!("SIGTERM received, shutting down"), + } +} diff --git a/src/storage/database.rs b/src/storage/database.rs index cfe68da..87b54d4 100644 --- a/src/storage/database.rs +++ b/src/storage/database.rs @@ -76,9 +76,15 @@ impl Database { .await? .ok_or_else(|| Error::Internal("log state not found".into()))?; + // A corrupted root hash must be a loud error: silently mapping it to + // None makes the checkpoint worker stop publishing with no logs. let root_hash = row .root_hash - .and_then(|bytes| Sha256Hash::try_from_slice(&bytes).ok()); + .map(|bytes| { + Sha256Hash::try_from_slice(&bytes) + .map_err(|e| Error::Internal(format!("corrupted root hash in log state: {}", e))) + }) + .transpose()?; Ok(LogState { next_index: LogIndex::new(row.next_index as u64), diff --git a/src/storage/opendal.rs b/src/storage/opendal.rs index 57dd004..5f83a49 100644 --- a/src/storage/opendal.rs +++ b/src/storage/opendal.rs @@ -43,8 +43,15 @@ impl TileStorage { } /// Create a new tile storage with filesystem backend. + /// + /// Writes go through a temp directory and are renamed into place, so a + /// crash mid-write can never leave a torn tile or checkpoint, and a + /// concurrent `GET /checkpoint` never observes a partially-written file. 
pub fn new_fs(root: &str) -> Result { - let builder = Fs::default().root(root); + let atomic_dir = std::path::Path::new(root).join(".tmp"); + let builder = Fs::default() + .root(root) + .atomic_write_dir(&atomic_dir.to_string_lossy()); let op = Operator::new(builder)?.finish(); diff --git a/src/vindex/mod.rs b/src/vindex/mod.rs index 748b4b0..e223891 100644 --- a/src/vindex/mod.rs +++ b/src/vindex/mod.rs @@ -15,15 +15,18 @@ //! - PrefixTree: Merkle tree for verifiable proofs mod prefix_tree; +mod snapshot; mod wal; use crate::error::{Error, Result}; use crate::types::LogIndex; pub use prefix_tree::{LookupProof, PrefixTree, ProofNode}; use sha2::{Digest, Sha256}; +pub use snapshot::snapshot_path; use std::collections::{HashMap, HashSet}; use std::io::Write as _; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, RwLock}; pub use wal::{ validate_and_truncate_wal, BatchedBinaryWalWriter, BatchedWalWriter, BinaryWalWriter, @@ -118,6 +121,9 @@ pub struct LookupResult { pub found: bool, /// The inclusion/exclusion proof from the prefix tree. pub proof: Vec, + /// The prefix tree root hash the proof verifies against, captured under + /// the same lock as the proof so the pair is always consistent. + pub root_hash: IndexKey, } /// WAL writer variant (batched or unbatched). @@ -140,6 +146,13 @@ impl WalWriterVariant { WalWriterVariant::Batched(w) => w.flush(), } } + + fn truncate(&mut self) -> Result<()> { + match self { + WalWriterVariant::Unbatched(w) => w.truncate(), + WalWriterVariant::Batched(w) => w.truncate(), + } + } } /// The verifiable index maintains a mapping from keys to log indices. @@ -148,6 +161,12 @@ pub struct VerifiableIndex { index: RwLock>>, /// WAL writer for persistence. wal_writer: Option>, + /// Snapshot file path (when WAL persistence is enabled). + snapshot_file: Option, + /// Entries indexed since the last snapshot (drives compaction). 
+ entries_since_snapshot: AtomicU64, + /// Snapshot after this many new entries (0 = disabled). + snapshot_interval: u64, /// The map function for extracting keys. map_fn: Arc, /// Current tree size (number of entries indexed). @@ -170,6 +189,12 @@ impl VerifiableIndex { /// Maximum number of keys that can be represented for one entry in the WAL. const MAX_KEYS_PER_ENTRY: usize = u8::MAX as usize; + /// Default number of new entries between snapshots. + /// + /// Each snapshot durably captures the full index and truncates the WAL, + /// bounding WAL size and startup replay time. + const DEFAULT_SNAPSHOT_INTERVAL: u64 = 100_000; + /// Read max_keys from environment or use default. fn get_max_keys() -> usize { std::env::var("VINDEX_MAX_KEYS") @@ -178,6 +203,14 @@ impl VerifiableIndex { .unwrap_or(Self::DEFAULT_MAX_KEYS) } + /// Read the snapshot interval from the environment or use the default. + fn get_snapshot_interval() -> u64 { + std::env::var("VINDEX_SNAPSHOT_INTERVAL") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(Self::DEFAULT_SNAPSHOT_INTERVAL) + } + /// Read max_indices_per_key from environment or use default. fn get_max_indices_per_key() -> usize { std::env::var("VINDEX_MAX_INDICES_PER_KEY") @@ -209,6 +242,9 @@ impl VerifiableIndex { Self { index: RwLock::new(HashMap::new()), wal_writer: None, + snapshot_file: None, + entries_since_snapshot: AtomicU64::new(0), + snapshot_interval: 0, map_fn, tree_size: RwLock::new(0), prefix_tree: RwLock::new(PrefixTree::new()), @@ -253,37 +289,69 @@ impl VerifiableIndex { batch_size: usize, ) -> Result { let wal_path = wal_path.as_ref(); + let snapshot_file = snapshot::snapshot_path(wal_path); + + // Load the last snapshot if present. An invalid snapshot is treated + // as absent (read_snapshot logs a warning); the coverage check below + // then fails and the caller can rebuild from log storage. 
+ let (mut index, base_size) = match snapshot::read_snapshot(&snapshot_file) { + Some((size, idx)) => { + tracing::info!( + "Vindex snapshot loaded: tree_size={}, {} keys", + size, + idx.len() + ); + (idx, size) + } + None => (HashMap::new(), 0), + }; + + if base_size > expected_tree_size { + return Err(Error::Internal(format!( + "vindex snapshot is ahead of database state: snapshot tree_size={}, database \ + integrated_size={}. Rebuild the vindex before enabling it.", + base_size, expected_tree_size + ))); + } // Validate and truncate WAL to match expected tree size // This is critical for crash recovery: if the WAL was flushed but the database // wasn't updated before a crash, we truncate the WAL to avoid duplicates let actual_wal_size = validate_and_truncate_wal(wal_path, expected_tree_size)?; tracing::info!( - "WAL validated: expected_tree_size={}, actual_wal_size={}", + "WAL validated: expected_tree_size={}, snapshot_size={}, actual_wal_size={}", expected_tree_size, + base_size, actual_wal_size ); - if actual_wal_size < expected_tree_size { + let covered = base_size.max(actual_wal_size); + if covered < expected_tree_size { return Err(Error::Internal(format!( - "vindex WAL is behind database state: WAL tree_size={}, database integrated_size={}. \ - Rebuild the vindex before enabling it.", - actual_wal_size, expected_tree_size + "vindex state is behind the database: snapshot+WAL cover tree_size={}, database \ + integrated_size={}. Rebuild the vindex before enabling it.", + covered, expected_tree_size ))); } - // Create or open WAL - let mut tree_size = 0u64; - let mut index: HashMap> = HashMap::new(); + let mut tree_size = base_size; let mut prefix_tree = PrefixTree::new(); - // Track seen (idx, key) pairs to prevent duplicates (defense in depth) + // Track seen (idx, key) pairs to prevent duplicates (defense in depth). + // Only WAL entries after the snapshot are replayed, so this set is + // bounded by the snapshot interval, not the full history. 
let mut seen: HashSet<(u64, IndexKey)> = HashSet::new(); - // Replay existing WAL if it exists + // Replay WAL entries on top of the snapshot if wal_path.exists() { let mut reader = WalReader::open(wal_path)?; while let Some((idx, keys)) = reader.next_entry()? { + // Entries below the snapshot's tree size are already in + // the snapshot (e.g. after a crash between snapshot write and + // WAL truncation). + if idx.value() < base_size { + continue; + } for key in keys { // Deduplicate: only add if we haven't seen this (idx, key) pair if seen.insert((idx.value(), key)) { @@ -292,20 +360,20 @@ impl VerifiableIndex { } tree_size = tree_size.max(idx.value() + 1); } + } - // Rebuild prefix tree from index - for (key, indices) in &index { - let value_hash = compute_value_hash(indices); - prefix_tree.insert(key, value_hash); - } - - tracing::info!( - "WAL replayed: {} unique keys, tree_size={}", - index.len(), - tree_size - ); + // Rebuild prefix tree from the combined index + for (key, indices) in &index { + let value_hash = compute_value_hash(indices); + prefix_tree.insert(key, value_hash); } + tracing::info!( + "Vindex restored: {} unique keys, tree_size={}", + index.len(), + tree_size + ); + // Create writer (batched or unbatched based on batch_size) let wal_writer = if batch_size > 1 { tracing::info!("Using batched WAL writer with batch_size={}", batch_size); @@ -316,16 +384,23 @@ impl VerifiableIndex { let max_keys = Self::get_max_keys(); let max_indices_per_key = Self::get_max_indices_per_key(); + let snapshot_interval = Self::get_snapshot_interval(); tracing::info!( - "VerifiableIndex limits: max_keys={}, max_indices_per_key={}", + "VerifiableIndex limits: max_keys={}, max_indices_per_key={}, snapshot_interval={}", max_keys, - max_indices_per_key + max_indices_per_key, + snapshot_interval ); Ok(Self { index: RwLock::new(index), wal_writer: Some(RwLock::new(wal_writer)), + snapshot_file: Some(snapshot_file), + // WAL entries not yet covered by a snapshot count
toward the next + // snapshot trigger. + entries_since_snapshot: AtomicU64::new(tree_size - base_size), + snapshot_interval, map_fn, tree_size: RwLock::new(tree_size), prefix_tree: RwLock::new(prefix_tree), @@ -337,7 +412,15 @@ impl VerifiableIndex { /// Index a new entry at the given log index. /// /// Extracts keys from the entry data and adds them to the index. + /// + /// Idempotent: entries below the current tree size were already indexed + /// and are skipped, so a retried integration cycle (e.g. after a + /// transient database error) cannot duplicate indices. pub fn index_entry(&self, idx: LogIndex, data: &[u8]) -> Result<()> { + if idx.value() < *self.tree_size.read().unwrap() { + return Ok(()); + } + let keys = self.map_fn.map(data); if keys.len() > Self::MAX_KEYS_PER_ENTRY { @@ -356,8 +439,11 @@ impl VerifiableIndex { } // Still need to update tree size even if no keys - let mut tree_size = self.tree_size.write().unwrap(); - *tree_size = (*tree_size).max(idx.value() + 1); + { + let mut tree_size = self.tree_size.write().unwrap(); + *tree_size = (*tree_size).max(idx.value() + 1); + } + self.entries_since_snapshot.fetch_add(1, Ordering::Relaxed); return Ok(()); } @@ -419,6 +505,11 @@ impl VerifiableIndex { for key in &keys { let indices = index.entry(*key).or_default(); + // Defense in depth: never record the same index twice for a + // key (mirrors the dedup applied during WAL replay). 
+ if indices.contains(&idx) { + continue; + } indices.push(idx); // Update the prefix tree with the new value hash @@ -432,6 +523,7 @@ impl VerifiableIndex { let mut tree_size = self.tree_size.write().unwrap(); *tree_size = (*tree_size).max(idx.value() + 1); } + self.entries_since_snapshot.fetch_add(1, Ordering::Relaxed); Ok(()) } @@ -444,12 +536,14 @@ impl VerifiableIndex { let indices = index.get(key).cloned().unwrap_or_default(); let lookup_proof = prefix_tree.lookup(key); + let root_hash = prefix_tree.root_hash(); LookupResult { indices, tree_size, found: lookup_proof.found, proof: lookup_proof.proof, + root_hash, } } @@ -479,6 +573,115 @@ impl VerifiableIndex { Ok(()) } + /// Write a snapshot of the full index and truncate the WAL. + /// + /// This bounds WAL growth and startup replay time. The WAL writer lock + /// is held for the whole operation so no entry can be appended between + /// the snapshot capture and the WAL truncation (such an entry would be + /// lost by the truncate). + pub fn snapshot(&self) -> Result<()> { + let (Some(wal), Some(snapshot_file)) = (&self.wal_writer, &self.snapshot_file) else { + return Ok(()); + }; + + let mut wal = wal.write().unwrap(); + wal.flush()?; + + { + let index = self.index.read().unwrap(); + let tree_size = *self.tree_size.read().unwrap(); + snapshot::write_snapshot(snapshot_file, tree_size, &index)?; + tracing::info!( + "Vindex snapshot written: tree_size={}, {} keys", + tree_size, + index.len() + ); + } + + wal.truncate()?; + self.entries_since_snapshot.store(0, Ordering::Relaxed); + Ok(()) + } + + /// Snapshot and compact the WAL if enough entries accumulated since the + /// last snapshot. Returns whether a snapshot was written. 
+ pub fn maybe_snapshot(&self) -> Result { + if self.snapshot_interval == 0 || self.wal_writer.is_none() { + return Ok(false); + } + if self.entries_since_snapshot.load(Ordering::Relaxed) < self.snapshot_interval { + return Ok(false); + } + self.snapshot()?; + Ok(true) + } + + /// Rebuild the index from the log's entry bundles in tile storage. + /// + /// Used when the on-disk vindex state (snapshot + WAL) is missing, + /// corrupted, or behind the database. The log itself is the source of + /// truth: every integrated entry lives in an entry bundle, so the index + /// can always be reconstructed. Writes a fresh WAL and snapshot. + pub async fn rebuild_from_storage( + map_fn: Arc, + wal_path: impl AsRef, + expected_tree_size: u64, + storage: &crate::storage::TileStorage, + ) -> Result { + use crate::api::paths::{partial_tile_size, ENTRY_BUNDLE_WIDTH}; + use crate::types::{PartialSize, TileIndex}; + + let wal_path = wal_path.as_ref(); + + // Start from a clean slate. + let snapshot_file = snapshot::snapshot_path(wal_path); + let _ = std::fs::remove_file(&snapshot_file); + let _ = std::fs::remove_file(wal_path); + + let vi = Self::with_wal(map_fn, wal_path, 0)?; + if expected_tree_size == 0 { + return Ok(vi); + } + + tracing::info!( + "Rebuilding vindex from log storage ({} entries)...", + expected_tree_size + ); + + let last_bundle = (expected_tree_size - 1) / ENTRY_BUNDLE_WIDTH; + for bundle_idx in 0..=last_bundle { + let partial = partial_tile_size(0, bundle_idx, expected_tree_size); + let bundle = storage + .read_entry_bundle(TileIndex::new(bundle_idx), PartialSize::new(partial)) + .await? 
+ .ok_or_else(|| { + Error::Internal(format!( + "missing entry bundle {} during vindex rebuild", + bundle_idx + )) + })?; + + for (offset, data) in bundle.entries.iter().enumerate() { + let idx = bundle_idx * ENTRY_BUNDLE_WIDTH + offset as u64; + if idx >= expected_tree_size { + break; + } + vi.index_entry(LogIndex::new(idx), data.as_bytes())?; + } + } + + vi.flush()?; + vi.snapshot()?; + + tracing::info!( + "Vindex rebuilt: {} keys from {} entries", + vi.key_count(), + vi.tree_size() + ); + + Ok(vi) + } + /// Get the root hash of the prefix tree. /// /// This hash commits to the entire index state and can be used @@ -704,6 +907,134 @@ mod tests { assert!(result.is_err()); } + #[test] + fn test_snapshot_compacts_wal_and_restores() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("vindex.wal"); + + let root_before; + { + let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let index = VerifiableIndex::with_wal(map_fn, &path, 0).unwrap(); + index + .index_entry(LogIndex::new(0), br#"{"name": "foo"}"#) + .unwrap(); + index + .index_entry(LogIndex::new(1), br#"{"name": "bar"}"#) + .unwrap(); + index.flush().unwrap(); + + // Snapshot and compact. + index.snapshot().unwrap(); + assert_eq!(std::fs::metadata(&path).unwrap().len(), 0, "WAL truncated"); + + // More entries after the snapshot land in the WAL only. + index + .index_entry(LogIndex::new(2), br#"{"name": "foo"}"#) + .unwrap(); + index.flush().unwrap(); + assert!(std::fs::metadata(&path).unwrap().len() > 0); + + root_before = index.root_hash(); + } + + // Restart: state must be identical (snapshot + WAL replay). 
+ let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let index = VerifiableIndex::with_wal(map_fn, &path, 3).unwrap(); + assert_eq!(index.tree_size(), 3); + assert_eq!(index.key_count(), 2); + assert_eq!(index.root_hash(), root_before); + + let result = index.lookup_string("foo"); + assert_eq!( + result.indices.iter().map(|i| i.value()).collect::>(), + vec![0, 2] + ); + } + + #[test] + fn test_crash_between_snapshot_and_truncate() { + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("vindex.wal"); + + let root_before; + { + let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let index = VerifiableIndex::with_wal(map_fn, &path, 0).unwrap(); + index + .index_entry(LogIndex::new(0), br#"{"name": "foo"}"#) + .unwrap(); + index + .index_entry(LogIndex::new(1), br#"{"name": "bar"}"#) + .unwrap(); + index.flush().unwrap(); + root_before = index.root_hash(); + + // Simulate a crash between snapshot write and WAL truncation: + // write the snapshot directly, leaving the full WAL in place. + let idx_map = index.index.read().unwrap().clone(); + snapshot::write_snapshot(&snapshot::snapshot_path(&path), 2, &idx_map).unwrap(); + } + assert!(std::fs::metadata(&path).unwrap().len() > 0, "WAL not truncated"); + + // Restart: pre-snapshot WAL entries must not be double-applied. + let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let index = VerifiableIndex::with_wal(map_fn, &path, 2).unwrap(); + assert_eq!(index.tree_size(), 2); + assert_eq!(index.key_count(), 2); + assert_eq!(index.root_hash(), root_before); + assert_eq!(index.lookup_string("foo").indices.len(), 1); + } + + #[tokio::test] + async fn test_rebuild_from_storage() { + use crate::merkle::EntryBundle; + use crate::storage::TileStorage; + use crate::types::{EntryData, PartialSize, TileIndex}; + + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("vindex.wal"); + + // Log storage with one partial entry bundle of 3 entries. 
+ let storage = TileStorage::new( + opendal::Operator::new(opendal::services::Memory::default()) + .unwrap() + .finish(), + ); + let entries = vec![ + EntryData::from(r#"{"name": "foo"}"#), + EntryData::from(r#"{"name": "bar"}"#), + EntryData::from(r#"{"name": "foo"}"#), + ]; + storage + .write_entry_bundle( + TileIndex::new(0), + PartialSize::new(3), + &EntryBundle::with_entries(entries), + ) + .await + .unwrap(); + + // No WAL/snapshot exists: rebuild from storage. + let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let index = VerifiableIndex::rebuild_from_storage(map_fn, &path, 3, &storage) + .await + .unwrap(); + + assert_eq!(index.tree_size(), 3); + assert_eq!(index.key_count(), 2); + let result = index.lookup_string("foo"); + assert_eq!( + result.indices.iter().map(|i| i.value()).collect::>(), + vec![0, 2] + ); + + // The rebuild persisted a snapshot: a plain restart must now work. + let map_fn = Arc::new(JsonKeysMapFn::new("name")); + let restored = VerifiableIndex::with_wal(map_fn, &path, 3).unwrap(); + assert_eq!(restored.root_hash(), index.root_hash()); + } + #[test] fn test_hash_key() { let key1 = hash_key("foo"); diff --git a/src/vindex/snapshot.rs b/src/vindex/snapshot.rs new file mode 100644 index 0000000..7edbadb --- /dev/null +++ b/src/vindex/snapshot.rs @@ -0,0 +1,254 @@ +//! Snapshot persistence for the verifiable index. +//! +//! A snapshot is a point-in-time serialization of the full key → indices +//! map at a given tree size. Together with the WAL it bounds both WAL growth +//! and startup replay time: after a snapshot at tree size `T` is durably +//! written, the WAL is truncated and only needs to cover entries `>= T`. +//! +//! ## Format +//! +//! ```text +//! magic "VSNP" (4 bytes) +//! version u8 = 1 +//! tree_size u64 BE +//! key_count u64 BE +//! per key: +//! key 32 bytes +//! idx_count u32 BE +//! indices idx_count * u64 BE +//! crc32 u32 LE over all preceding bytes +//! ``` +//! +//! 
Snapshots are written to a temp file, fsynced, and renamed into place, so +//! a crash mid-write can never leave a torn snapshot at the final path. A +//! snapshot that fails its CRC check is treated as absent (the caller falls +//! back to rebuilding from log storage). + +use crate::error::{Error, Result}; +use crate::types::LogIndex; +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::io::{Read, Write}; +use std::path::{Path, PathBuf}; + +use super::IndexKey; + +const SNAPSHOT_MAGIC: &[u8; 4] = b"VSNP"; +const SNAPSHOT_VERSION: u8 = 1; + +/// The snapshot path for a given WAL path (`<wal_path>.snapshot`). +pub fn snapshot_path(wal_path: &Path) -> PathBuf { + let mut os = wal_path.as_os_str().to_os_string(); + os.push(".snapshot"); + PathBuf::from(os) +} + +/// Serialize and durably write a snapshot (temp file + fsync + rename). +pub fn write_snapshot( + path: &Path, + tree_size: u64, + index: &HashMap>, +) -> Result<()> { + let mut buf = Vec::with_capacity(17 + index.len() * 48); + buf.extend_from_slice(SNAPSHOT_MAGIC); + buf.push(SNAPSHOT_VERSION); + buf.extend_from_slice(&tree_size.to_be_bytes()); + buf.extend_from_slice(&(index.len() as u64).to_be_bytes()); + + for (key, indices) in index { + buf.extend_from_slice(key); + buf.extend_from_slice(&(indices.len() as u32).to_be_bytes()); + for idx in indices { + buf.extend_from_slice(&idx.value().to_be_bytes()); + } + } + + let crc = crc32fast::hash(&buf); + buf.extend_from_slice(&crc.to_le_bytes()); + + let tmp_path = { + let mut os = path.as_os_str().to_os_string(); + os.push(".tmp"); + PathBuf::from(os) + }; + + { + let mut tmp = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&tmp_path) + .map_err(|e| Error::Internal(format!("failed to create snapshot temp file: {}", e)))?; + tmp.write_all(&buf) + .map_err(|e| Error::Internal(format!("failed to write snapshot: {}", e)))?; + tmp.sync_all() + .map_err(|e| Error::Internal(format!("failed to sync snapshot: {}", e)))?; + }
+ + std::fs::rename(&tmp_path, path) + .map_err(|e| Error::Internal(format!("failed to rename snapshot into place: {}", e)))?; + + // Fsync the parent directory so the rename itself is durable. + if let Some(parent) = path.parent() { + let dir = if parent.as_os_str().is_empty() { + Path::new(".") + } else { + parent + }; + if let Ok(dir_file) = File::open(dir) { + let _ = dir_file.sync_all(); + } + } + + Ok(()) +} + +/// Read a snapshot. Returns `Ok(None)` if the file does not exist or fails +/// validation (magic, version, structure, CRC) — a bad snapshot is treated +/// as absent so the caller can fall back to rebuilding. +pub fn read_snapshot(path: &Path) -> Option<(u64, HashMap>)> { + if !path.exists() { + return None; + } + + let mut data = Vec::new(); + match File::open(path).and_then(|mut f| f.read_to_end(&mut data)) { + Ok(_) => {} + Err(e) => { + tracing::warn!("Failed to read vindex snapshot {}: {}", path.display(), e); + return None; + } + } + + parse_snapshot(&data).map_err(|e| { + tracing::warn!( + "Ignoring invalid vindex snapshot {}: {}", + path.display(), + e + ); + }) + .ok() +} + +fn parse_snapshot(data: &[u8]) -> Result<(u64, HashMap>)> { + // magic(4) + version(1) + tree_size(8) + key_count(8) + crc(4) + if data.len() < 25 { + return Err(Error::Internal("snapshot too short".into())); + } + + let (body, crc_bytes) = data.split_at(data.len() - 4); + let stored_crc = u32::from_le_bytes(crc_bytes.try_into().unwrap()); + if crc32fast::hash(body) != stored_crc { + return Err(Error::Internal("snapshot checksum mismatch".into())); + } + + if &body[0..4] != SNAPSHOT_MAGIC { + return Err(Error::Internal("bad snapshot magic".into())); + } + if body[4] != SNAPSHOT_VERSION { + return Err(Error::Internal(format!( + "unsupported snapshot version {}", + body[4] + ))); + } + + let tree_size = u64::from_be_bytes(body[5..13].try_into().unwrap()); + let key_count = u64::from_be_bytes(body[13..21].try_into().unwrap()) as usize; + + let mut pos = 21usize; + let mut 
index = HashMap::with_capacity(key_count); + for _ in 0..key_count { + if body.len() < pos + 36 { + return Err(Error::Internal("snapshot truncated in key record".into())); + } + let key: IndexKey = body[pos..pos + 32].try_into().unwrap(); + let idx_count = u32::from_be_bytes(body[pos + 32..pos + 36].try_into().unwrap()) as usize; + pos += 36; + + if body.len() < pos + idx_count * 8 { + return Err(Error::Internal("snapshot truncated in index list".into())); + } + let mut indices = Vec::with_capacity(idx_count); + for i in 0..idx_count { + let v = u64::from_be_bytes(body[pos + i * 8..pos + i * 8 + 8].try_into().unwrap()); + indices.push(LogIndex::new(v)); + } + pos += idx_count * 8; + index.insert(key, indices); + } + + if pos != body.len() { + return Err(Error::Internal("snapshot has trailing bytes".into())); + } + + Ok((tree_size, index)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_index() -> HashMap> { + let mut index = HashMap::new(); + index.insert([1u8; 32], vec![LogIndex::new(0), LogIndex::new(5)]); + index.insert([2u8; 32], vec![LogIndex::new(3)]); + index.insert([3u8; 32], vec![]); + index + } + + #[test] + fn test_snapshot_roundtrip() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.wal.snapshot"); + + let index = sample_index(); + write_snapshot(&path, 6, &index).unwrap(); + + let (tree_size, loaded) = read_snapshot(&path).unwrap(); + assert_eq!(tree_size, 6); + assert_eq!(loaded, index); + } + + #[test] + fn test_missing_snapshot_is_none() { + let dir = tempfile::tempdir().unwrap(); + assert!(read_snapshot(&dir.path().join("nope.snapshot")).is_none()); + } + + #[test] + fn test_corrupt_snapshot_is_ignored() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.wal.snapshot"); + + write_snapshot(&path, 6, &sample_index()).unwrap(); + + // Flip a byte in the middle. 
+ let mut data = std::fs::read(&path).unwrap(); + let mid = data.len() / 2; + data[mid] ^= 0xFF; + std::fs::write(&path, &data).unwrap(); + + assert!(read_snapshot(&path).is_none()); + } + + #[test] + fn test_truncated_snapshot_is_ignored() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("test.wal.snapshot"); + + write_snapshot(&path, 6, &sample_index()).unwrap(); + + let data = std::fs::read(&path).unwrap(); + std::fs::write(&path, &data[..data.len() - 10]).unwrap(); + + assert!(read_snapshot(&path).is_none()); + } + + #[test] + fn test_snapshot_path_suffix() { + assert_eq!( + snapshot_path(Path::new("/data/vindex.wal")), + PathBuf::from("/data/vindex.wal.snapshot") + ); + } +} diff --git a/src/vindex/wal.rs b/src/vindex/wal.rs index e189315..4e3267a 100644 --- a/src/vindex/wal.rs +++ b/src/vindex/wal.rs @@ -1,37 +1,51 @@ //! Write Ahead Log (WAL) for verifiable index persistence. //! -//! The WAL supports two formats: -//! -//! ## Text format (legacy): +//! ## Binary format v3 (current, checksummed): //! ```text -//! ... +//! [u8 version=3][u64 index][u8 key_count][32*key_count bytes of keys][u32 crc32 LE] //! ``` -//! Cost: ~336 bytes for entry with 5 keys +//! The CRC32 covers everything from the version byte through the last key +//! byte, so bit rot and torn writes are detected instead of being replayed +//! as wrong keys. //! -//! ## Binary format (v2): +//! ## Binary format v2 (legacy, read-only): //! ```text //! [u8 version=2][u64 index][u8 key_count][32*key_count bytes of keys] //! ``` -//! Cost: ~169 bytes for entry with 5 keys (50% savings) -//! -//! The reader auto-detects format by checking the first byte: -//! - ASCII digit (0-9) → text format -//! - 0x02 → binary format v2 //! -//! On startup, the WAL is validated and truncated to match the expected -//! tree size from the database. This prevents duplicate entries after a crash. +//! On startup, the WAL is validated and truncated: +//! 
- Entries with `index >= expected_tree_size` (from the database) are +//! truncated — the WAL ran ahead of the database before a crash. +//! - A torn or corrupted tail (crash mid-write, bit rot) is truncated at the +//! last fully-valid entry instead of failing startup. use crate::error::{Error, Result}; use crate::types::LogIndex; use std::fs::{File, OpenOptions}; -use std::io::{BufReader, BufWriter, Read, Write}; +use std::io::{BufReader, Read, Write}; use std::path::Path; use super::IndexKey; -/// WAL format version (binary only). +/// Legacy binary WAL format version (no checksum, read-only support). const WAL_VERSION_BINARY: u8 = 2; +/// Checksummed binary WAL format version (current write format). +const WAL_VERSION_CRC: u8 = 3; + +/// Serialize a single WAL entry in v3 (checksummed) format. +fn encode_entry(buf: &mut Vec, idx: LogIndex, keys: &[IndexKey]) { + let start = buf.len(); + buf.push(WAL_VERSION_CRC); + buf.extend_from_slice(&idx.value().to_be_bytes()); + buf.push(keys.len() as u8); + for key in keys { + buf.extend_from_slice(key); + } + let crc = crc32fast::hash(&buf[start..]); + buf.extend_from_slice(&crc.to_le_bytes()); +} + /// Binary WAL writer (now the default and only format). /// /// Format per entry: [version=0x02][u64 index][u8 key_count][32*key_count bytes] @@ -51,19 +65,13 @@ pub type WalWriter = BinaryWalWriter; /// Expected improvement: 5-10x throughput over unbatched text format pub type BatchedWalWriter = BatchedBinaryWalWriter; -/// Binary WAL writer for efficient storage. +/// Binary WAL writer for efficient storage (v3 checksummed format). 
/// -/// Format per entry: -/// ```text -/// [u8 version=2][u64 index][u8 key_count][32*key_count bytes] -/// ``` -/// -/// Benefits over text format: -/// - 50% space savings (169 bytes vs 336 bytes for 5 keys) -/// - 20-30% faster I/O (no hex encoding/decoding) -/// - Simpler parsing (no string allocation) +/// Each entry is serialized to a buffer and written with a single +/// `write_all` call, so a failed in-process write cannot leave a partial +/// entry interleaved with later entries. pub struct BinaryWalWriter { - writer: BufWriter, + file: File, } impl BinaryWalWriter { @@ -75,14 +83,10 @@ impl BinaryWalWriter { .open(path.as_ref()) .map_err(|e| Error::Internal(format!("failed to open WAL: {}", e)))?; - Ok(Self { - writer: BufWriter::new(file), - }) + Ok(Self { file }) } /// Append an entry to the WAL in binary format. - /// - /// Format: [u8 version][u64 index][u8 key_count][keys...] pub fn append(&mut self, idx: LogIndex, keys: &[IndexKey]) -> Result<()> { if keys.len() > u8::MAX as usize { return Err(Error::InvalidEntry(format!( @@ -93,45 +97,34 @@ impl BinaryWalWriter { ))); } - // Version marker - self.writer - .write_all(&[WAL_VERSION_BINARY]) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - - // Index (8 bytes, big-endian for readability in hex dumps) - self.writer - .write_all(&idx.value().to_be_bytes()) + let mut buf = Vec::with_capacity(14 + keys.len() * 32); + encode_entry(&mut buf, idx, keys); + self.file + .write_all(&buf) .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - // Key count (1 byte, limiting to 255 keys per entry) - let key_count = keys.len() as u8; - self.writer - .write_all(&[key_count]) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - - // Keys (32 bytes each) - for key in keys { - self.writer - .write_all(key) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - } - Ok(()) } /// Flush the WAL to disk with fsync for 
durability. pub fn flush(&mut self) -> Result<()> { - self.writer - .flush() - .map_err(|e| Error::Internal(format!("failed to flush WAL: {}", e)))?; - - self.writer - .get_ref() + self.file .sync_data() .map_err(|e| Error::Internal(format!("failed to sync WAL to disk: {}", e)))?; Ok(()) } + + /// Truncate the WAL to zero length (after a snapshot has been written). + pub fn truncate(&mut self) -> Result<()> { + self.file + .set_len(0) + .map_err(|e| Error::Internal(format!("failed to truncate WAL: {}", e)))?; + self.file + .sync_all() + .map_err(|e| Error::Internal(format!("failed to sync truncated WAL: {}", e)))?; + Ok(()) + } } /// Batched binary WAL writer combining batching with binary format. @@ -144,7 +137,7 @@ impl BinaryWalWriter { pub struct BatchedBinaryWalWriter { buffer: Vec<(LogIndex, Vec)>, batch_size: usize, - writer: BufWriter, + file: File, } impl BatchedBinaryWalWriter { @@ -159,7 +152,7 @@ impl BatchedBinaryWalWriter { Ok(Self { buffer: Vec::with_capacity(batch_size), batch_size, - writer: BufWriter::new(file), + file, }) } @@ -192,6 +185,12 @@ impl BatchedBinaryWalWriter { } /// Internal method to flush the current batch in binary format. + /// + /// The whole batch is serialized and written with a single `write_all` + /// and a single fsync. On error the buffer is retained, so a retry + /// rewrites the full batch; duplicated entries are harmless because + /// replay deduplicates (idx, key) pairs, and torn fragments are removed + /// by CRC-validated truncation on startup. 
fn flush_batch(&mut self) -> Result<()> { if self.buffer.is_empty() { return Ok(()); @@ -199,40 +198,17 @@ impl BatchedBinaryWalWriter { tracing::debug!("Flushing binary WAL batch of {} entries", self.buffer.len()); - // Write all buffered entries in binary format + let mut buf = Vec::with_capacity(self.buffer.iter().map(|(_, k)| 14 + k.len() * 32).sum()); for (idx, keys) in &self.buffer { - // Version marker - self.writer - .write_all(&[WAL_VERSION_BINARY]) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - - // Index - self.writer - .write_all(&idx.value().to_be_bytes()) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - - // Key count - let key_count = keys.len() as u8; - self.writer - .write_all(&[key_count]) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - - // Keys - for key in keys { - self.writer - .write_all(key) - .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; - } + encode_entry(&mut buf, *idx, keys); } - // Flush buffer to OS - self.writer - .flush() - .map_err(|e| Error::Internal(format!("failed to flush WAL: {}", e)))?; + self.file + .write_all(&buf) + .map_err(|e| Error::Internal(format!("failed to write to WAL: {}", e)))?; // Single fsync for entire batch - self.writer - .get_ref() + self.file .sync_data() .map_err(|e| Error::Internal(format!("failed to sync WAL to disk: {}", e)))?; @@ -246,13 +222,32 @@ impl BatchedBinaryWalWriter { pub fn buffered_count(&self) -> usize { self.buffer.len() } + + /// Truncate the WAL to zero length (after a snapshot has been written). + /// + /// The in-memory buffer must be empty (call [`flush`](Self::flush) first). 
+ pub fn truncate(&mut self) -> Result<()> { + if !self.buffer.is_empty() { + return Err(Error::Internal( + "cannot truncate WAL with buffered entries; flush first".into(), + )); + } + self.file + .set_len(0) + .map_err(|e| Error::Internal(format!("failed to truncate WAL: {}", e)))?; + self.file + .sync_all() + .map_err(|e| Error::Internal(format!("failed to sync truncated WAL: {}", e)))?; + Ok(()) + } } -/// WAL reader for replaying entries in binary format. -/// -/// Format per entry: [version=0x02][u64 index][u8 key_count][32*key_count bytes] +/// WAL reader for replaying entries in binary format (v2 legacy and v3 +/// checksummed). pub struct WalReader { reader: BufReader, + /// Byte offset just past the last successfully-parsed entry. + valid_pos: u64, } impl WalReader { @@ -263,12 +258,21 @@ impl WalReader { Ok(Self { reader: BufReader::new(file), + valid_pos: 0, }) } - /// Read the next entry from the WAL in binary format. + /// Byte offset just past the last entry successfully returned by + /// [`next_entry`](Self::next_entry). Used for truncating a corrupted tail. + pub fn valid_pos(&self) -> u64 { + self.valid_pos + } + + /// Read the next entry from the WAL. /// - /// Returns `Ok(None)` when EOF is reached. + /// Returns `Ok(None)` on clean EOF. A torn tail or corrupted entry + /// (bad version byte, short read, checksum mismatch) returns an error; + /// callers recovering from a crash should truncate at [`valid_pos`](Self::valid_pos). 
pub fn next_entry(&mut self) -> Result)>> { // Read version byte let mut version = [0u8; 1]; @@ -278,10 +282,10 @@ impl WalReader { Err(e) => return Err(Error::Internal(format!("failed to read from WAL: {}", e))), } - if version[0] != WAL_VERSION_BINARY { + if version[0] != WAL_VERSION_BINARY && version[0] != WAL_VERSION_CRC { return Err(Error::Internal(format!( - "invalid WAL version: expected 0x{:02x}, got 0x{:02x}", - WAL_VERSION_BINARY, version[0] + "invalid WAL version: expected 0x{:02x} or 0x{:02x}, got 0x{:02x}", + WAL_VERSION_BINARY, WAL_VERSION_CRC, version[0] ))); } @@ -309,18 +313,49 @@ impl WalReader { keys.push(key); } + let mut entry_size = 1 + 8 + 1 + key_count as u64 * 32; + + // v3: verify the trailing CRC32 over version..keys. + if version[0] == WAL_VERSION_CRC { + let mut crc_bytes = [0u8; 4]; + self.reader + .read_exact(&mut crc_bytes) + .map_err(|e| Error::Internal(format!("failed to read checksum from WAL: {}", e)))?; + let stored_crc = u32::from_le_bytes(crc_bytes); + + let mut hasher = crc32fast::Hasher::new(); + hasher.update(&version); + hasher.update(&idx_bytes); + hasher.update(&count_byte); + for key in &keys { + hasher.update(key); + } + if hasher.finalize() != stored_crc { + return Err(Error::Internal(format!( + "WAL checksum mismatch for entry {}", + idx + ))); + } + entry_size += 4; + } + + self.valid_pos += entry_size; Ok(Some((LogIndex::new(idx), keys))) } } /// Validate and truncate the binary WAL file to match the expected tree size. /// -/// This function reads the WAL to find all entries and truncates any entries -/// with index >= expected_tree_size. This is critical for crash recovery: if -/// the WAL was flushed but the database wasn't updated before a crash, we need -/// to truncate the WAL to match the database state to avoid duplicate entries. 
+/// Two kinds of tail are removed: +/// - Entries with `index >= expected_tree_size`: the WAL was flushed but the +/// database wasn't updated before a crash, so the WAL ran ahead. (The +/// worker always writes the WAL before marking entries integrated, so the +/// WAL can only ever be ahead of — never behind — the database.) +/// - A torn or corrupted tail (crash mid-write, checksum mismatch): the scan +/// stops at the last fully-valid entry and everything after is truncated. /// -/// Returns the actual tree size found in the WAL (may be less than expected if WAL is behind). +/// Returns the actual tree size found in the WAL (may be less than expected +/// if the WAL is behind; callers treat that as fatal). pub fn validate_and_truncate_wal(path: impl AsRef, expected_tree_size: u64) -> Result { let path = path.as_ref(); @@ -333,27 +368,42 @@ pub fn validate_and_truncate_wal(path: impl AsRef, expected_tree_size: u64 let mut last_valid_pos: u64 = 0; let mut max_valid_idx: Option = None; - // Calculate entry sizes as we read to find truncation point - while let Some((idx, keys)) = reader.next_entry()? 
{ - let idx_val = idx.value(); - - if expected_tree_size == 0 || idx_val < expected_tree_size { - // This entry is within bounds - // Binary format: 1 byte version + 8 bytes index + 1 byte count + 32*count bytes keys - let entry_size = 1 + 8 + 1 + (keys.len() as u64 * 32); - last_valid_pos += entry_size; - max_valid_idx = Some(match max_valid_idx { - Some(prev) => prev.max(idx_val), - None => idx_val, - }); - } else { - // Entry is beyond expected tree size - stop here - tracing::warn!( - "WAL entry {} >= expected tree size {}, truncating", - idx_val, - expected_tree_size - ); - break; + loop { + match reader.next_entry() { + Ok(Some((idx, _keys))) => { + let idx_val = idx.value(); + + if idx_val < expected_tree_size { + // This entry is within bounds + last_valid_pos = reader.valid_pos(); + max_valid_idx = Some(match max_valid_idx { + Some(prev) => prev.max(idx_val), + None => idx_val, + }); + } else { + // Entry is beyond expected tree size - stop here + tracing::warn!( + "WAL entry {} >= expected tree size {}, truncating", + idx_val, + expected_tree_size + ); + break; + } + } + Ok(None) => break, + Err(e) => { + // Torn write or bit rot in the tail. Truncate at the last + // valid entry instead of refusing to start; this is exactly + // the crash the WAL exists to survive. Anything the WAL + // loses here was, by write ordering, never marked integrated + // in the database (or the caller fails the behind-check). + tracing::warn!( + "WAL corrupted at byte {}: {}. 
Truncating corrupted tail.", + last_valid_pos, + e + ); + break; + } } } @@ -450,12 +500,9 @@ mod tests { // Check file size let file_size = std::fs::metadata(path).unwrap().len(); - // Binary format: (1 version + 8 index + 1 count + 5*32 keys) * 100 entries - // = (1 + 8 + 1 + 160) * 100 = 170 * 100 = 17,000 bytes - let expected_size = 170 * 100; - - // Text format would be: ~(5 + 5*65 + 1) * 100 = ~33,100 bytes - // Binary saves: ~48% space + // Binary v3 format: (1 version + 8 index + 1 count + 5*32 keys + 4 crc) * 100 entries + // = (1 + 8 + 1 + 160 + 4) * 100 = 174 * 100 = 17,400 bytes + let expected_size = 174 * 100; println!("Binary format file size: {} bytes", file_size); println!("Expected size: {} bytes", expected_size); @@ -559,6 +606,132 @@ mod tests { assert_eq!(actual_size, 3); } + #[test] + fn test_torn_tail_is_truncated() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Write 3 complete entries + { + let mut writer = WalWriter::open(path).unwrap(); + let key = [1u8; 32]; + for i in 0..3 { + writer.append(LogIndex::new(i), &[key]).unwrap(); + } + writer.flush().unwrap(); + } + + // Simulate a crash mid-write: append a partial entry (version + + // index but missing keys and checksum). + { + use std::io::Write; + let mut file = OpenOptions::new().append(true).open(path).unwrap(); + file.write_all(&[3u8]).unwrap(); + file.write_all(&3u64.to_be_bytes()).unwrap(); + file.write_all(&[5u8]).unwrap(); // claims 5 keys, none follow + file.sync_all().unwrap(); + } + + // Recovery must truncate the torn tail and keep the 3 good entries. 
+ let actual_size = validate_and_truncate_wal(path, 3).unwrap(); + assert_eq!(actual_size, 3); + + let mut reader = WalReader::open(path).unwrap(); + for i in 0..3 { + let (idx, _) = reader.next_entry().unwrap().unwrap(); + assert_eq!(idx.value(), i); + } + assert!(reader.next_entry().unwrap().is_none()); + } + + #[test] + fn test_corrupted_entry_is_truncated() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + { + let mut writer = WalWriter::open(path).unwrap(); + let key = [1u8; 32]; + for i in 0..3 { + writer.append(LogIndex::new(i), &[key]).unwrap(); + } + writer.flush().unwrap(); + } + + // Flip a bit in a key byte of the last entry (offset from end: 4 crc + // + 1 key byte). CRC validation must catch this. + { + use std::io::{Seek, SeekFrom, Write}; + let mut file = OpenOptions::new().read(true).write(true).open(path).unwrap(); + file.seek(SeekFrom::End(-5)).unwrap(); + file.write_all(&[0xFF]).unwrap(); + file.sync_all().unwrap(); + } + + // The corrupted third entry must be truncated; first two survive. + let actual_size = validate_and_truncate_wal(path, 3).unwrap(); + assert_eq!(actual_size, 2); + + let mut reader = WalReader::open(path).unwrap(); + for i in 0..2 { + let (idx, _) = reader.next_entry().unwrap().unwrap(); + assert_eq!(idx.value(), i); + } + assert!(reader.next_entry().unwrap().is_none()); + } + + #[test] + fn test_stale_wal_with_zero_expected_size_is_truncated() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // A WAL left over from a previous deployment... + { + let mut writer = WalWriter::open(path).unwrap(); + let key = [1u8; 32]; + for i in 0..5 { + writer.append(LogIndex::new(i), &[key]).unwrap(); + } + writer.flush().unwrap(); + } + + // ...must be fully truncated when the database says the log is empty, + // instead of replaying entries the log doesn't contain. 
+ let actual_size = validate_and_truncate_wal(path, 0).unwrap(); + assert_eq!(actual_size, 0); + assert_eq!(std::fs::metadata(path).unwrap().len(), 0); + } + + #[test] + fn test_legacy_v2_entries_are_readable() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Hand-write v2 (no checksum) entries as an old binary would have. + { + use std::io::Write; + let mut file = OpenOptions::new().append(true).open(path).unwrap(); + for i in 0..3u64 { + file.write_all(&[2u8]).unwrap(); + file.write_all(&i.to_be_bytes()).unwrap(); + file.write_all(&[1u8]).unwrap(); + file.write_all(&[7u8; 32]).unwrap(); + } + file.sync_all().unwrap(); + } + + let actual_size = validate_and_truncate_wal(path, 3).unwrap(); + assert_eq!(actual_size, 3); + + let mut reader = WalReader::open(path).unwrap(); + for i in 0..3 { + let (idx, keys) = reader.next_entry().unwrap().unwrap(); + assert_eq!(idx.value(), i); + assert_eq!(keys, vec![[7u8; 32]]); + } + assert!(reader.next_entry().unwrap().is_none()); + } + #[test] fn test_batched_wal_writer_basic() { let temp_file = NamedTempFile::new().unwrap(); diff --git a/src/witness/mod.rs b/src/witness/mod.rs index c7ee4e5..5197693 100644 --- a/src/witness/mod.rs +++ b/src/witness/mod.rs @@ -15,12 +15,11 @@ mod verifier; mod litewitness_test; pub use proof::{verify_consistency, ConsistencyProof}; -pub use state::WitnessStateStore; -pub use verifier::{CheckpointVerifier, LogConfig}; +pub use state::{UpdateOutcome, WitnessStateStore}; +pub use verifier::{parse_vkey, CheckpointVerifier, LogConfig}; use crate::checkpoint::{CheckpointSignature, CheckpointSigner, CosignedCheckpoint}; use crate::error::{Error, Result}; -use ed25519_dalek::Signer; use sea_orm::DatabaseConnection; use sigstore_types::Sha256Hash; use std::sync::Arc; @@ -89,6 +88,15 @@ impl Witness { let new_size = checkpoint.checkpoint.size.value(); let new_root = checkpoint.checkpoint.root_hash; + // Sizes are persisted as i64; reject values that would wrap 
negative + // and defeat the rollback protection. + if new_size > i64::MAX as u64 { + return Err(WitnessError::BadRequest(format!( + "checkpoint size {} exceeds supported maximum", + new_size + ))); + } + // 4. Validate old_size constraints if request.old_size > new_size { return Err(WitnessError::BadRequest(format!( @@ -143,21 +151,35 @@ impl Witness { )); } - // 8. Create cosignature - let body = checkpoint.checkpoint.to_body(); - let signature = self.signer.signing_key_ref().sign(body.as_bytes()); - let cosig = CheckpointSignature { - name: self.signer.name().clone(), - key_id: self.signer.key_id().clone(), - signature, - }; - - // 9. Update state - self.state_store - .update(origin, new_size, new_root, &request.checkpoint) + // 8. Persist state with a compare-and-swap against the state we + // verified the proof for. If a concurrent request advanced the state + // in the meantime, we must NOT cosign: the proof we verified may + // extend a different view of the tree than the one now persisted. + let outcome = self + .state_store + .update( + origin, + state.size, + &state.root_hash, + new_size, + new_root, + &request.checkpoint, + ) .await .map_err(|e| WitnessError::Internal(format!("failed to update state: {}", e)))?; + if let UpdateOutcome::Conflict { current_size } = outcome { + return Err(WitnessError::Conflict(current_size)); + } + + // 9. Create the cosignature/v1 only after the state is durably + // updated (c2sp.org/tlog-cosignature: timestamped, alg-0x04 key ID). + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(|e| WitnessError::Internal(format!("system clock error: {}", e)))? 
+ .as_secs(); + let cosig = self.signer.cosign_v1(&checkpoint.checkpoint, timestamp); + Ok(cosig) } diff --git a/src/witness/state.rs b/src/witness/state.rs index f86cab6..03b6016 100644 --- a/src/witness/state.rs +++ b/src/witness/state.rs @@ -87,16 +87,42 @@ impl WitnessStateStore { }) } - /// Update the witnessed state for a log. + /// Update the witnessed state for a log, compare-and-swap style. /// - /// This is called after successfully verifying a consistency proof. + /// This is called after successfully verifying a consistency proof. The + /// verification happened against a state read earlier (`expected_size`, + /// `expected_root`); this method re-checks under a row lock that the + /// persisted state still matches. Without this check, two concurrent + /// requests could each verify a proof against the same old state and the + /// witness would end up cosigning two conflicting roots at the same size + /// (a split view). + /// + /// Returns `UpdateOutcome::Conflict` if the persisted state no longer + /// matches the expected state; the caller must re-read and re-verify. pub async fn update( &self, origin: &str, + expected_size: u64, + expected_root: &Sha256Hash, size: u64, root_hash: Sha256Hash, checkpoint: &str, - ) -> Result<()> { + ) -> Result { + // Sizes are stored as i64; reject values that would wrap negative and + // corrupt the monotonicity comparison. 
+ if size > i64::MAX as u64 || expected_size > i64::MAX as u64 { + return Err(Error::InvalidEntry(format!( + "tree size {} exceeds supported maximum", + size + ))); + } + if size < expected_size { + return Err(Error::InvalidEntry(format!( + "size rollback not allowed: current size {} > new size {}", + expected_size, size + ))); + } + let txn = self.conn.begin().await?; // Lock and get current state @@ -107,12 +133,15 @@ impl WitnessStateStore { match current { Some(model) => { - // Prevent size rollback: new size must be >= current size - if (size as i64) < model.size { - return Err(Error::InvalidEntry(format!( - "size rollback not allowed: current size {} > new size {}", - model.size, size - ))); + // CAS check: the state must not have moved since the caller + // verified the consistency proof. + if model.size as u64 != expected_size + || model.root_hash != expected_root.as_bytes().to_vec() + { + txn.rollback().await?; + return Ok(UpdateOutcome::Conflict { + current_size: model.size as u64, + }); } // Update existing witness_state::Entity::update(witness_state::ActiveModel { @@ -126,6 +155,12 @@ impl WitnessStateStore { .await?; } None => { + // Callers always go through get_or_init first, so an absent + // row means the expected state is the initial empty state. + if expected_size != 0 { + txn.rollback().await?; + return Ok(UpdateOutcome::Conflict { current_size: 0 }); + } // Insert new witness_state::Entity::insert(witness_state::ActiveModel { origin: ActiveValue::Set(origin.to_string()), @@ -140,7 +175,7 @@ impl WitnessStateStore { } txn.commit().await?; - Ok(()) + Ok(UpdateOutcome::Updated) } /// List all witnessed logs. @@ -161,6 +196,18 @@ impl WitnessStateStore { } } +/// Outcome of a compare-and-swap state update. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum UpdateOutcome { + /// The state was updated. + Updated, + /// The persisted state changed since the caller read it. + Conflict { + /// The size currently persisted for this log. 
+ current_size: u64, + }, +} + /// RFC 6962 empty tree root hash. fn empty_root_hash() -> Sha256Hash { Sha256Hash::from_bytes([ diff --git a/src/witness/verifier.rs b/src/witness/verifier.rs index d7aa6da..e2799a9 100644 --- a/src/witness/verifier.rs +++ b/src/witness/verifier.rs @@ -9,6 +9,9 @@ use sha2::{Digest, Sha256}; /// Ed25519 algorithm identifier for note format. const ALG_ED25519: u8 = 0x01; +/// Ed25519 cosignature/v1 algorithm identifier (c2sp.org/tlog-cosignature). +const ALG_COSIGNATURE_V1: u8 = 0x04; + /// Configuration for a known log. #[derive(Debug, Clone)] pub struct LogConfig { @@ -32,7 +35,13 @@ impl LogConfig { /// Format: `name+hash_hex+base64(alg + pubkey)` /// Example: `example.com/log+deadbeef+AQIDBAUGBwg...` pub fn new(origin: String, vkey: &str) -> Result { - let (key_name, key_id, verifying_key) = parse_vkey(vkey)?; + let (key_name, alg, key_id, verifying_key) = parse_vkey(vkey)?; + if alg != ALG_ED25519 { + return Err(Error::Config(format!( + "log verification keys must be plain Ed25519 note keys (alg 0x01), got alg 0x{:02x}", + alg + ))); + } Ok(Self { origin, @@ -116,7 +125,12 @@ impl CheckpointVerifier { /// Parse a verification key string. /// /// Format: `name+hash_hex+base64(alg + pubkey)` -fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> { +/// +/// Accepts plain Ed25519 note keys (alg 0x01) and Ed25519 cosignature/v1 +/// keys (alg 0x04, used by C2SP witnesses). Returns the key name, the +/// algorithm byte, the key ID (computed with that algorithm byte), and the +/// public key. 
+pub fn parse_vkey(vkey: &str) -> Result<(String, u8, KeyId, VerifyingKey)> { let parts: Vec<&str> = vkey.trim().splitn(3, '+').collect(); if parts.len() != 3 { return Err(Error::Config(format!( @@ -153,10 +167,11 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> { } // Check algorithm byte - if key_data[0] != ALG_ED25519 { + let alg = key_data[0]; + if alg != ALG_ED25519 && alg != ALG_COSIGNATURE_V1 { return Err(Error::Config(format!( - "unsupported algorithm: expected {}, got {}", - ALG_ED25519, key_data[0] + "unsupported algorithm: expected 0x{:02x} or 0x{:02x}, got 0x{:02x}", + ALG_ED25519, ALG_COSIGNATURE_V1, alg ))); } @@ -168,8 +183,8 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> { let verifying_key = VerifyingKey::from_bytes(&pubkey_bytes) .map_err(|e| Error::Config(format!("invalid public key: {}", e)))?; - // Compute and verify key ID - let key_id = compute_key_id(&name, &verifying_key); + // Compute and verify key ID (with the algorithm byte from the key data) + let key_id = compute_key_id(&name, &verifying_key, alg); if key_id.as_u32() != expected_hash { return Err(Error::Config(format!( "key hash mismatch: expected {:08x}, computed {:08x}", @@ -178,15 +193,15 @@ fn parse_vkey(vkey: &str) -> Result<(String, KeyId, VerifyingKey)> { ))); } - Ok((name, key_id, verifying_key)) + Ok((name, alg, key_id, verifying_key)) } /// Compute the key ID for a verifying key per Go's note format. 
-fn compute_key_id(name: &str, key: &VerifyingKey) -> KeyId { +fn compute_key_id(name: &str, key: &VerifyingKey, alg: u8) -> KeyId { let mut hasher = Sha256::new(); hasher.update(name.as_bytes()); hasher.update(b"\n"); - hasher.update([ALG_ED25519]); + hasher.update([alg]); hasher.update(key.as_bytes()); let hash = hasher.finalize(); @@ -225,9 +240,36 @@ mod tests { ); // Parse and verify - let (parsed_name, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap(); + let (parsed_name, alg, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap(); assert_eq!(parsed_name, name); + assert_eq!(alg, ALG_ED25519); assert_eq!(parsed_id.as_u32(), signer.key_id().as_u32()); assert_eq!(parsed_key.as_bytes(), pubkey.as_bytes()); } + + #[test] + fn test_parse_cosignature_v1_vkey() { + use crate::checkpoint::signer::compute_key_id_with_alg; + + let signer = CheckpointSigner::generate("witness.example.com"); + let pubkey = signer.public_key(); + + // Build a cosignature/v1 vkey (alg 0x04) as a C2SP witness would + // distribute it. + let key_id = compute_key_id_with_alg("witness.example.com", &pubkey, ALG_COSIGNATURE_V1); + let mut key_data = Vec::with_capacity(33); + key_data.push(ALG_COSIGNATURE_V1); + key_data.extend_from_slice(pubkey.as_bytes()); + let vkey = format!( + "witness.example.com+{:08x}+{}", + key_id.as_u32(), + base64::engine::general_purpose::STANDARD.encode(&key_data) + ); + + let (parsed_name, alg, parsed_id, parsed_key) = parse_vkey(&vkey).unwrap(); + assert_eq!(parsed_name, "witness.example.com"); + assert_eq!(alg, ALG_COSIGNATURE_V1); + assert_eq!(parsed_id.as_u32(), key_id.as_u32()); + assert_eq!(parsed_key.as_bytes(), pubkey.as_bytes()); + } } diff --git a/src/worker.rs b/src/worker.rs index 218208e..783b20c 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -23,14 +23,51 @@ pub struct ExternalWitness { pub name: String, /// URL of the witness service (e.g., "http://localhost:8081"). pub url: String, + /// The witness's pinned verification key. 
Cosignatures returned by the + /// witness are verified against this key before they count toward the + /// publication quorum. + pub verifying_key: ed25519_dalek::VerifyingKey, + /// The expected key ID for plain note signatures (alg 0x01, legacy). + pub key_id: crate::checkpoint::signer::KeyId, + /// The expected key ID for cosignature/v1 signatures (alg 0x04, C2SP). + pub key_id_v1: crate::checkpoint::signer::KeyId, } impl ExternalWitness { - /// Create a new external witness configuration. - pub fn new(name: impl Into, url: impl Into) -> Self { + /// Create a new external witness configuration from a note-format + /// verification key (`name+hash+base64(alg+pubkey)`). + /// + /// Both plain Ed25519 vkeys (alg 0x01) and cosignature/v1 vkeys + /// (alg 0x04) are accepted — the public key material is the same; the + /// expected key IDs for both signature formats are derived from it. + pub fn new(name: impl Into, url: impl Into, vkey: &str) -> Result { + use crate::checkpoint::signer::{compute_key_id_with_alg, ALG_COSIGNATURE_V1}; + + let name = name.into(); + let (key_name, _alg, _key_id, verifying_key) = crate::witness::parse_vkey(vkey)?; + if key_name != name { + return Err(Error::Config(format!( + "witness key name '{}' does not match witness name '{}'", + key_name, name + ))); + } + Ok(Self { + key_id: compute_key_id_with_alg(&name, &verifying_key, 0x01), + key_id_v1: compute_key_id_with_alg(&name, &verifying_key, ALG_COSIGNATURE_V1), + name, + url: url.into(), + verifying_key, + }) + } + + /// Create a witness config directly from a signer's public key (tests). + pub fn from_signer(signer: &CheckpointSigner, url: impl Into) -> Self { Self { - name: name.into(), + name: signer.name().as_str().to_string(), url: url.into(), + verifying_key: signer.public_key(), + key_id: signer.key_id().clone(), + key_id_v1: signer.cosignature_v1_key_id(), } } } @@ -70,6 +107,9 @@ pub struct WorkerConfig { pub checkpoint_interval: Duration, /// Log origin string. 
pub origin: String, + /// Minimum number of external witness cosignatures required to publish + /// a checkpoint. `None` requires all configured external witnesses. + pub witness_quorum: Option, } impl Default for WorkerConfig { @@ -79,6 +119,7 @@ impl Default for WorkerConfig { integration_batch_size: 1024, checkpoint_interval: Duration::from_secs(1), origin: "example.com/log".to_string(), + witness_quorum: None, } } } @@ -189,17 +230,25 @@ async fn run_integration_cycle( // Write entry bundles write_entry_bundles(storage, &pending, state.integrated_size, result.new_size).await?; - // Index entries in vindex if enabled + // Index entries in vindex if enabled. This MUST succeed (including the + // WAL fsync) before entries are marked integrated: marking first would + // let the vindex silently diverge from the log, and a WAL that ends up + // behind the database is a fatal startup error. Failing here aborts the + // cycle; the retry re-fetches the same pending entries and index_entry + // skips anything already indexed. if let Some(vi) = vindex { for entry in &pending { - if let Err(e) = vi.index_entry(entry.index, entry.data.as_bytes()) { - tracing::warn!("Failed to index entry {}: {}", entry.index.value(), e); - } - } - // Flush vindex WAL periodically (if using WAL) - if let Err(e) = vi.flush() { - tracing::warn!("Failed to flush vindex WAL: {}", e); + vi.index_entry(entry.index, entry.data.as_bytes()) + .map_err(|e| { + Error::Internal(format!( + "failed to index entry {} in vindex: {}", + entry.index.value(), + e + )) + })?; } + vi.flush() + .map_err(|e| Error::Internal(format!("failed to flush vindex WAL: {}", e)))?; tracing::debug!( "Indexed {} entries in vindex, total keys: {}", pending.len(), @@ -229,6 +278,18 @@ async fn run_integration_cycle( result.root_hash.to_hex() ); + // Compact the vindex WAL once enough entries have accumulated. Runs on a + // blocking thread: it serializes the whole index to disk. 
+ if let Some(vi) = vindex { + let vi = Arc::clone(vi); + let compacted = tokio::task::spawn_blocking(move || vi.maybe_snapshot()) + .await + .map_err(|e| Error::Internal(format!("vindex snapshot task panicked: {}", e)))??; + if compacted { + tracing::info!("Vindex snapshot written; WAL compacted"); + } + } + Ok(()) } @@ -312,7 +373,15 @@ pub async fn run_checkpoint_worker( external_witnesses.len() ); - let origin = Origin::new(config.origin.clone()).expect("invalid log origin"); + // main() validates the origin before spawning; this is a defensive check + // so a bad origin can never panic inside the spawned task. + let origin = match Origin::new(config.origin.clone()) { + Ok(o) => o, + Err(e) => { + tracing::error!("Checkpoint worker cannot start: invalid log origin: {}", e); + return; + } + }; let client = reqwest::Client::new(); let mut witness_state = ExternalWitnessState::default(); let mut last_published = LastPublished::default(); @@ -326,7 +395,7 @@ pub async fn run_checkpoint_worker( } } _ = tokio::time::sleep(config.checkpoint_interval) => { - if let Err(e) = publish_checkpoint(&db, &storage, &signer, &witnesses, &external_witnesses, &client, &origin, &mut witness_state, &mut last_published).await { + if let Err(e) = publish_checkpoint(&db, &storage, &signer, &witnesses, &external_witnesses, config.witness_quorum, &client, &origin, &mut witness_state, &mut last_published).await { tracing::error!("Checkpoint publish error: {}", e); } } @@ -341,6 +410,7 @@ async fn publish_checkpoint( signer: &CheckpointSigner, witnesses: &[Arc], external_witnesses: &[ExternalWitness], + witness_quorum: Option, client: &reqwest::Client, origin: &Origin, witness_state: &mut ExternalWitnessState, @@ -382,9 +452,13 @@ async fn publish_checkpoint( // Create cosigned checkpoint with the log's signature let mut cosigned = CosignedCheckpoint::new(checkpoint, signer); - // Add in-process witness signatures + // Add in-process witness cosignatures (cosignature/v1, like real 
witnesses) + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(|e| Error::Internal(format!("system clock error: {}", e)))? + .as_secs(); for witness in witnesses { - cosigned.add_signature(witness); + cosigned.add_cosignature_v1(witness, now); } let mut external_signature_count = 0usize; @@ -431,10 +505,10 @@ async fn publish_checkpoint( { Ok(signature_line) => { if let Err(e) = - add_external_signature_line(&mut cosigned, &ext_witness.name, &signature_line) + add_external_signature_line(&mut cosigned, ext_witness, &signature_line) { tracing::warn!( - "Failed to parse signature from external witness {}: {}", + "Rejected signature from external witness {}: {}", ext_witness.name, e ); @@ -454,12 +528,18 @@ async fn publish_checkpoint( } } - if external_signature_count < external_witnesses.len() { + // Publish once a quorum of external witnesses has cosigned. Requiring + // every witness would let a single unavailable witness halt the log. + let required = witness_quorum + .unwrap_or(external_witnesses.len()) + .min(external_witnesses.len()); + if external_signature_count < required { tracing::warn!( - "Not publishing checkpoint size {}: got {}/{} external witness signatures", + "Not publishing checkpoint size {}: got {}/{} external witness signatures (quorum {})", new_size, external_signature_count, - external_witnesses.len() + external_witnesses.len(), + required ); return Ok(()); } @@ -486,17 +566,68 @@ async fn publish_checkpoint( fn add_external_signature_line( cosigned: &mut CosignedCheckpoint, - expected_name: &str, + witness: &ExternalWitness, line: &str, ) -> Result<()> { + use crate::checkpoint::signer::cosignature_v1_message; + use ed25519_dalek::Verifier; + let sig = CheckpointSignature::from_line(line)?; - if sig.name.as_str() != expected_name { + if sig.name.as_str() != witness.name { return Err(Error::Config(format!( "witness name mismatch: expected '{}', got '{}'", - expected_name, sig.name + witness.name, 
sig.name ))); } + // Verify the cosignature against the pinned key. Without this, a + // compromised witness could return garbage that still counts toward the + // publication quorum. C2SP cosignature/v1 signatures (with timestamp) + // sign the timestamped message and use the alg-0x04 key ID; legacy plain + // signatures sign the bare body with the alg-0x01 key ID. + let body = cosigned.checkpoint.to_body(); + match sig.timestamp { + Some(ts) => { + if sig.key_id != witness.key_id_v1 { + return Err(Error::Config(format!( + "witness cosignature/v1 key ID mismatch for '{}': expected {:08x}, got {:08x}", + witness.name, + witness.key_id_v1.as_u32(), + sig.key_id.as_u32() + ))); + } + let message = cosignature_v1_message(ts, &body); + witness + .verifying_key + .verify(message.as_bytes(), &sig.signature) + .map_err(|e| { + Error::Signing(format!( + "cosignature/v1 from witness '{}' failed verification: {}", + witness.name, e + )) + })?; + } + None => { + if sig.key_id != witness.key_id { + return Err(Error::Config(format!( + "witness key ID mismatch for '{}': expected {:08x}, got {:08x}", + witness.name, + witness.key_id.as_u32(), + sig.key_id.as_u32() + ))); + } + witness + .verifying_key + .verify(body.as_bytes(), &sig.signature) + .map_err(|e| { + Error::Signing(format!( + "cosignature from witness '{}' failed verification: {}", + witness.name, e + )) + })?; + } + } + if !cosigned.has_signature_from(&sig.name) { cosigned.signatures.push(sig); } @@ -601,13 +732,14 @@ mod tests { ]) } - /// Create a signature line in the note format for testing. + /// Create a plain (legacy) signature line in the note format for testing. 
fn make_signature_line(signer: &CheckpointSigner, body: &str) -> String { let signature = signer.signing_key_ref().sign(body.as_bytes()); let sig = CheckpointSignature { name: signer.name().clone(), key_id: signer.key_id().clone(), signature, + timestamp: None, }; sig.to_line() } @@ -642,7 +774,7 @@ mod tests { // Call the external witness let client = reqwest::Client::new(); - let ext_witness = ExternalWitness::new("test-witness", mock_server.uri()); + let ext_witness = ExternalWitness::from_signer(&witness_signer, mock_server.uri()); let mut witness_state = ExternalWitnessState::default(); let result = @@ -650,7 +782,92 @@ mod tests { .await; assert!(result.is_ok(), "Expected success, got: {:?}", result); - assert_eq!(result.unwrap(), sig_line); + let sig_line_returned = result.unwrap(); + assert_eq!(sig_line_returned, sig_line); + + // The signature must verify against the pinned key. + let mut cosigned = cosigned; + add_external_signature_line(&mut cosigned, &ext_witness, &sig_line_returned) + .expect("valid cosignature must be accepted"); + assert_eq!(cosigned.signature_count(), 2); + } + + #[test] + fn test_cosignature_v1_line_accepted() { + let witness_signer = test_signer("test-witness"); + let log_signer = test_signer("test.log"); + + let checkpoint = Checkpoint::new( + Origin::new("test.log".to_string()).unwrap(), + TreeSize::new(10), + empty_root_hash(), + ); + let mut cosigned = CosignedCheckpoint::new(checkpoint, &log_signer); + + // A spec-conformant witness returns a timestamped cosignature/v1 line + // (76-byte blob, alg-0x04 key ID). + let cosig = witness_signer.cosign_v1(&cosigned.checkpoint, 1679315147); + let line = cosig.to_line(); + + let ext_witness = ExternalWitness::from_signer(&witness_signer, "http://unused"); + add_external_signature_line(&mut cosigned, &ext_witness, &line) + .expect("valid cosignature/v1 must be accepted"); + assert_eq!(cosigned.signature_count(), 2); + + // The line must round-trip through the checkpoint text. 
+ let text = cosigned.to_text(); + let reparsed = CosignedCheckpoint::from_text(&text).unwrap(); + assert_eq!(reparsed.signature_count(), 2); + let ws = reparsed + .signatures + .iter() + .find(|s| s.name.as_str() == "test-witness") + .unwrap(); + assert_eq!(ws.timestamp, Some(1679315147)); + + // Tampering with the timestamp must break verification. + let mut tampered = cosig.clone(); + tampered.timestamp = Some(1679315148); + let mut cosigned2 = CosignedCheckpoint::new( + Checkpoint::new( + Origin::new("test.log".to_string()).unwrap(), + TreeSize::new(10), + empty_root_hash(), + ), + &log_signer, + ); + let result = add_external_signature_line(&mut cosigned2, &ext_witness, &tampered.to_line()); + assert!(result.is_err(), "Altered timestamp must fail verification"); + } + + #[tokio::test] + async fn test_garbage_witness_signature_rejected() { + let witness_signer = test_signer("test-witness"); + let other_signer = test_signer("test-witness"); // same name, different key + let log_signer = test_signer("test.log"); + + let checkpoint = Checkpoint::new( + Origin::new("test.log".to_string()).unwrap(), + TreeSize::new(10), + empty_root_hash(), + ); + let mut cosigned = CosignedCheckpoint::new(checkpoint, &log_signer); + + // A signature over the right body but from the WRONG key (e.g. a + // compromised witness) must be rejected by pinned-key verification. + let body = cosigned.checkpoint.to_body(); + let forged_line = make_signature_line(&other_signer, &body); + + let ext_witness = ExternalWitness::from_signer(&witness_signer, "http://unused"); + + let result = add_external_signature_line(&mut cosigned, &ext_witness, &forged_line); + assert!(result.is_err(), "Forged cosignature must be rejected"); + assert_eq!(cosigned.signature_count(), 1, "Only the log signature remains"); + + // A signature from the right key over the WRONG body must also fail. 
+ let wrong_body_line = make_signature_line(&witness_signer, "some other body\n"); + let result = add_external_signature_line(&mut cosigned, &ext_witness, &wrong_body_line); + assert!(result.is_err(), "Signature over wrong body must be rejected"); } #[tokio::test] @@ -681,7 +898,8 @@ mod tests { // Call the external witness let client = reqwest::Client::new(); - let ext_witness = ExternalWitness::new("test-witness", mock_server.uri()); + let witness_signer = test_signer("test-witness"); + let ext_witness = ExternalWitness::from_signer(&witness_signer, mock_server.uri()); let mut witness_state = ExternalWitnessState::default(); let result = @@ -735,8 +953,8 @@ mod tests { let mut witness_state = ExternalWitnessState::default(); let ext_witnesses = vec![ - ExternalWitness::new("witness1", mock_witness1.uri()), - ExternalWitness::new("witness2", mock_witness2.uri()), + ExternalWitness::from_signer(&witness1_signer, mock_witness1.uri()), + ExternalWitness::from_signer(&witness2_signer, mock_witness2.uri()), ]; // Simulate what publish_checkpoint does for external witnesses @@ -816,9 +1034,10 @@ mod tests { let client = reqwest::Client::new(); let mut witness_state = ExternalWitnessState::default(); + let witness2_signer = test_signer("witness2"); let ext_witnesses = vec![ - ExternalWitness::new("witness1", mock_witness1.uri()), - ExternalWitness::new("witness2", mock_witness2.uri()), + ExternalWitness::from_signer(&witness1_signer, mock_witness1.uri()), + ExternalWitness::from_signer(&witness2_signer, mock_witness2.uri()), ]; let mut success_count = 0; diff --git a/tests/witness_security_test.rs b/tests/witness_security_test.rs index 1c42923..145979f 100644 --- a/tests/witness_security_test.rs +++ b/tests/witness_security_test.rs @@ -76,7 +76,7 @@ fn test_proof_hash_count_below_limit() { #[cfg(test)] mod state_tests { use sea_orm::{Database, DatabaseConnection}; - use siglog::witness::WitnessStateStore; + use siglog::witness::{UpdateOutcome, WitnessStateStore}; use 
sigstore_types::Sha256Hash; use std::sync::Arc; @@ -90,6 +90,14 @@ mod state_tests { conn } + fn empty_root() -> Sha256Hash { + Sha256Hash::from_bytes([ + 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, + 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, + 0x78, 0x52, 0xb8, 0x55, + ]) + } + #[tokio::test] async fn test_size_rollback_prevention() { let conn = setup_test_db().await; @@ -100,14 +108,17 @@ mod state_tests { let hash2 = Sha256Hash::from_bytes([2u8; 32]); // Initialize with size 100 - let _ = store.get_or_init(origin).await.unwrap(); - store - .update(origin, 100, hash1, "checkpoint1") + let init = store.get_or_init(origin).await.unwrap(); + let outcome = store + .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1") .await .unwrap(); + assert_eq!(outcome, UpdateOutcome::Updated); // Try to rollback to size 50 (should fail) - let result = store.update(origin, 50, hash2, "checkpoint2").await; + let result = store + .update(origin, 100, &hash1, 50, hash2, "checkpoint2") + .await; assert!(result.is_err(), "Should prevent size rollback"); let err_msg = result.unwrap_err().to_string(); @@ -133,15 +144,18 @@ mod state_tests { let hash2 = Sha256Hash::from_bytes([2u8; 32]); // Initialize with size 100 - let _ = store.get_or_init(origin).await.unwrap(); + let init = store.get_or_init(origin).await.unwrap(); store - .update(origin, 100, hash1, "checkpoint1") + .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1") .await .unwrap(); // Increase to size 200 (should succeed) - let result = store.update(origin, 200, hash2, "checkpoint2").await; - assert!(result.is_ok(), "Should allow size increase"); + let outcome = store + .update(origin, 100, &hash1, 200, hash2, "checkpoint2") + .await + .unwrap(); + assert_eq!(outcome, UpdateOutcome::Updated, "Should allow size increase"); // Verify the state has changed let state = store.get(origin).await.unwrap().unwrap(); @@ 
-158,14 +172,85 @@ mod state_tests { let hash1 = Sha256Hash::from_bytes([1u8; 32]); // Initialize with size 100 - let _ = store.get_or_init(origin).await.unwrap(); + let init = store.get_or_init(origin).await.unwrap(); store - .update(origin, 100, hash1, "checkpoint1") + .update(origin, init.size, &init.root_hash, 100, hash1, "checkpoint1") + .await + .unwrap(); + + // Update with same size and same root (idempotent republish) + let outcome = store + .update(origin, 100, &hash1, 100, hash1, "checkpoint1") + .await + .unwrap(); + assert_eq!(outcome, UpdateOutcome::Updated, "Should allow same size update"); + } + + #[tokio::test] + async fn test_cas_conflict_on_stale_expected_state() { + let conn = setup_test_db().await; + let store = WitnessStateStore::new(Arc::new(conn)); + + let origin = "test-log"; + let hash1 = Sha256Hash::from_bytes([1u8; 32]); + let hash2 = Sha256Hash::from_bytes([2u8; 32]); + let hash3 = Sha256Hash::from_bytes([3u8; 32]); + + // Two "concurrent" requests both read the initial state. + let init = store.get_or_init(origin).await.unwrap(); + + // Request A wins the race. + let outcome_a = store + .update(origin, init.size, &init.root_hash, 10, hash1, "cp-a") + .await + .unwrap(); + assert_eq!(outcome_a, UpdateOutcome::Updated); + + // Request B tries to persist a *different* root at the same size, + // using the stale expected state. Must be rejected, otherwise the + // witness cosigns two conflicting roots (split view). + let outcome_b = store + .update(origin, init.size, &init.root_hash, 10, hash2, "cp-b") + .await + .unwrap(); + assert_eq!( + outcome_b, + UpdateOutcome::Conflict { current_size: 10 }, + "Stale CAS must conflict, not overwrite" + ); + + // Same-size different-root with a *matching* expected size but stale + // root must also conflict. 
+ let outcome_c = store + .update(origin, 10, &hash2, 10, hash3, "cp-c") .await .unwrap(); + assert_eq!(outcome_c, UpdateOutcome::Conflict { current_size: 10 }); - // Update with same size (should succeed - allows idempotent updates) - let result = store.update(origin, 100, hash1, "checkpoint1").await; - assert!(result.is_ok(), "Should allow same size update"); + let state = store.get(origin).await.unwrap().unwrap(); + assert_eq!(state.root_hash, hash1, "Winner's root must be preserved"); + } + + #[tokio::test] + async fn test_oversized_tree_size_rejected() { + let conn = setup_test_db().await; + let store = WitnessStateStore::new(Arc::new(conn)); + + let origin = "test-log"; + let init = store.get_or_init(origin).await.unwrap(); + + // Sizes above i64::MAX would wrap negative in the database column and + // defeat rollback protection. + let result = store + .update( + origin, + init.size, + &init.root_hash, + u64::MAX, + empty_root(), + "cp", + ) + .await; + assert!(result.is_err(), "Sizes above i64::MAX must be rejected"); } } diff --git a/witness-conformance/.gitignore b/witness-conformance/.gitignore index 1d6f88f..1611a6c 100644 --- a/witness-conformance/.gitignore +++ b/witness-conformance/.gitignore @@ -45,3 +45,4 @@ conformance-report.json *.swo *~ .DS_Store +.test_log_info From 738db2a29e6eeca15f143b5fed581397e65ab005 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Fri, 3 Jul 2026 10:14:39 +0200 Subject: [PATCH 2/3] Add fly.io deployment, benchmark harness, and remaining-work doc - fly.toml: single machine + volume (SQLite state, vindex WAL/snapshot) with tiles on S3, vindex enabled, quorum-ready config - scripts/bench.py: end-to-end soak test measuring write throughput and latency, integration and checkpoint lag, read-path latencies, and per-entry vindex correctness; caught the rate-limiter replenish bug on its first run - docs/REMAINING_WORK.md: vindex root anchoring design, ingest validation/dedup plan, CEP text fixes, scale limits, benchmark 
results from the fly.io staging deployment Co-Authored-By: Claude Fable 5 --- docs/REMAINING_WORK.md | 184 +++++++++++++++++++++++ fly.toml | 62 ++++++++ scripts/bench.py | 334 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 580 insertions(+) create mode 100644 docs/REMAINING_WORK.md create mode 100644 fly.toml create mode 100644 scripts/bench.py diff --git a/docs/REMAINING_WORK.md b/docs/REMAINING_WORK.md new file mode 100644 index 0000000..c9515bd --- /dev/null +++ b/docs/REMAINING_WORK.md @@ -0,0 +1,184 @@ +# Remaining work for a production conda-forge deployment + +Status as of July 2026. The items below are what stands between the current +codebase and a production transparency log for conda-forge. They are ordered +by how much they block the CEP, not by implementation effort. + +Everything here assumes the completed groundwork: C2SP cosignature/v1 +witness signatures (verified against pinned keys), quorum-based checkpoint +publishing (`WITNESS_QUORUM`), witness split-view CAS protection, vindex +snapshots + WAL compaction with auto-rebuild from log storage, and the +production hardening pass (graceful shutdown, worker supervision, atomic +filesystem writes, rate limiting). + +--- + +## 1. Anchor the vindex root in the log (protocol decision needed) + +**Problem.** The verifiable index (key → log indices, with prefix-tree +proofs) is served by the log operator, but nothing commits its root hash to +the witnessed checkpoint. A malicious operator can serve a correct Merkle +tree while lying in the vindex — e.g. omitting indices for a filename so a +client never sees that a patched entry exists. Proofs from the vindex +currently verify against a root hash that the *same server* provides in the +same response: circular trust. 
+ +**Proposed design.** Periodically append a special log entry committing to +the vindex state: + +```json +{"type":"vindex-root","tree_size":123456,"root":"<hex-encoded vindex root hash>"} +``` + +- `tree_size` is the log size the vindex covered when the root was computed; + the anchor entry itself lands at some later index, which is fine — it + describes the index state *at* `tree_size`. +- Publish one anchor per checkpoint interval **iff** the vindex root changed. +- Clients verify: (1) inclusion proof of the anchor entry against the + witnessed checkpoint, (2) the vindex lookup proof against the anchored + root. Both proofs together make lookups operator-independent. +- Monitors additionally recompute the vindex from the entries themselves and + alert if an anchored root diverges — this is what makes a *wrong* (not + just stale) anchor detectable. + +**Also required: verifiable exclusion proofs.** `prefix_tree.rs::lookup_rec` +currently returns `found=false` without including the conflicting leaf or +sibling subtree, so a client cannot recompute the root from a negative +answer. Non-membership must include the mismatching leaf (or the divergence +node) so the proof reconstructs the anchored root. Until then, "key not in +index" is an unverifiable assertion. + +**Open question for the CEP:** anchor as a log entry (above) vs. a checkpoint +extension line. Extension lines are lighter but per the cosignature spec, +witnesses make **no semantic statement** about extension lines — and they +bloat every checkpoint. The log-entry approach gets inclusion proofs for +free and keeps checkpoints minimal. Recommendation: log entry. + +## 2. Ingest validation and dedup (`POST /add`) + +**Problem.** The log currently accepts any bytes with a valid API key.
For +conda-forge this means: no schema enforcement, no canonicalization check +(clients could log non-normalized JSON that then never matches a verifier's +recomputed hash), and no dedup — a bulk repodata-patch run that re-submits +100k unchanged entries would append 100k duplicate leaves. + +**Plan.** +- Validate at ingest when `ENTRY_SCHEMA=conda-v1` is configured: + - parse as JSON, check required fields + (`subdir`, `filename`, `sha256`, `size`, `build`, `build_number`, + `version`, `name`, `depends`), + - re-serialize canonically and require byte-equality with the submission + (reject non-canonical bodies with a 422 and the canonical form in the + error, so publishers can fix their pipeline), + - allow a `type: "index"` variant for freshness entries. +- Dedup by leaf hash: keep a `leaf_hash → index` table (or reuse the + monitor's content-index machinery) and return the **existing** index with + `200` instead of appending. This makes `POST /add` idempotent, which also + resolves the ambiguous-commit/retry duplication noted in the review. + Cost: one indexed lookup per add; the table grows with the log (32 bytes + + index per entry — ~80 MB per 2M entries in SQLite/Postgres, acceptable). +- Write the dedup decision into the CEP: "a channel MUST NOT re-log an entry + whose normalized bytes are unchanged; logs SHOULD enforce this." + +## 3. CEP text fixes (spec gaps found during review) + +1. **Vindex key must include the subdir.** The CEP says + `index_key = SHA256(filename)`, but monitors enforce uniqueness per + `(subdir, filename)` and identical filenames legitimately exist across + subdirs. Use `SHA256(subdir + "/" + filename)`. +2. **Pin normalization to RFC 8785 (JCS)** instead of ad-hoc rules; state + explicitly that floats are forbidden and whether `depends` arrays are + sorted or preserved (recommendation: sorted — otherwise a patch that only + reorders dependencies produces a spurious "new" entry). +3.
**Inclusion proofs embedded in `repodata_shard_index.json` don't verify + against a *newer* checkpoint.** A proof computed at `tree_size = T` does + not verify against `root(T')` for `T' > T`, and tlog-tiles only serves + the latest checkpoint. Fix: embed the size-`T` checkpoint alongside the + proof, and have clients verify consistency `T → T'` from tiles. Keeps + offline verification intact. +4. **Add a `channel` field** to the entry schema (or state that one log + origin serves exactly one channel). Without it, entries are ambiguous if + the log ever serves more than conda-forge. +5. **Witness freshness wording.** Checkpoints don't contain timestamps; + freshness comes from cosignature/v1 timestamps. Define client freshness + as "max cosignature timestamp within quorum ≥ now − max_skew". +6. **State the freshness window trade-off**: a mirror can serve + up-to-`max_age`-old data undetected. Recommend `max_age` of 24h rather + than 7 days — re-logging one small index entry per subdir per day is + nearly free. +7. **Operator requirements section**: the log MUST never sign two different + trees at the same size ("never fork"). Restoring from a backup that lost + acknowledged entries forks the tree and permanently kills the log + (witnesses refuse forever). Mandate: single writer, synchronous + DB+object-store durability before signing, tested restore procedure, key + ceremony / KMS for the signing key. + +## 4. 
Scale limits to address before conda-forge full history + +| Component | Current limit | Wall | +|---|---|---| +| Vindex key map | in-RAM `HashMap`, `VINDEX_MAX_KEYS` (10M default) | ~2M conda-forge artifacts fit (~several hundred MB); beyond that needs a disk-backed index (sled/rocksdb) or shard-by-prefix | +| Vindex startup | snapshot load + WAL tail replay | fine now (snapshots bound it); snapshot write is O(index) — at 2M keys expect ~1–2 s pauses per 100k entries, tune `VINDEX_SNAPSHOT_INTERVAL` | +| Monitor content index | in-RAM, unbounded, O(pending) scan per entry → O(n²) per batch | needs the DB-backed lookup path to be the primary one, plus batch-size caps | +| Monitor `validate_new_entries` | fetches all new entries inline in one HTTP request | cap per-request validation window; validate asynchronously and cosign on the next request | +| SQLite | single-writer; `lock_exclusive` is a no-op, deferred transactions can fail with `SQLITE_BUSY_SNAPSHOT` under concurrency | use Postgres in production; if SQLite must stay, issue `BEGIN IMMEDIATE` for writer transactions | + +## 5. Tooling debt + +- **`conda-monitor verify` does not verify.** It prints "VERIFICATION + PASSED" without checking the vindex proof, the checkpoint signature, or an + inclusion proof. This is the tool the CEP points users at — it must do the + full client verification workflow (normalize → leaf hash → vindex proof + against anchored root → inclusion proof → checkpoint signature + witness + quorum) before anything ships. +- **Client library:** the verification workflow belongs in a Rust crate + consumable by rattler/pixi, verified at package-download time (not per + solve). Roll out `on_failure: warn` first. +- **litewitness interop:** the conformance suite passes against siglog's own + witness; an end-to-end run against a real litewitness instance (Go) is + still outstanding and is the definitive C2SP interop check. + +## 6. 
Benchmark results (fly.io staging, July 2026) + +`scripts/bench.py` against `conda-transparency-log.fly.dev` (shared-cpu-2x, +1 GB, ams; SQLite on volume; tiles on Tigris S3; vindex enabled; +`BATCH_MAX_AGE_MS=500`, `CHECKPOINT_INTERVAL=2`): + +- **Writes**: 2,000 entries at concurrency 48 → 78 req/s, p50 595 ms / + p99 686 ms, zero errors. `/add` latency ≈ `BATCH_MAX_AGE_MS` + RTT, since + the ack waits for the durable batch commit — tune the batch age to trade + latency for batch size. +- **Integration**: 77 entries/s in a short burst (2k entries), degrading to + ~36 entries/s under sustained load (10k entries at concurrency 96 → 279 s + drain; checkpoint follows ~1 s later). The integration loop writes tiles + **sequentially**; parallelizing the S3 PUTs per cycle is the obvious lever + (a 2M-entry bootstrap at 36/s is ~15 h — fine one-time, slow for bulk + re-patching). Sustained write pressure also pushes `/add` p50 to ~1.5 s at + concurrency 96 as requests queue behind batch commits. +- **Sustained-load verdict**: a 10k-entry run at concurrency 96 completed + with zero errors and 500/500 sampled lookups verified — correctness holds; + the limits are throughput, not integrity. +- **Reads**: checkpoint p50 236 ms, vindex lookup p50 215 ms, entry-bundle + tiles p50 142 ms (client in EU → ams, no CDN). +- **Correctness under load**: every sampled entry (300/300) was findable + through the vindex at the exact index assigned at write time, and the + final checkpoint covered all writes. + +The benchmark also caught a real bug on its first run: `tower_governor`'s +`per_second(n)` configures "one token per *n seconds*", not "n per second" — +the limiter was effectively 1 req/1000 s with a burst. Fixed by configuring +the replenish interval (`per_nanosecond(1e9 / rps)`); regression-tested. + +## 7. Deployment/ops (tracked, mostly mechanical) + +- Postgres for the production log (SQLite + volume is fine for staging). 
+- CDN in front of `/tile/*` and `/checkpoint` (immutable tiles cache + forever; checkpoint no-cache). The origin then only serves `/add` and + `/vindex/*`. +- Witness recruitment: 3–5 independent orgs (prefix.dev, Anaconda, + Quansight, QuantStack + existing C2SP witnesses once interop is proven), + quorum 3. +- Alerting: monitor violations → webhook/status page; log health: pending + count growth, checkpoint age, witness cosign failure rate. +- Bootstrap plan: mass-load existing repodata as epoch T₀; document that + tamper-evidence starts at T₀. diff --git a/fly.toml b/fly.toml new file mode 100644 index 0000000..c21a474 --- /dev/null +++ b/fly.toml @@ -0,0 +1,62 @@ +# Fly.io deployment for the siglog transparency log server. +# +# Storage layout: +# - SQLite sequencer state + vindex WAL/snapshot on a Fly volume (/data) +# - Tiles + checkpoint on S3-compatible object storage (Tigris), via the +# S3_* secrets +# +# Required secrets (fly secrets set ...): +# LOG_PRIVATE_KEY Ed25519 note-format signing key +# API_KEY Bearer token for POST /add +# S3_BUCKET, S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY + +app = "conda-transparency-log" +primary_region = "ams" + +[build] +dockerfile = "docker/Dockerfile.server" + +[env] +LISTEN_ADDR = "0.0.0.0:8080" +LOG_ORIGIN = "conda.prefix.dev" +DATABASE_URL = "sqlite:/data/siglog.db?mode=rwc" +STORAGE_BACKEND = "s3" +S3_REGION = "auto" +CHECKPOINT_INTERVAL = "2" +BATCH_MAX_SIZE = "256" +BATCH_MAX_AGE_MS = "500" +VINDEX_ENABLED = "true" +VINDEX_KEY_FIELD = "name" +VINDEX_WAL_PATH = "/data/vindex.wal" +VINDEX_SNAPSHOT_INTERVAL = "100000" +# Benchmark-friendly limits; drop for public exposure. 
+RATE_LIMIT_PER_SECOND = "1000" +RATE_LIMIT_BURST_SIZE = "2000" +RUST_LOG = "info,siglog=info" + +[mounts] +source = "siglog_data" +destination = "/data" + +[http_service] +internal_port = 8080 +force_https = true +auto_stop_machines = true +auto_start_machines = true +min_machines_running = 1 + +[http_service.concurrency] +type = "connections" +hard_limit = 500 +soft_limit = 400 + +[[http_service.checks]] +interval = "30s" +timeout = "5s" +grace_period = "15s" +method = "GET" +path = "/health" + +[[vm]] +size = "shared-cpu-2x" +memory = "1gb" diff --git a/scripts/bench.py b/scripts/bench.py new file mode 100644 index 0000000..0245bbf --- /dev/null +++ b/scripts/bench.py @@ -0,0 +1,334 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = ["httpx[http2]"] +# /// +"""Benchmark and soak-test a siglog transparency log deployment. + +Measures whether the log "holds up" end to end: + + 1. Write path — concurrent POST /add: throughput, latency percentiles, + error/rate-limit counts. + 2. Integration — time until all written entries are integrated into the + Merkle tree (pending_count back to 0) and the checkpoint + advances to cover them. + 3. Read path — GET /checkpoint, tile fetches, and vindex lookups: + latency percentiles. + 4. Correctness — every sampled written entry must be findable through the + vindex at the index the server assigned at write time, + and the final checkpoint must cover all writes. 
+ +Usage: + uv run scripts/bench.py --url https://conda-transparency-log.fly.dev \ + --api-key-file /path/to/key --entries 2000 --concurrency 32 +""" + +import argparse +import asyncio +import json +import random +import statistics +import string +import sys +import time + +import httpx + + +def pct(values: list[float], p: float) -> float: + if not values: + return float("nan") + values = sorted(values) + k = min(len(values) - 1, max(0, round(p / 100 * (len(values) - 1)))) + return values[k] + + +def fmt_ms(seconds: float) -> str: + return f"{seconds * 1000:.1f}ms" + + +class Stats: + def __init__(self) -> None: + self.latencies: list[float] = [] + self.ok = 0 + self.rate_limited = 0 + self.errors: dict[str, int] = {} + + def error(self, kind: str) -> None: + self.errors[kind] = self.errors.get(kind, 0) + 1 + + def summary(self, name: str, duration: float | None = None) -> str: + lines = [f" requests ok: {self.ok}"] + if duration and self.ok: + lines.append(f" throughput: {self.ok / duration:.1f} req/s") + if self.latencies: + lines.append( + " latency p50/p95/p99/max: " + f"{fmt_ms(pct(self.latencies, 50))} / {fmt_ms(pct(self.latencies, 95))} / " + f"{fmt_ms(pct(self.latencies, 99))} / {fmt_ms(max(self.latencies))}" + ) + if self.rate_limited: + lines.append(f" rate-limited (429): {self.rate_limited}") + for kind, count in sorted(self.errors.items()): + lines.append(f" ERROR {kind}: {count}") + return f"{name}\n" + "\n".join(lines) + + +async def write_phase( + client: httpx.AsyncClient, + url: str, + api_key: str, + n_entries: int, + concurrency: int, + run_id: str, +) -> tuple[Stats, dict[str, int], float]: + """POST /add for n_entries; returns stats and name → assigned index.""" + stats = Stats() + assigned: dict[str, int] = {} + sem = asyncio.Semaphore(concurrency) + headers = {"Authorization": f"Bearer {api_key}"} + + async def submit(i: int) -> None: + name = f"bench-{run_id}-pkg-{i:06d}" + body = json.dumps( + { + "name": name, + "version": "1.0.0", + 
"build": "py311_0", + "build_number": 0, + "subdir": "linux-64", + "filename": f"{name}-1.0.0-py311_0.conda", + "sha256": "".join(random.choices("0123456789abcdef", k=64)), + "size": random.randint(10_000, 90_000_000), + "depends": ["python >=3.11,<3.12.0a0"], + }, + separators=(",", ":"), + sort_keys=True, + ) + async with sem: + # Retry on 429 with backoff so rate limiting degrades throughput + # instead of failing the run. + for attempt in range(6): + start = time.monotonic() + try: + resp = await client.post(f"{url}/add", content=body, headers=headers) + except httpx.HTTPError as e: + stats.error(type(e).__name__) + return + elapsed = time.monotonic() - start + if resp.status_code == 200: + stats.ok += 1 + stats.latencies.append(elapsed) + assigned[name] = int(resp.text.strip()) + return + if resp.status_code == 429: + stats.rate_limited += 1 + # Honor Retry-After but cap it: a misconfigured limiter + # can advertise huge values and stall the whole run. + retry_after = min(float(resp.headers.get("retry-after", 0) or 0), 10.0) + await asyncio.sleep(max(retry_after, 0.2 * (attempt + 1))) + continue + stats.error(f"HTTP {resp.status_code}") + return + stats.error("gave up after 429 retries") + + start = time.monotonic() + await asyncio.gather(*(submit(i) for i in range(n_entries))) + duration = time.monotonic() - start + return stats, assigned, duration + + +def parse_checkpoint(text: str) -> tuple[str, int, int]: + """Return (origin, tree_size, signature_count).""" + body, _, sigs = text.partition("\n\n") + lines = body.splitlines() + n_sigs = sum(1 for l in sigs.splitlines() if l.startswith("— ")) + return lines[0], int(lines[1]), n_sigs + + +async def wait_for_integration( + client: httpx.AsyncClient, url: str, target_size: int, timeout: float +) -> tuple[float | None, float | None]: + """Wait until /ready reports integrated_size >= target and /checkpoint + covers it. 
Returns (integration_lag, checkpoint_lag) in seconds.""" + start = time.monotonic() + integrated_at = None + while time.monotonic() - start < timeout: + resp = await client.get(f"{url}/ready") + if resp.status_code == 200: + data = resp.json() + if data["integrated_size"] >= target_size and data["pending_count"] == 0: + integrated_at = time.monotonic() - start + break + elif resp.status_code == 429: + # Back off so polling doesn't keep the bucket empty forever. + await asyncio.sleep(2.0) + continue + await asyncio.sleep(0.25) + if integrated_at is None: + return None, None + + while time.monotonic() - start < timeout: + resp = await client.get(f"{url}/checkpoint") + if resp.status_code == 200: + _, size, _ = parse_checkpoint(resp.text) + if size >= target_size: + return integrated_at, time.monotonic() - start + elif resp.status_code == 429: + await asyncio.sleep(2.0) + continue + await asyncio.sleep(0.25) + return integrated_at, None + + +async def read_phase( + client: httpx.AsyncClient, + url: str, + assigned: dict[str, int], + n_lookups: int, + concurrency: int, +) -> tuple[Stats, Stats, Stats, int]: + """Checkpoint fetches, vindex lookups (with correctness check), tile reads.""" + ckpt_stats, vindex_stats, tile_stats = Stats(), Stats(), Stats() + mismatches = 0 + sem = asyncio.Semaphore(concurrency) + + async def timed_get(path: str, stats: Stats) -> httpx.Response | None: + async with sem: + start = time.monotonic() + try: + resp = await client.get(f"{url}{path}") + except httpx.HTTPError as e: + stats.error(type(e).__name__) + return None + if resp.status_code == 200: + stats.ok += 1 + stats.latencies.append(time.monotonic() - start) + return resp + if resp.status_code == 429: + stats.rate_limited += 1 + else: + stats.error(f"HTTP {resp.status_code} {path}") + return None + + async def check_lookup(name: str, expected_idx: int) -> None: + nonlocal mismatches + resp = await timed_get(f"/vindex/lookup/key/{name}", vindex_stats) + if resp is None: + return + 
data = resp.json() + if not data["found"] or expected_idx not in data["indices"]: + mismatches += 1 + + sample = random.sample(sorted(assigned.items()), min(n_lookups, len(assigned))) + + tasks = [check_lookup(name, idx) for name, idx in sample] + tasks += [timed_get("/checkpoint", ckpt_stats) for _ in range(50)] + # Tile reads across the tree (entry bundles for sampled indices). + max_idx = max(assigned.values()) if assigned else 0 + tree_size = max_idx + 1 + bundles = sorted({idx // 256 for _, idx in sample}) + for b in bundles[:50]: + partial = tree_size % 256 if b == tree_size // 256 else 0 + path = f"/tile/entries/{b:03d}" + (f".p/{partial}" if partial else "") + tasks.append(timed_get(path, tile_stats)) + + await asyncio.gather(*tasks) + return ckpt_stats, vindex_stats, tile_stats, mismatches + + +async def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--url", required=True, help="Log base URL") + parser.add_argument("--api-key", help="API key for POST /add") + parser.add_argument("--api-key-file", help="File containing the API key") + parser.add_argument("--entries", type=int, default=1000) + parser.add_argument("--concurrency", type=int, default=32) + parser.add_argument("--lookups", type=int, default=200) + parser.add_argument("--timeout", type=float, default=120.0, + help="Max seconds to wait for integration/checkpoint") + args = parser.parse_args() + + api_key = args.api_key + if not api_key and args.api_key_file: + api_key = open(args.api_key_file).read().strip() + if not api_key: + parser.error("--api-key or --api-key-file is required") + + url = args.url.rstrip("/") + run_id = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) + + # HTTP/1.1 with a connection pool sized to the concurrency: multiplexing + # all requests onto one HTTP/2 connection collapses to ~1 in-flight + # request behind some proxies (observed on Fly's edge). 
+ limits = httpx.Limits( + max_connections=args.concurrency + 8, + max_keepalive_connections=args.concurrency + 8, + ) + async with httpx.AsyncClient(timeout=30.0, limits=limits) as client: + # Baseline state + resp = await client.get(f"{url}/ready") + resp.raise_for_status() + baseline = resp.json() + print(f"target: {url} (run id {run_id})") + print(f"baseline: integrated_size={baseline['integrated_size']} " + f"pending={baseline['pending_count']}\n") + + # Phase 1: writes + print(f"phase 1: writing {args.entries} entries, concurrency {args.concurrency} ...") + write_stats, assigned, write_duration = await write_phase( + client, url, api_key, args.entries, args.concurrency, run_id + ) + print(write_stats.summary("write /add", write_duration)) + if not assigned: + print("no successful writes; aborting") + return 1 + + # Phase 2: integration + checkpoint lag + target = max(assigned.values()) + 1 + print(f"\nphase 2: waiting for integration to size {target} ...") + integ_lag, ckpt_lag = await wait_for_integration(client, url, target, args.timeout) + if integ_lag is None: + print(f" FAIL: not integrated within {args.timeout}s") + return 1 + print(f" integration lag after last ack: {integ_lag:.2f}s") + if ckpt_lag is None: + print(f" FAIL: checkpoint did not cover size {target} within {args.timeout}s") + return 1 + print(f" checkpoint covering all writes: {ckpt_lag:.2f}s") + + resp = await client.get(f"{url}/checkpoint") + origin, size, n_sigs = parse_checkpoint(resp.text) + print(f" checkpoint: origin={origin} size={size} signatures={n_sigs}") + + # Phase 3: reads + correctness + print(f"\nphase 3: {min(args.lookups, len(assigned))} vindex lookups, " + f"50 checkpoint fetches, tile reads ...") + ckpt_stats, vindex_stats, tile_stats, mismatches = await read_phase( + client, url, assigned, args.lookups, args.concurrency + ) + print(ckpt_stats.summary("read /checkpoint")) + print(vindex_stats.summary("read /vindex/lookup/key")) + print(tile_stats.summary("read 
/tile/entries")) + + # Verdict + print("\n=== verdict ===") + failures = [] + if write_stats.errors: + failures.append(f"write errors: {write_stats.errors}") + if mismatches: + failures.append(f"{mismatches} vindex lookups missing the assigned index") + if any(s.errors for s in (ckpt_stats, vindex_stats, tile_stats)): + failures.append("read errors (see above)") + if size < target: + failures.append(f"checkpoint size {size} < target {target}") + if failures: + for f in failures: + print(f" FAIL: {f}") + return 1 + print(f" PASS: {len(assigned)} entries written, integrated, checkpointed, " + f"and all {min(args.lookups, len(assigned))} sampled lookups verified") + return 0 + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) From 5cb5669f81111ef2760269931519940f53a8d49e Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Fri, 3 Jul 2026 11:22:51 +0200 Subject: [PATCH 3/3] Add siglog-import bulk importer for log bootstrapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building a log from existing data through POST /add pays incremental integration costs (batch-by-batch acks, each partial tile rewritten up to 256 times) — measured at ~36 entries/s against S3. 
The importer builds the tree in one pass instead: - Streams pre-normalized JSONL entries in tile-aligned chunks through the same integrate() tree builder as the live path, so the resulting tree is byte-identical to incremental integration (tested) - Uploads tiles and entry bundles concurrently (200k entries -> 1,571 objects at ~5,500 entries/s locally) - Builds the vindex in the same pass, finishing with a snapshot - Commits the database state only after all objects are durable, and refuses non-empty logs (never fork); --resume skips existing objects - Signs the initial checkpoint; the live server continues incrementally from the imported state (verified end-to-end with real conda repodata) - Optional --epoch-note marker entry records what the bootstrap covers conda-log-ingest gains --jsonl-out to convert conda repodata into the importer's input format, and --api-key for authenticated submission. Co-Authored-By: Claude Fable 5 --- Cargo.toml | 4 + README.md | 40 ++ crates/conda-monitor/src/bin/ingest.rs | 45 ++- docker/Dockerfile.server | 7 +- docs/REMAINING_WORK.md | 11 +- src/bin/import.rs | 243 ++++++++++++ src/import.rs | 502 +++++++++++++++++++++++++ src/lib.rs | 1 + src/storage/database.rs | 47 +++ src/storage/opendal.rs | 5 + 10 files changed, 899 insertions(+), 6 deletions(-) create mode 100644 src/bin/import.rs create mode 100644 src/import.rs diff --git a/Cargo.toml b/Cargo.toml index fac4177..2f10208 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,10 @@ path = "src/main.rs" name = "witness" path = "src/bin/witness.rs" +[[bin]] +name = "siglog-import" +path = "src/bin/import.rs" + [dependencies] # Web framework axum = "0.8" diff --git a/README.md b/README.md index 28d092c..cf1ee4c 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,46 @@ curl http://localhost:8080/tile/0/000 curl http://localhost:8080/tile/entries/000 ``` +## Bulk import (backfill) + +Bootstrapping a log with existing data should not go through `POST /add` — +the incremental path 
acknowledges entries batch-by-batch and rewrites each +partial tile up to 256 times. `siglog-import` builds the tree in one pass +with concurrent uploads and produces a byte-identical tree to what +incremental integration would create (~5,000 entries/s locally vs ~36/s +over HTTP). + +```bash +# 1. Convert conda repodata to normalized JSONL (one file per subdir) +conda-log-ingest --file linux-64/repodata.json --subdir linux-64 \ + --jsonl-out linux-64.jsonl + +# 2. Import into an EMPTY log (server must not be running) +siglog-import \ + --origin conda.prefix.dev \ + --database-url sqlite:/data/siglog.db?mode=rwc \ + --storage-backend s3 \ + --jsonl noarch.jsonl --jsonl linux-64.jsonl \ + --epoch-note "conda-forge bootstrap $(date -u +%F), repodata sha256 ..." \ + --vindex-wal-path /data/vindex.wal + +# 3. Start the server; it continues incrementally from the imported state. +``` + +The import writes the database state only after every tile and bundle is +durably uploaded, so an interrupted run can be retried; `--resume` skips +objects that already exist (use the same `--chunk-size` and input). On +Fly.io, run it as a one-off machine holding the data volume: + +```bash +fly machine destroy --force # volume must be free +fly machine run --volume siglog_data:/data --entrypoint sleep -- infinity +fly ssh sftp shell # upload the .jsonl files to /data/ +fly ssh console -C "siglog-import --origin ... --jsonl /data/noarch.jsonl ..." +fly machine destroy --force +fly deploy # recreate the server on the imported state +``` + ## Deployment ### Fly.io diff --git a/crates/conda-monitor/src/bin/ingest.rs b/crates/conda-monitor/src/bin/ingest.rs index 4ca4fbd..9b787a2 100644 --- a/crates/conda-monitor/src/bin/ingest.rs +++ b/crates/conda-monitor/src/bin/ingest.rs @@ -41,6 +41,16 @@ struct Args { #[arg(long)] dry_run: bool, + /// Write normalized entries as JSONL to this file instead of submitting + /// them over HTTP. 
Feed the output to `siglog-import` for bulk + /// bootstrapping. + #[arg(long)] + jsonl_out: Option, + + /// API key for authenticating write requests (Bearer token) + #[arg(long, env = "API_KEY")] + api_key: Option, + /// Number of entries to process (for testing) #[arg(long)] limit: Option, @@ -115,6 +125,12 @@ fn main() -> anyhow::Result<()> { let mut error_count = 0; let mut indices: HashMap = HashMap::new(); + let mut jsonl_writer: Option> = args + .jsonl_out + .as_ref() + .map(|path| std::fs::File::create(path).map(std::io::BufWriter::new)) + .transpose()?; + for (filename, entry) in all_packages.into_iter().take(total) { pb.set_message(filename.to_string()); @@ -128,6 +144,15 @@ fn main() -> anyhow::Result<()> { let json_bytes = normalized.to_normalized_json(); + if let Some(writer) = &mut jsonl_writer { + use std::io::Write; + writer.write_all(&json_bytes)?; + writer.write_all(b"\n")?; + success_count += 1; + pb.inc(1); + continue; + } + if args.dry_run { // Just print first few for verification if success_count < 3 { @@ -143,7 +168,11 @@ fn main() -> anyhow::Result<()> { // Submit to log let add_url = format!("{}/add", args.log_url.trim_end_matches('/')); - match client.post(&add_url).body(json_bytes.clone()).send() { + let mut request = client.post(&add_url).body(json_bytes.clone()); + if let Some(key) = &args.api_key { + request = request.header("Authorization", format!("Bearer {}", key)); + } + match request.send() { Ok(resp) => { if resp.status().is_success() { if let Ok(text) = resp.text() { @@ -172,6 +201,20 @@ fn main() -> anyhow::Result<()> { pb.finish_with_message("Done!"); + if let Some(mut writer) = jsonl_writer { + use std::io::Write; + writer.flush()?; + println!("\n=== JSONL Export ==="); + println!("Subdir: {}", args.subdir); + println!("Entries written: {}", success_count); + println!("Skipped: {}", skip_count); + println!( + "Output: {} (feed to siglog-import for bulk bootstrap)", + args.jsonl_out.as_deref().unwrap_or_default() + ); + 
return Ok(()); + } + println!("\n=== Ingestion Summary ==="); println!("Subdir: {}", args.subdir); println!("Submitted: {}", success_count); diff --git a/docker/Dockerfile.server b/docker/Dockerfile.server index 47b5ef2..ea7333d 100644 --- a/docker/Dockerfile.server +++ b/docker/Dockerfile.server @@ -14,8 +14,8 @@ COPY Cargo.toml Cargo.lock ./ COPY src ./src COPY crates ./crates -# Build release binary -RUN cargo build --release --bin siglog +# Build release binaries (server + bulk importer) +RUN cargo build --release --bin siglog --bin siglog-import # Runtime stage FROM debian:bookworm-slim @@ -24,8 +24,9 @@ RUN apt-get update && apt-get install -y \ ca-certificates \ && rm -rf /var/lib/apt/lists/* -# Copy binary from builder +# Copy binaries from builder COPY --from=builder /app/target/release/siglog /usr/local/bin/siglog +COPY --from=builder /app/target/release/siglog-import /usr/local/bin/siglog-import # Run as a non-root user with a writable data directory RUN useradd --system --uid 10001 --create-home --home-dir /data siglog \ diff --git a/docs/REMAINING_WORK.md b/docs/REMAINING_WORK.md index c9515bd..c1c2f24 100644 --- a/docs/REMAINING_WORK.md +++ b/docs/REMAINING_WORK.md @@ -180,5 +180,12 @@ the replenish interval (`per_nanosecond(1e9 / rps)`); regression-tested. quorum 3. - Alerting: monitor violations → webhook/status page; log health: pending count growth, checkpoint age, witness cosign failure rate. -- Bootstrap plan: mass-load existing repodata as epoch T₀; document that - tamper-evidence starts at T₀. +- Bootstrap plan: **done** — `siglog-import` bulk-builds tree + tiles + + bundles + vindex + checkpoint in one pass (byte-identical to incremental + integration; measured 5,497 entries/s locally, 200k entries → 1,571 + objects). `conda-log-ingest --jsonl-out` converts repodata to its input. + A `--epoch-note` marker entry records what the bootstrap represents. + Run as a one-off Fly machine holding the volume (runbook in README). 
+ Ongoing sync after bootstrap: scheduled job (GitHub Actions cron is fine) + diffing repodata against the log and submitting deltas via `POST /add`; + the publish-time hook in channel infrastructure is the end state. diff --git a/src/bin/import.rs b/src/bin/import.rs new file mode 100644 index 0000000..6f34dd2 --- /dev/null +++ b/src/bin/import.rs @@ -0,0 +1,243 @@ +//! Bulk importer for bootstrapping a siglog transparency log. +//! +//! Reads pre-normalized entries as JSONL (one JSON object per line) and +//! builds the full tree — tiles, entry bundles, vindex, database state, and +//! a signed checkpoint — in one pass with concurrent uploads. Orders of +//! magnitude faster than submitting entries through `POST /add`. +//! +//! Producing the JSONL for a conda channel: +//! conda-log-ingest --file linux-64/repodata.json --subdir linux-64 \ +//! --jsonl-out linux-64.jsonl +//! +//! Running the import (against the same DATABASE_URL / storage the server +//! will use — the server must NOT be running): +//! siglog-import --origin conda.prefix.dev \ +//! --jsonl noarch.jsonl --jsonl linux-64.jsonl \ +//! --epoch-note "conda-forge bootstrap 2026-07-03" \ +//! --vindex-wal-path /data/vindex.wal + +use clap::Parser; +use siglog::checkpoint::CheckpointSigner; +use siglog::import::{bulk_import, ImportConfig}; +use siglog::storage::{Database, TileStorage}; +use siglog::vindex; +use std::io::BufRead; +use std::sync::Arc; + +/// Bulk importer for bootstrapping a siglog transparency log. +#[derive(Parser, Debug)] +#[command(name = "siglog-import")] +#[command(about = "Bulk-import pre-normalized entries into an empty transparency log")] +struct Args { + /// Database URL (PostgreSQL: postgres://... 
or SQLite: sqlite:./path.db) + #[arg(long, env = "DATABASE_URL")] + database_url: String, + + /// Storage backend: "s3" or "fs" + #[arg(long, env = "STORAGE_BACKEND", default_value = "fs")] + storage_backend: String, + + /// Filesystem storage root directory (when storage_backend=fs) + #[arg(long, env = "FS_ROOT")] + fs_root: Option, + + /// S3 endpoint URL (when storage_backend=s3) + #[arg(long, env = "S3_ENDPOINT")] + s3_endpoint: Option, + + /// S3 bucket name (when storage_backend=s3) + #[arg(long, env = "S3_BUCKET")] + s3_bucket: Option, + + /// S3 access key (when storage_backend=s3) + #[arg(long, env = "S3_ACCESS_KEY")] + s3_access_key: Option, + + /// S3 secret key (when storage_backend=s3) + #[arg(long, env = "S3_SECRET_KEY")] + s3_secret_key: Option, + + /// S3 region (when storage_backend=s3) + #[arg(long, env = "S3_REGION", default_value = "auto")] + s3_region: String, + + /// Log origin string (e.g., "conda.prefix.dev") + #[arg(long, env = "LOG_ORIGIN")] + origin: String, + + /// Ed25519 private key in note format + #[arg(long, env = "LOG_PRIVATE_KEY")] + private_key: String, + + /// JSONL input file(s), one pre-normalized entry per line. Repeatable; + /// files are imported in the order given. Use "-" for stdin. + #[arg(long = "jsonl", required = true)] + jsonl: Vec, + + /// Optional epoch marker logged as entry 0, recording what this + /// bootstrap represents (e.g. the repodata snapshot date/hashes). + #[arg(long)] + epoch_note: Option, + + /// Entries per integration chunk (must be a multiple of 256). + #[arg(long, default_value = "65536")] + chunk_size: usize, + + /// Maximum concurrent object uploads. + #[arg(long, default_value = "32")] + upload_concurrency: usize, + + /// Resume an interrupted import: skip objects that already exist. + /// Must use the same chunk size and input as the interrupted run. + #[arg(long)] + resume: bool, + + /// Build the vindex during import and write its WAL + snapshot here. 
+ #[arg(long, env = "VINDEX_WAL_PATH")] + vindex_wal_path: Option, + + /// JSON field name to extract vindex keys from. + #[arg(long, env = "VINDEX_KEY_FIELD", default_value = "name")] + vindex_key_field: String, +} + +/// Iterator over entries from the input files (epoch marker first). +fn entry_iter( + args: &Args, +) -> anyhow::Result>>> { + let epoch: Vec>> = match &args.epoch_note { + Some(note) => { + let marker = serde_json::json!({ + "type": "epoch", + "note": note, + "timestamp": chrono::Utc::now().timestamp(), + }); + vec![Ok(serde_json::to_vec(&marker).expect("epoch marker serializes"))] + } + None => Vec::new(), + }; + + let mut readers: Vec> = Vec::new(); + for path in &args.jsonl { + if path == "-" { + readers.push(Box::new(std::io::BufReader::new(std::io::stdin()))); + } else { + let file = std::fs::File::open(path) + .map_err(|e| anyhow::anyhow!("cannot open {}: {}", path, e))?; + readers.push(Box::new(std::io::BufReader::new(file))); + } + } + + Ok(epoch.into_iter().chain( + readers + .into_iter() + .flat_map(|r| r.lines()) + .filter(|line| !matches!(line, Ok(l) if l.trim().is_empty())) + .map(|line| { + line.map(|l| l.into_bytes()) + .map_err(|e| siglog::error::Error::Io(e)) + }), + )) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::from_default_env() + .add_directive("siglog=info".parse()?), + ) + .init(); + + let args = Args::parse(); + + tracing::info!("Bulk import starting for origin '{}'", args.origin); + + // Database + storage (same configuration the server will run with) + let db = Database::connect(&args.database_url).await?; + db.run_migrations().await?; + + let storage = match args.storage_backend.as_str() { + "fs" => { + let root = args + .fs_root + .clone() + .ok_or_else(|| anyhow::anyhow!("--fs-root is required for fs storage"))?; + TileStorage::new_fs(&root)? 
+ } + "s3" => { + let get = |v: &Option, name: &str| { + v.clone() + .ok_or_else(|| anyhow::anyhow!("--{} is required for s3 storage", name)) + }; + TileStorage::new_s3( + &get(&args.s3_endpoint, "s3-endpoint")?, + &get(&args.s3_bucket, "s3-bucket")?, + &get(&args.s3_access_key, "s3-access-key")?, + &get(&args.s3_secret_key, "s3-secret-key")?, + &args.s3_region, + )? + } + other => anyhow::bail!("Unknown storage backend: {}. Use 'fs' or 's3'.", other), + }; + + let signer = CheckpointSigner::from_note_key(&args.private_key)?; + + // Vindex: the import always builds it from scratch, so clear any + // leftover state first (a resumed import re-indexes from the input). + let vindex = match &args.vindex_wal_path { + Some(wal_path) => { + let snapshot = vindex::snapshot_path(std::path::Path::new(wal_path)); + let _ = std::fs::remove_file(&snapshot); + let _ = std::fs::remove_file(wal_path); + let map_fn = Arc::new(vindex::JsonKeysMapFn::new(&args.vindex_key_field)); + Some(Arc::new(vindex::VerifiableIndex::with_wal_and_batch_size( + map_fn, wal_path, 0, 1024, + )?)) + } + None => None, + }; + + let config = ImportConfig { + origin: args.origin.clone(), + chunk_size: args.chunk_size, + upload_concurrency: args.upload_concurrency, + resume: args.resume, + }; + + let start = std::time::Instant::now(); + let summary = bulk_import( + &db, + &storage, + &signer, + vindex.as_ref(), + &config, + entry_iter(&args)?, + ) + .await?; + let elapsed = start.elapsed(); + + tracing::info!( + "Import complete: {} entries in {:.1}s ({:.0} entries/s)", + summary.entries, + elapsed.as_secs_f64(), + summary.entries as f64 / elapsed.as_secs_f64().max(0.001), + ); + tracing::info!( + " tree_size={} root={} objects_written={} objects_skipped={}", + summary.tree_size, + summary.root_hash.to_hex(), + summary.objects_written, + summary.objects_skipped, + ); + if let Some(vi) = &vindex { + tracing::info!( + " vindex: {} keys, root={}", + vi.key_count(), + hex::encode(vi.root_hash()) + ); + } + 
tracing::info!("The log server can now be started against this state."); + + Ok(()) +} diff --git a/src/import.rs b/src/import.rs new file mode 100644 index 0000000..8394a19 --- /dev/null +++ b/src/import.rs @@ -0,0 +1,502 @@ +//! Bulk import: build the Merkle tree, tiles, entry bundles, vindex, and +//! checkpoint for a large batch of entries in one pass. +//! +//! The incremental integration path rewrites each partial tile up to 256 +//! times as the tree grows and acknowledges entries batch-by-batch — the +//! right behavior for a live log, but needlessly slow for a one-time +//! bootstrap. This module instead: +//! +//! 1. Streams entries in chunks aligned to the tile width, reusing the same +//! [`integrate`] tree builder as the live path (so the resulting tree is +//! byte-identical to what incremental integration would produce), +//! 2. Uploads the resulting tiles and entry bundles **concurrently**, +//! 3. Builds the vindex in the same pass and finishes with a snapshot, +//! 4. Sets the database log state once, at the very end, and signs a single +//! checkpoint. +//! +//! Crash safety: object writes are idempotent, and the database state is +//! only written after every object upload succeeded — so an interrupted +//! import can simply be re-run. With [`ImportConfig::resume`] set, objects +//! that already exist are skipped (the re-run must use the same chunk size +//! so partial-tile paths line up). + +use crate::checkpoint::signer::{Checkpoint, CheckpointSigner, CosignedCheckpoint, Origin}; +use crate::error::{Error, Result}; +use crate::merkle::integrate::integrate; +use crate::merkle::EntryBundle; +use crate::storage::opendal::CheckpointData; +use crate::storage::{Database, TileStorage}; +use crate::types::{Entry, LogIndex, PartialSize, TileIndex, TreeSize}; +use crate::vindex::VerifiableIndex; +use futures::stream::StreamExt; +use sigstore_types::Sha256Hash; +use std::sync::Arc; + +/// Entries per bundle / hashes per tile. 
+const TILE_WIDTH: usize = 256; + +/// Configuration for a bulk import. +#[derive(Debug, Clone)] +pub struct ImportConfig { + /// Log origin for the final checkpoint. + pub origin: String, + /// Entries per integration chunk. Must be a multiple of 256. Larger + /// chunks mean fewer partial-tile rewrites at upper levels but more + /// memory per chunk. + pub chunk_size: usize, + /// Maximum concurrent object uploads. + pub upload_concurrency: usize, + /// Skip uploading objects that already exist (resuming an interrupted + /// import). The resumed run must use the same chunk size. + pub resume: bool, +} + +impl Default for ImportConfig { + fn default() -> Self { + Self { + origin: "example.com/log".to_string(), + chunk_size: 65536, + upload_concurrency: 32, + resume: false, + } + } +} + +/// Summary of a completed bulk import. +#[derive(Debug)] +pub struct ImportSummary { + /// Total entries imported. + pub entries: u64, + /// Final tree size (== entries). + pub tree_size: u64, + /// Final root hash. + pub root_hash: Sha256Hash, + /// Objects uploaded (tiles + bundles). + pub objects_written: u64, + /// Objects skipped because they already existed (resume). + pub objects_skipped: u64, +} + +/// Run a bulk import from an entry iterator into an **empty** log. +/// +/// `entries` yields the raw bytes of each log entry, already in their final +/// (normalized) form. The database log state must be empty; it is written +/// once, after all objects are durably uploaded. 
+pub async fn bulk_import( + db: &Database, + storage: &TileStorage, + signer: &CheckpointSigner, + vindex: Option<&Arc>, + config: &ImportConfig, + entries: I, +) -> Result +where + I: IntoIterator>>, +{ + if config.chunk_size == 0 || config.chunk_size % TILE_WIDTH != 0 { + return Err(Error::Config(format!( + "chunk_size must be a positive multiple of {}, got {}", + TILE_WIDTH, config.chunk_size + ))); + } + let origin = Origin::new(config.origin.clone())?; + + // The import target must be an empty log: importing on top of existing + // entries would require the incremental path, and overwriting an + // existing tree would fork it. + let state = db.get_log_state().await?; + if state.next_index.value() != 0 || state.integrated_size.value() != 0 { + return Err(Error::Config(format!( + "bulk import requires an empty log; found next_index={}, integrated_size={}", + state.next_index.value(), + state.integrated_size.value() + ))); + } + + let mut entries = entries.into_iter(); + let mut tree_size: u64 = 0; + let mut root_hash = None; + let mut written: u64 = 0; + let mut skipped: u64 = 0; + + loop { + // Collect the next chunk. + let mut chunk_data: Vec> = Vec::with_capacity(config.chunk_size); + for entry in entries.by_ref() { + let data = entry?; + if data.is_empty() { + return Err(Error::InvalidEntry(format!( + "entry {} is empty", + tree_size + chunk_data.len() as u64 + ))); + } + if data.len() > crate::api::handlers::MAX_ENTRY_SIZE { + return Err(Error::InvalidEntry(format!( + "entry {} is {} bytes (max {})", + tree_size + chunk_data.len() as u64, + data.len(), + crate::api::handlers::MAX_ENTRY_SIZE + ))); + } + chunk_data.push(data); + if chunk_data.len() == config.chunk_size { + break; + } + } + if chunk_data.is_empty() { + break; + } + + // Hash and index the chunk. 
+ let mut leaf_hashes = Vec::with_capacity(chunk_data.len()); + for (offset, data) in chunk_data.iter().enumerate() { + let idx = tree_size + offset as u64; + leaf_hashes.push(*Entry::new(data.clone()).leaf_hash()); + if let Some(vi) = vindex { + vi.index_entry(LogIndex::new(idx), data)?; + } + } + + // Build this chunk of the tree with the same code path as live + // integration (loads the compact range from already-written tiles). + let result = integrate(storage, TreeSize::new(tree_size), &leaf_hashes).await?; + let new_size = result.new_size.value(); + + // Upload tiles concurrently (bounded by upload_concurrency). + let tile_jobs: Vec<_> = result + .tiles + .iter() + .map(|(tile_id, tile)| { + let partial = crate::api::paths::partial_tile_size( + tile_id.level.value(), + tile_id.index.value(), + new_size, + ); + let path = crate::api::paths::tile_path( + tile_id.level.value(), + tile_id.index.value(), + partial, + ); + async move { + if config.resume && storage.exists(&path).await? { + return Ok::(false); + } + storage + .write_tile( + tile_id.level, + tile_id.index, + PartialSize::new(partial), + tile, + ) + .await?; + Ok(true) + } + }) + .collect(); + let mut stream = futures::stream::iter(tile_jobs) + .buffer_unordered(config.upload_concurrency.max(1)); + while let Some(wrote) = stream.next().await { + if wrote? { + written += 1; + } else { + skipped += 1; + } + } + drop(stream); + + // Upload entry bundles concurrently. Chunks are 256-aligned, so + // every bundle here is full except possibly the final one. 
+ let first_bundle = tree_size / TILE_WIDTH as u64; + let bundle_jobs: Vec<_> = chunk_data + .chunks(TILE_WIDTH) + .enumerate() + .map(|(i, bundle_entries)| { + let bundle_idx = first_bundle + i as u64; + let partial = crate::api::paths::partial_tile_size(0, bundle_idx, new_size); + let path = crate::api::paths::entries_path(bundle_idx, partial); + let bundle = EntryBundle::with_entries( + bundle_entries + .iter() + .map(|d| crate::types::EntryData::new(d.clone())) + .collect(), + ); + async move { + if config.resume && storage.exists(&path).await? { + return Ok::(false); + } + storage + .write_entry_bundle( + TileIndex::new(bundle_idx), + PartialSize::new(partial), + &bundle, + ) + .await?; + Ok(true) + } + }) + .collect(); + let mut stream = futures::stream::iter(bundle_jobs) + .buffer_unordered(config.upload_concurrency.max(1)); + while let Some(wrote) = stream.next().await { + if wrote? { + written += 1; + } else { + skipped += 1; + } + } + drop(stream); + + if let Some(vi) = vindex { + vi.flush()?; + } + + tree_size = new_size; + root_hash = Some(result.root_hash); + tracing::info!( + "Imported {} entries (root {})", + tree_size, + result.root_hash.to_hex() + ); + } + + let root_hash = root_hash.ok_or_else(|| Error::InvalidEntry("no entries to import".into()))?; + + // Persist the vindex snapshot before the DB state: the vindex must + // never be behind the database. + if let Some(vi) = vindex { + vi.flush()?; + vi.snapshot()?; + } + + // Only now, with every object durably uploaded, commit the log state. + db.initialize_imported_state(TreeSize::new(tree_size), root_hash) + .await?; + + // Sign and publish the checkpoint. Witness cosignatures are collected + // by the live server's checkpoint worker once it starts. 
+ let checkpoint = Checkpoint::new(origin, TreeSize::new(tree_size), root_hash); + let cosigned = CosignedCheckpoint::new(checkpoint, signer); + storage + .write_checkpoint(&CheckpointData::from(cosigned.to_text())) + .await?; + + Ok(ImportSummary { + entries: tree_size, + tree_size, + root_hash, + objects_written: written, + objects_skipped: skipped, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::vindex::JsonKeysMapFn; + use opendal::{services::Memory, Operator}; + + fn mem_storage() -> TileStorage { + TileStorage::new(Operator::new(Memory::default()).unwrap().finish()) + } + + async fn mem_db() -> Database { + let db = Database::connect("sqlite::memory:").await.unwrap(); + db.run_migrations().await.unwrap(); + db + } + + fn test_entries(n: usize) -> Vec>> { + (0..n) + .map(|i| Ok(format!(r#"{{"name":"pkg-{:05}","version":"1.0"}}"#, i).into_bytes())) + .collect() + } + + fn config(chunk_size: usize) -> ImportConfig { + ImportConfig { + origin: "import.test/log".to_string(), + chunk_size, + upload_concurrency: 8, + resume: false, + } + } + + /// The bulk import must produce the exact same tree as one-shot + /// integration of the same leaves — chunking must be invisible. + #[tokio::test] + async fn test_import_root_matches_single_integration() { + let n = 700; // spans multiple bundles with a partial tail + + // Reference root: single integrate() call over all leaves. + let reference = mem_storage(); + let leaves: Vec<_> = test_entries(n) + .into_iter() + .map(|e| *Entry::new(e.unwrap()).leaf_hash()) + .collect(); + let expected = integrate(&reference, TreeSize::new(0), &leaves) + .await + .unwrap(); + + // Bulk import with small chunks. 
+ let storage = mem_storage(); + let db = mem_db().await; + let signer = CheckpointSigner::generate("import.test/log"); + let summary = bulk_import( + &db, + &storage, + &signer, + None, + &config(256), + test_entries(n), + ) + .await + .unwrap(); + + assert_eq!(summary.tree_size, n as u64); + assert_eq!(summary.root_hash, expected.root_hash); + + // DB state committed. + let state = db.get_log_state().await.unwrap(); + assert_eq!(state.integrated_size.value(), n as u64); + assert_eq!(state.root_hash, Some(expected.root_hash)); + + // Checkpoint written and parseable. + let ckpt = storage.read_checkpoint().await.unwrap().unwrap(); + let cosigned = CosignedCheckpoint::from_text(ckpt.as_str().unwrap()).unwrap(); + assert_eq!(cosigned.checkpoint.size.value(), n as u64); + assert_eq!(cosigned.checkpoint.root_hash, expected.root_hash); + + // All entry bundles present and correctly sized. + for bundle_idx in 0..=(n as u64 - 1) / 256 { + let partial = crate::api::paths::partial_tile_size(0, bundle_idx, n as u64); + let bundle = storage + .read_entry_bundle(TileIndex::new(bundle_idx), PartialSize::new(partial)) + .await + .unwrap() + .unwrap(); + let expected_len = if partial == 0 { 256 } else { partial as usize }; + assert_eq!(bundle.entries.len(), expected_len); + } + } + + /// Root must not depend on the chunk size. + #[tokio::test] + async fn test_import_chunk_size_invariance() { + let n = 600; + let mut roots = Vec::new(); + for chunk in [256usize, 512, 65536] { + let storage = mem_storage(); + let db = mem_db().await; + let signer = CheckpointSigner::generate("import.test/log"); + let summary = bulk_import( + &db, + &storage, + &signer, + None, + &config(chunk), + test_entries(n), + ) + .await + .unwrap(); + roots.push(summary.root_hash); + } + assert!(roots.windows(2).all(|w| w[0] == w[1])); + } + + /// Importing into a non-empty log must be refused. 
+ #[tokio::test] + async fn test_import_requires_empty_log() { + let storage = mem_storage(); + let db = mem_db().await; + let signer = CheckpointSigner::generate("import.test/log"); + + bulk_import(&db, &storage, &signer, None, &config(256), test_entries(10)) + .await + .unwrap(); + + // Second import must fail: the log is no longer empty. + let result = bulk_import( + &db, + &storage, + &signer, + None, + &config(256), + test_entries(10), + ) + .await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("empty log")); + } + + /// Resume: a re-run over already-written objects skips them and + /// produces the same root. + #[tokio::test] + async fn test_import_resume_skips_existing() { + let n = 300; + let storage = mem_storage(); + let signer = CheckpointSigner::generate("import.test/log"); + + // First (simulated interrupted) run: objects written, but pretend + // the process died before the DB commit by using a throwaway DB. + let db1 = mem_db().await; + let first = bulk_import(&db1, &storage, &signer, None, &config(256), test_entries(n)) + .await + .unwrap(); + assert!(first.objects_written > 0); + assert_eq!(first.objects_skipped, 0); + + // Resume against the same storage with a fresh (still-empty) DB. + let db2 = mem_db().await; + let mut cfg = config(256); + cfg.resume = true; + let resumed = bulk_import(&db2, &storage, &signer, None, &cfg, test_entries(n)) + .await + .unwrap(); + + assert_eq!(resumed.root_hash, first.root_hash); + assert_eq!(resumed.objects_written, 0, "everything already uploaded"); + assert_eq!(resumed.objects_skipped, first.objects_written); + let state = db2.get_log_state().await.unwrap(); + assert_eq!(state.integrated_size.value(), n as u64); + } + + /// Vindex built during import must serve correct lookups and survive a + /// restart via its snapshot. 
+ #[tokio::test] + async fn test_import_builds_vindex() { + let n = 300; + let temp_dir = tempfile::tempdir().unwrap(); + let wal_path = temp_dir.path().join("vindex.wal"); + + let storage = mem_storage(); + let db = mem_db().await; + let signer = CheckpointSigner::generate("import.test/log"); + let vi = Arc::new( + VerifiableIndex::with_wal(Arc::new(JsonKeysMapFn::new("name")), &wal_path, 0).unwrap(), + ); + + bulk_import( + &db, + &storage, + &signer, + Some(&vi), + &config(256), + test_entries(n), + ) + .await + .unwrap(); + + assert_eq!(vi.tree_size(), n as u64); + let result = vi.lookup_string("pkg-00042"); + assert!(result.found); + assert_eq!(result.indices[0].value(), 42); + + // Snapshot was written; a restart at the imported size must load it. + let restored = VerifiableIndex::with_wal( + Arc::new(JsonKeysMapFn::new("name")), + &wal_path, + n as u64, + ) + .unwrap(); + assert_eq!(restored.root_hash(), vi.root_hash()); + } +} diff --git a/src/lib.rs b/src/lib.rs index ad99cb8..15ec5a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ pub mod api; pub mod checkpoint; pub mod error; +pub mod import; pub mod shutdown; pub mod merkle; pub mod migration; diff --git a/src/storage/database.rs b/src/storage/database.rs index 87b54d4..e54e0b9 100644 --- a/src/storage/database.rs +++ b/src/storage/database.rs @@ -174,6 +174,53 @@ impl Database { .collect() } + /// Initialize the log state after a bulk import into an empty log. + /// + /// Sets `next_index` and `integrated_size` to the imported size in one + /// step. Fails if the log is not empty — bulk import must never fork or + /// overwrite an existing tree. 
+ pub async fn initialize_imported_state( + &self, + size: TreeSize, + root_hash: Sha256Hash, + ) -> Result<()> { + if size.value() > i64::MAX as u64 { + return Err(Error::Internal(format!( + "tree size {} exceeds supported maximum", + size.value() + ))); + } + + let txn = self.conn.begin().await?; + + let state = log_state::Entity::find_by_id(1) + .lock_exclusive() + .one(&txn) + .await? + .ok_or_else(|| Error::Internal("log state not found".into()))?; + + if state.next_index != 0 || state.integrated_size != 0 { + txn.rollback().await?; + return Err(Error::Internal(format!( + "cannot initialize imported state: log is not empty \ + (next_index={}, integrated_size={})", + state.next_index, state.integrated_size + ))); + } + + log_state::Entity::update(log_state::ActiveModel { + id: ActiveValue::Unchanged(1), + next_index: ActiveValue::Set(size.value() as i64), + integrated_size: ActiveValue::Set(size.value() as i64), + root_hash: ActiveValue::Set(Some(root_hash.as_bytes().to_vec())), + }) + .exec(&txn) + .await?; + + txn.commit().await?; + Ok(()) + } + /// Mark entries as integrated up to the given size if the state has not /// changed since the caller read it. /// diff --git a/src/storage/opendal.rs b/src/storage/opendal.rs index 5f83a49..8a98988 100644 --- a/src/storage/opendal.rs +++ b/src/storage/opendal.rs @@ -170,6 +170,11 @@ impl TileStorage { Err(e) => Err(e.into()), } } + + /// Check whether an object exists at a path. + pub async fn exists(&self, path: &str) -> Result<bool> { + self.op.exists(path).await.map_err(Into::into) + } } /// Wrapper type for checkpoint data.