From c838ac2b5bb7687473c895efd4c1030fe3f66c77 Mon Sep 17 00:00:00 2001 From: Santiago Palladino Date: Fri, 3 Jul 2026 08:13:02 -0300 Subject: [PATCH 1/2] test(e2e): extract HA clock-skew suite and split node-killing HA test - Extract the four Clock Skew / Timezone Safety assertions out of the 5-node docker HA suite (composed/ha/e2e_ha_full.parallel.test.ts) into a new in-process multi-node/high-availability/clock_skew.test.ts. They test PostgreSQL DB-clock semantics (timestamp storage + CURRENT_TIMESTAMP-based cleanup, immune to node clock skew), so they run against PGlite (real Postgres in WASM) rather than the mock-gossip stack, whose shared DB is driven by the node clock. Requires adding @electric-sql/pglite to end-to-end. - Split the node-killing "distribute work" HA test into its own e2e_ha_distribute_work.parallel.test.ts, sharing the cluster setup via a new ha_full_setup.ts module. This removes the "must run last" ordering contract: it gets its own cluster, and the remaining e2e_ha_full tests are order-independent. No assertions changed in either file. - Leave the two stranded composed tests in place, still excluded from CI: integration_proof_verification's committed epoch-proof fixture is stale (the proof no longer verifies) and e2e_persistence's beforeAll no longer completes (single-node sequencer stalls in checkpoint proposal). Both root causes are outside this diff; documented in the files and bootstrap.sh. --- .test_patterns.yml | 4 + yarn-project/end-to-end/bootstrap.sh | 4 + yarn-project/end-to-end/package.json | 1 + .../src/composed/e2e_persistence.test.ts | 12 +- .../e2e_ha_distribute_work.parallel.test.ts | 210 +++++ .../composed/ha/e2e_ha_full.parallel.test.ts | 875 +----------------- .../src/composed/ha/ha_full_setup.ts | 463 +++++++++ .../integration_proof_verification.test.ts | 9 +- .../high-availability/clock_skew.test.ts | 285 ++++++ yarn-project/yarn.lock | 1 + 10 files changed, 1022 insertions(+), 842 deletions(-) create mode 100644 yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts create mode 100644 yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts create mode 100644 yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts diff --git a/.test_patterns.yml b/.test_patterns.yml index e4800e12b11c..e1ee02f76c9c 100644 --- a/.test_patterns.yml +++ b/.test_patterns.yml @@ -397,6 +397,10 @@ tests: owners: - *spyros + - regex: "yarn-project/end-to-end/scripts/run_test.sh ha src/composed/ha/e2e_ha_distribute_work.parallel.test.ts" + owners: + - *spyros + # http://ci.aztec-labs.com/98d59d04f85223f8 # Build-cache flake: module not found during Jest startup - regex: "src/single-node/sequencer/gov_proposal.parallel.test.ts" diff --git a/yarn-project/end-to-end/bootstrap.sh b/yarn-project/end-to-end/bootstrap.sh index 428646156342..75d4cffd1d2e 100755 --- a/yarn-project/end-to-end/bootstrap.sh +++ b/yarn-project/end-to-end/bootstrap.sh @@ -106,6 +106,10 @@ function test_cmds { # compose-based tests (use running local network) tests=( + # integration_proof_verification and e2e_persistence are excluded and run nowhere: the former's committed + # epoch-proof fixture is stale (the proof no longer verifies), and the latter's beforeAll no longer + # completes on the current branch (the single-node sequencer stalls in checkpoint proposal). See each + # file's header comment. Both stay excluded until fixed/regenerated. src/composed/!(integration_proof_verification|e2e_persistence).test.ts src/guides/*.test.ts ) diff --git a/yarn-project/end-to-end/package.json b/yarn-project/end-to-end/package.json index 6b21639a228f..cb01ef53320d 100644 --- a/yarn-project/end-to-end/package.json +++ b/yarn-project/end-to-end/package.json @@ -68,6 +68,7 @@ "@aztec/wallet-sdk": "workspace:^", "@aztec/wallets": "workspace:^", "@aztec/world-state": "workspace:^", + "@electric-sql/pglite": "^0.3.14", "@iarna/toml": "^2.2.5", "@jest/globals": "^30.0.0", "@noble/curves": "=1.0.0", diff --git a/yarn-project/end-to-end/src/composed/e2e_persistence.test.ts b/yarn-project/end-to-end/src/composed/e2e_persistence.test.ts index fe48d6a45c65..5a8dd8a9e222 100644 --- a/yarn-project/end-to-end/src/composed/e2e_persistence.test.ts +++ b/yarn-project/end-to-end/src/composed/e2e_persistence.test.ts @@ -22,9 +22,15 @@ import type { TestWallet } from '../test-wallet/test_wallet.js'; jest.setTimeout(15 * 60 * 1000); -// Node and PXE persistence tests. Uses setup() directly with PIPELINING_SETUP_OPTS; excluded from the -// compose glob for unknown reasons (migrate-later candidate). Spawns and tears down node/PXE with -// varying combinations of persisted vs empty data directories to cover five restart scenarios. +// Node and PXE persistence tests: an in-process single-node test (uses setup() directly with +// PIPELINING_SETUP_OPTS) that spawns and tears down node/PXE across five persisted-vs-empty data-directory +// restart scenarios. +// +// EXCLUDED from every CI test list (see bootstrap.sh) and does NOT run anywhere. It is a candidate to +// refile under single-node/, but on the current branch its beforeAll no longer completes: the single-node +// sequencer stalls in checkpoint proposal (waitForAttestationsAndEnqueueSubmissionAsync) and the 600s hook +// times out before setup finishes. Re-enabling it needs that setup stall fixed (the root cause is in the +// shared setup/sequencer, not this file); until then it stays excluded. describe('Aztec persistence', () => { /** * These tests check that the Aztec Node and PXE can be shutdown and restarted without losing data. diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts new file mode 100644 index 000000000000..b61d207470d0 --- /dev/null +++ b/yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts @@ -0,0 +1,210 @@ +/** + * HA work-distribution resilience test. + * + * Extracted from `e2e_ha_full.parallel.test.ts`: this test kills each block's producer node in turn, so it + * leaves the cluster unusable and previously carried a "must run last" ordering contract. Giving it its + * own file (and therefore its own cluster via the shared `HaFullTestContext`) removes that contract while + * preserving every assertion. Requires the docker-compose HA suite (run_test.sh ha). + */ +import { type AttestationInfo, getAttestationInfoFromPublishedCheckpoint } from '@aztec/stdlib/block'; +import { Checkpoint } from '@aztec/stdlib/checkpoint'; +import { OffenseType } from '@aztec/stdlib/slashing'; + +import { jest } from '@jest/globals'; + +import { getValidatorDuties, verifyNoDuplicateAttestations } from '../../fixtures/ha_setup.js'; +import { COMMITTEE_SIZE, HaFullTestContext, NODE_COUNT } from './ha_full_setup.js'; + +describe('HA Distribute Work', () => { + jest.setTimeout(20 * 60 * 1000); // 20 minutes + + const t = new HaFullTestContext(); + + beforeAll(async () => { + await t.setup(); + }); + + afterAll(async () => { + await t.teardown(); + }); + + it('should distribute work across multiple HA nodes', async () => { + const { logger, haNodeServices, sendTriggerTx, aztecNode, mainPool, getSignatureContext, stopHANode } = t; + + logger.info('Testing HA resilience by killing nodes after they produce blocks'); + + // We'll produce NODE_COUNT blocks (5 total with NODE_COUNT=5) + // Each node produces exactly 1 block, and we kill it after it produces + // The last remaining node will produce the final block + const blockCount = NODE_COUNT; + const receipts = []; + const killedNodes: number[] = []; // Track indices of killed nodes + const blockProducers = new Map(); // Map block index to node ID + let previousBlockNumber: number | undefined; + + const nodeIds: string[] = []; + for (const service of haNodeServices) { + nodeIds.push((await service.getConfig()).nodeId); + } + + for (let i = 0; i < blockCount; i++) { + logger.info(`\n=== Producing block ${i + 1}/${blockCount} ===`); + logger.info(`Active nodes: ${haNodeServices.length - killedNodes.length}/${NODE_COUNT}`); + + const receipt = await sendTriggerTx(); + + expect(receipt.blockNumber).toBeDefined(); + + // Verify this transaction is in a different block than the previous one + if (previousBlockNumber !== undefined) { + expect(receipt.blockNumber).toBeGreaterThan(previousBlockNumber); + } + + previousBlockNumber = receipt.blockNumber; + receipts.push(receipt); + + // Find which node produced this block + const [block] = await aztecNode.getBlocks(receipt.blockNumber!, 1, { + includeL1PublishInfo: true, + includeAttestations: true, + includeTransactions: true, + onlyCheckpointed: true, + }); + if (!block) { + throw new Error(`Block ${receipt.blockNumber} not found`); + } + const slotNumber = BigInt(block.header.globalVariables.slotNumber); + const duties = await getValidatorDuties(mainPool, slotNumber); + const blockProposalDuty = duties.find(d => d.dutyType === 'BLOCK_PROPOSAL'); + + if (!blockProposalDuty) { + throw new Error(`No block proposal duty found for slot ${slotNumber}`); + } + + blockProducers.set(i, blockProposalDuty.nodeId); + logger.info(`Block ${receipt.blockNumber} produced by node ${blockProposalDuty.nodeId}`); + + const producerNodeId = blockProposalDuty.nodeId; + const producerNodeIndex = nodeIds.findIndex(nodeId => nodeId === producerNodeId); + + if (producerNodeIndex === -1) { + throw new Error(`Could not find active node with ID ${producerNodeId}`); + } + + // Kill the node that produced this block, unless it's the last block + if (i < blockCount - 1) { + logger.info(`Killing node ${producerNodeId} that produced this block`); + await stopHANode(producerNodeIndex); + killedNodes.push(producerNodeIndex); + } else { + // The final survivor is kept online for the slash-offense assertion below, but its sequencer + // is no longer needed. Stop it before running the remaining assertions so it cannot start a + // new empty checkpoint and then block service shutdown while awaiting a delayed L1 publish. + logger.info(`Last block produced; stopping sequencer for survivor ${producerNodeId}`); + await haNodeServices[producerNodeIndex].getSequencer()?.stop(); + } + + logger.info(`Block ${i + 1}/${blockCount} completed. Killed nodes: ${killedNodes.length}/${NODE_COUNT}`); + } + + // Verify we got the expected number of distinct blocks + const blockNumbers = receipts.map(r => r.blockNumber!).sort((a, b) => a - b); + const uniqueBlockNumbers = new Set(blockNumbers); + expect(uniqueBlockNumbers.size).toBe(blockCount); + logger.info(`Created ${uniqueBlockNumbers.size} distinct blocks: ${Array.from(uniqueBlockNumbers).join(', ')}`); + + // Verify each node produced at least 1 block + const nodeBlockCounts = new Map(); + for (const nodeId of blockProducers.values()) { + const count = nodeBlockCounts.get(nodeId) || 0; + nodeBlockCounts.set(nodeId, count + 1); + } + + logger.info(`Block production by node: ${JSON.stringify(Array.from(nodeBlockCounts.entries()))}`); + + // Verify: each node should have produced at least 1 block + // (there may be empty blocks produced during node transitions) + for (const [nodeId, count] of nodeBlockCounts.entries()) { + expect(count).toBeGreaterThanOrEqual(1); + logger.info(`Node ${nodeId} produced ${count} block(s) as expected`); + } + + // Verify all nodes participated (NODE_COUNT nodes total) + expect(nodeBlockCounts.size).toBe(NODE_COUNT); + logger.info(`All ${NODE_COUNT} nodes participated in block production`); + + // Verify no double-signing occurred across all blocks + const quorum = Math.floor((COMMITTEE_SIZE * 2) / 3) + 1; + for (const receipt of receipts) { + const [block] = await aztecNode.getBlocks(receipt.blockNumber!, 1, { + includeL1PublishInfo: true, + includeAttestations: true, + includeTransactions: true, + onlyCheckpointed: true, + }); + if (!block) { + throw new Error(`Block ${receipt.blockNumber} not found`); + } + const slotNumber = BigInt(block.header.globalVariables.slotNumber); + + // PRIMARY CHECK: Database records show all attestation duties attempted/completed + const duties = await getValidatorDuties(mainPool, slotNumber); + const attestationDuties = duties.filter(d => d.dutyType === 'ATTESTATION'); + + // Verify no duplicate attestation duties per validator (HA protection ensures 1 per validator) + const dutiesByValidator = verifyNoDuplicateAttestations(attestationDuties, logger); + expect(dutiesByValidator.size).toBeGreaterThanOrEqual(quorum); + logger.info( + `Block ${receipt.blockNumber}: Database shows ${dutiesByValidator.size} unique validators attested (quorum: ${quorum}), no double-signing detected in DB`, + ); + + // SECONDARY CHECK: Verify checkpoint attestations match database records + const [publishedCheckpoint] = await aztecNode.getCheckpoints(block.checkpointNumber, 1, { + includeAttestations: true, + }); + const attestationInfos = getAttestationInfoFromPublishedCheckpoint( + { + attestations: publishedCheckpoint.attestations ?? [], + checkpoint: new Checkpoint( + publishedCheckpoint.archive, + publishedCheckpoint.header, + [], + publishedCheckpoint.number, + publishedCheckpoint.feeAssetPriceModifier, + ), + }, + getSignatureContext(), + ); + + // Filter to only valid attestations with recovered addresses + const validAttestations = attestationInfos.filter( + (info: AttestationInfo) => info.status === 'recovered-from-signature' && info.address !== undefined, + ); + + // Verify checkpoint has exactly quorum attestations (trimmed to minimum required) + const checkpointValidatorAddresses = new Set(validAttestations.map(info => info.address!.toString())); + expect(checkpointValidatorAddresses.size).toBe(quorum); + + // Verify every validator in the checkpoint has a corresponding DB duty record + // (checkpoint is trimmed to quorum, so it's a subset of DB records) + for (const validatorAddress of checkpointValidatorAddresses) { + expect(dutiesByValidator.has(validatorAddress)).toBe(true); + } + } + + // GOSSIP-LAYER CHECK: each HA node's libp2p service detects when a signer attests to two + // distinct payloads at the same slot and fires `duplicateAttestationCallback` -> validator + // client emits WANT_TO_SLASH_EVENT -> SlashOffensesCollector persists a DUPLICATE_ATTESTATION + // offense. We assert no such offense (or DUPLICATE_PROPOSAL) was collected on any surviving + // HA node. Killed nodes are unreachable, but the surviving node — which has been alive the + // whole test — has observed all gossiped attestations and proposals across every slot. + const aliveNodes = haNodeServices.filter((_, idx) => !killedNodes.includes(idx)); + const allOffenses = (await Promise.all(aliveNodes.map(n => n.getSlashOffenses('all')))).flat(); + const equivocationOffenses = allOffenses.filter( + o => o.offenseType === OffenseType.DUPLICATE_ATTESTATION || o.offenseType === OffenseType.DUPLICATE_PROPOSAL, + ); + expect(equivocationOffenses).toEqual([]); + + await Promise.all(haNodeServices.map((_, nodeIndex) => stopHANode(nodeIndex))); + }); +}); diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts index 28d3d75ebc20..52df5c030842 100644 --- a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts +++ b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts @@ -4,438 +4,44 @@ * Tests a complete HA setup with multiple nodes coordinating via PostgreSQL * and Web3Signer for remote signing. Verifies that blocks are produced, * attestations are signed, and no double-signing occurs. + * + * The cluster setup lives in `ha_full_setup.ts` and is shared with + * `e2e_ha_distribute_work.parallel.test.ts`. The node-killing "distribute work" resilience test lives in + * that separate file so it gets its own cluster instead of relying on running last, and the clock-skew / + * timezone DB assertions moved to the cheap in-process `multi-node/high-availability/clock_skew.test.ts`. */ -import { type AztecNodeConfig, AztecNodeService, createAztecNodeService } from '@aztec/aztec-node'; import { AztecAddress, EthAddress } from '@aztec/aztec.js/addresses'; -import { NO_WAIT, getContractInstanceFromInstantiationParams } from '@aztec/aztec.js/contracts'; -import { Fr } from '@aztec/aztec.js/fields'; -import type { Logger } from '@aztec/aztec.js/log'; -import { type AztecNode, waitForTx } from '@aztec/aztec.js/node'; -import { GovernanceProposerContract } from '@aztec/ethereum/contracts'; -import type { DeployAztecL1ContractsReturnType } from '@aztec/ethereum/deploy-aztec-l1-contracts'; -import { BlockNumber, CheckpointNumber, SlotNumber } from '@aztec/foundation/branded-types'; -import { Buffer32 } from '@aztec/foundation/buffer'; -import { SecretValue } from '@aztec/foundation/config'; -import { withLoggerBindings } from '@aztec/foundation/log/server'; +import { SlotNumber } from '@aztec/foundation/branded-types'; import { retryUntil } from '@aztec/foundation/retry'; -import { sleep } from '@aztec/foundation/sleep'; -import type { TestDateProvider } from '@aztec/foundation/timer'; import { GovernanceProposerAbi } from '@aztec/l1-artifacts/GovernanceProposerAbi'; -import { TestContract } from '@aztec/noir-test-contracts.js/Test'; -import { type AttestationInfo, getAttestationInfoFromPublishedCheckpoint } from '@aztec/stdlib/block'; -import { Checkpoint } from '@aztec/stdlib/checkpoint'; -import { TopicType } from '@aztec/stdlib/p2p'; -import { OffenseType } from '@aztec/stdlib/slashing'; -import { TxHash, type TxReceipt, TxStatus } from '@aztec/stdlib/tx'; -import type { GenesisData } from '@aztec/stdlib/world-state'; import type { ValidatorClient } from '@aztec/validator-client'; -import { PostgresSlashingProtectionDatabase } from '@aztec/validator-ha-signer/db'; -import { type DutyRow, DutyStatus, DutyType } from '@aztec/validator-ha-signer/types'; +import { type DutyRow, DutyStatus } from '@aztec/validator-ha-signer/types'; import { jest } from '@jest/globals'; -import getPort, { portNumbers } from 'get-port'; -import { mkdtemp, rm, writeFile } from 'node:fs/promises'; -import { tmpdir } from 'node:os'; +import { writeFile } from 'node:fs/promises'; import { join } from 'node:path'; -import { Pool } from 'pg'; -import { PIPELINING_SETUP_OPTS } from '../../fixtures/fixtures.js'; -import { - type HADatabaseConfig, - cleanupHADatabase, - createHADatabaseConfig, - createInitialValidatorsFromPrivateKeys, - getAddressesFromPrivateKeys, - getValidatorDuties, - setupHADatabase, - verifyNoDuplicateAttestations, -} from '../../fixtures/ha_setup.js'; -import { getPrivateKeyFromIndex, setup } from '../../fixtures/utils.js'; +import { getValidatorDuties, verifyNoDuplicateAttestations } from '../../fixtures/ha_setup.js'; import { - createWeb3SignerKeystore, - getWeb3SignerTestKeystoreDir, - getWeb3SignerUrl, - refreshWeb3Signer, -} from '../../fixtures/web3signer.js'; -import type { TestWallet } from '../../test-wallet/test_wallet.js'; -import { proveInteraction } from '../../test-wallet/utils.js'; - -const NODE_COUNT = 5; -const VALIDATOR_COUNT = 4; -const COMMITTEE_SIZE = 4; - -// Allocate p2p listen ports from above the OS ephemeral range (Linux default tops out at 60999) so they -// never collide with an ephemeral socket the OS may already have handed out -- e.g. the in-process prover -// node (which listens on p2pPort 0) or any outbound connection. The previous fixed 4040x ports sat inside -// the ephemeral range, so an ephemeral socket occasionally held a node's port at bind time, surfacing as -// libp2p ERR_NO_VALID_ADDRESSES and aborting beforeAll. get-port also locks each returned port briefly, so -// concurrent calls within this process never hand back the same one. -const getFreeP2PPort = () => getPort({ port: portNumbers(61000, 65535) }); - -async function registerTestContract(wallet: TestWallet): Promise { - const instance = await getContractInstanceFromInstantiationParams(TestContract.artifact, { - constructorArgs: [], - constructorArtifact: undefined, - salt: Fr.ZERO, - publicKeys: undefined, - deployer: undefined, - }); - await wallet.registerContract(instance, TestContract.artifact); - return TestContract.at(instance.address, wallet); -} - -async function submitTriggerTx(wallet: TestWallet, testContract: TestContract, from: AztecAddress): Promise { - const tx = await proveInteraction(wallet, testContract.methods.emit_nullifier(Fr.random()), { from }); - return await tx.send({ wait: NO_WAIT }); -} + COMMITTEE_SIZE, + HaFullTestContext, + NODE_COUNT, + VALIDATOR_COUNT, + submitTriggerTx, + waitForTriggerTx, +} from './ha_full_setup.js'; -async function waitForTriggerTx(node: AztecNode, txHash: TxHash): Promise { - const receipt = await waitForTx(node, txHash, { waitForStatus: TxStatus.CHECKPOINTED }); - if (!receipt.blockNumber) { - throw new Error('Trigger tx was checkpointed without a block number'); - } - return receipt; -} - -// Requires the docker-compose HA suite (run_test.sh ha): live Postgres (DATABASE_URL) and Web3Signer -// sidecar. Uses setup() with PIPELINING_SETUP_OPTS; multiple in-proc AztecNodeService instances share the -// Postgres slashing-protection DB and Web3Signer keystore. describe('HA Full Setup', () => { jest.setTimeout(20 * 60 * 1000); // 20 minutes - let logger: Logger; - let wallet: TestWallet; - let ownerAddress: AztecAddress; - let testContract: TestContract; - let aztecNode: AztecNode; - let config: AztecNodeConfig; - let teardown: () => Promise = async () => {}; - let accounts: AztecAddress[]; - let dateProvider: TestDateProvider; - let genesis: GenesisData | undefined; - - // HA specific resources - let haNodePools: Pool[]; // Database pools for HA nodes (for cleanup) - let haNodeServices: AztecNodeService[]; // All N HA peer nodes - let haSequencersStarted = false; - const stoppedHANodeIndexes = new Set(); - let haKeystoreDirs: string[]; - let mainPool: Pool; - let databaseConfig: HADatabaseConfig; - let attesterPrivateKeys: `0x${string}`[]; - let attesterAddresses: string[]; - let publisherPrivateKeys: `0x${string}`[]; - let publisherAddresses: string[]; - let web3SignerUrl: string; - let deployL1ContractsValues: DeployAztecL1ContractsReturnType; - let governanceProposer: GovernanceProposerContract; - /** Per-node initial keystore JSON (all 4 attesters, node's own publisher) for restore after reload test */ - let initialKeystoreJsons: string[]; - const getSignatureContext = () => ({ - chainId: config.l1ChainId, - rollupAddress: deployL1ContractsValues.l1ContractAddresses.rollupAddress, - }); - - const startHASequencers = async () => { - if (haSequencersStarted) { - return; - } - - await Promise.all( - haNodeServices.map(async (service, i) => { - logger.info(`Starting HA peer node ${i} sequencer`); - await service.getSequencer()?.start(); - }), - ); - haSequencersStarted = true; - logger.info('All HA peer sequencers started'); - }; - - const sendTriggerTx = async (): Promise => { - await startHASequencers(); - const txHash = await submitTriggerTx(wallet, testContract, ownerAddress); - return await waitForTriggerTx(aztecNode, txHash); - }; - - const stopHANode = async (nodeIndex: number) => { - if (stoppedHANodeIndexes.has(nodeIndex)) { - return; - } - - logger.info(`Stopping HA peer node ${nodeIndex}`); - await haNodeServices[nodeIndex].stop(); - stoppedHANodeIndexes.add(nodeIndex); - }; + const t = new HaFullTestContext(); beforeAll(async () => { - // Check required environment variables - if (!process.env.DATABASE_URL) { - throw new Error('DATABASE_URL environment variable must be set for HA tests'); - } - - web3SignerUrl = getWeb3SignerUrl(); - if (!web3SignerUrl) { - throw new Error('WEB3_SIGNER_URL environment variable must be set for HA tests'); - } - - // Setup database configuration - databaseConfig = createHADatabaseConfig('ha-full-test'); - - // Connect to database (migrations already run by docker-compose entrypoint) - mainPool = setupHADatabase(databaseConfig.databaseUrl.getValue()!); - - attesterPrivateKeys = Array.from( - { length: VALIDATOR_COUNT }, - (_, i) => `0x${getPrivateKeyFromIndex(i)!.toString('hex')}` as `0x${string}`, - ); - - publisherPrivateKeys = Array.from( - { length: NODE_COUNT }, - (_, i) => `0x${getPrivateKeyFromIndex(i + VALIDATOR_COUNT)!.toString('hex')}` as `0x${string}`, - ); - - const web3SignerDir = getWeb3SignerTestKeystoreDir(); - const allKeys = [...attesterPrivateKeys, ...publisherPrivateKeys]; - for (const key of allKeys) { - await createWeb3SignerKeystore(web3SignerDir, key); - } - - attesterAddresses = getAddressesFromPrivateKeys(attesterPrivateKeys); - - publisherAddresses = getAddressesFromPrivateKeys(publisherPrivateKeys); - - // Refresh Web3Signer to load all the keys (attesters + publishers) - await refreshWeb3Signer(web3SignerUrl, ...attesterAddresses, ...publisherAddresses); - - // Create database pools for HA nodes - haNodePools = Array.from( - { length: NODE_COUNT }, - () => new Pool({ connectionString: databaseConfig.databaseUrl.getValue()! }), - ); - - const initialValidators = createInitialValidatorsFromPrivateKeys(attesterPrivateKeys); - - const bootstrapP2PPort = await getFreeP2PPort(); - - ({ teardown, logger, wallet, aztecNode, config, accounts, dateProvider, deployL1ContractsValues, genesis } = - await setup( - // A single default initializerless account, created/funded/registered by setup with no on-chain - // deploy tx -- the bootstrap node can't build blocks (disableValidator), so the owner must be usable - // without one. - 1, - { - ...PIPELINING_SETUP_OPTS, - automineL1Setup: true, - initialValidators, - sequencerPublisherPrivateKeys: [new SecretValue(publisherPrivateKeys[0])], - aztecTargetCommitteeSize: COMMITTEE_SIZE, - // The full HA docker/Web3Signer stack can still be joining and syncing after the shared - // 12s pipelining preset's 2.5s start window has closed. Keep real sequencing, but give - // HA validators enough time to pass the enforced build-start gate in CI. - aztecSlotDuration: 16, - // This suite validates HA coordination on tx-bearing checkpoints. Requiring one tx avoids a startup empty - // checkpoint from occupying the shared HA publisher while the trigger tx is still being prepared. - minTxsPerBlock: 1, - archiverPollingIntervalMS: 200, - sequencerPollingIntervalMS: 200, - worldStateBlockCheckIntervalMS: 200, - blockCheckIntervalMS: 200, - startProverNode: true, - // The bootstrap node is only an RPC/P2P anchor. HA validators are the first block producers in this suite. - disableValidator: true, - // Enable P2P for transaction gossip - p2pEnabled: true, - // Bind the bootstrap node above the ephemeral range too (see getFreeP2PPort), so it can't lose - // its port to an ephemeral socket and abort the whole suite before any HA node is created. Set - // the broadcast port explicitly to the same value: discv5 otherwise defaults p2pBroadcastPort to - // p2pPort by mutating this config object in place, and that mutated value would then leak into the - // HA nodes' configs below (built by spreading `config`), making them advertise the wrong port. - p2pPort: bootstrapP2PPort, - p2pBroadcastPort: bootstrapP2PPort, - // Enable slashing for testing governance + slashing vote coordination - slasherEnabled: true, - slashingRoundSizeInEpochs: 1, // 32 slots (1 epoch) - slashingQuorum: 17, // >50% of 32 slots for tally quorum, - }, - { syncChainTip: 'proven' }, - )); - - ownerAddress = accounts[0]; - testContract = await registerTestContract(wallet); - - if (!dateProvider) { - throw new Error('dateProvider must be provided by setup for HA tests'); - } - - logger.info('Bootstrap node setup complete; funded initializerless account and test contract registered locally'); - - // Get bootstrap node's P2P ENR for HA nodes to connect to - const bootstrapNodeEnr = await aztecNode.getEncodedEnr(); - if (!bootstrapNodeEnr) { - throw new Error('Failed to get bootstrap node ENR - P2P may not be enabled'); - } - logger.info(`Bootstrap node ENR: ${bootstrapNodeEnr}`); - - // L1 contract wrappers for querying votes - governanceProposer = new GovernanceProposerContract( - deployL1ContractsValues.l1Client, - deployL1ContractsValues.l1ContractAddresses.governanceProposerAddress.toString(), - ); - logger.info('L1 contract wrappers initialized'); - - haNodeServices = []; - haKeystoreDirs = []; - logger.info(`Starting ${NODE_COUNT} HA peer nodes...`); - - // Per-node keystore: all attesters but only this node's publisher to avoid nonce conflicts. - // When keyStoreDirectory is set the node loads validators/publishers from file only, so we omit them from config. - initialKeystoreJsons = []; - - for (let i = 0; i < NODE_COUNT; i++) { - const nodeId = `${databaseConfig.nodeId}-${i + 1}`; - logger.info(`Starting HA peer node ${i} with nodeId: ${nodeId}`); - - const keystoreContent = { - schemaVersion: 1, - validators: [ - { - attester: attesterAddresses, - feeRecipient: AztecAddress.ZERO.toString(), - coinbase: EthAddress.fromString(attesterAddresses[0]).toChecksumString(), - remoteSigner: web3SignerUrl, - publisher: [publisherAddresses[i]], - }, - ], - }; - const keystoreJson = JSON.stringify(keystoreContent, null, 2); - initialKeystoreJsons.push(keystoreJson); - - const keystoreDir = await mkdtemp(join(tmpdir(), `ha-keystore-${i}-`)); - haKeystoreDirs.push(keystoreDir); - await writeFile(join(keystoreDir, 'keystore.json'), keystoreJson); - - const dataDirectory = config.dataDirectory ? `${config.dataDirectory}-${i}` : undefined; - - const nodeP2PPort = await getFreeP2PPort(); - const nodeConfig: AztecNodeConfig = { - ...config, - nodeId, - keyStoreDirectory: keystoreDir, - // Ensure txs are included in proposals to test full signing path - publishTxsWithProposals: true, - dataDirectory, - databaseUrl: databaseConfig.databaseUrl, - pollingIntervalMs: databaseConfig.pollingIntervalMs, - signingTimeoutMs: databaseConfig.signingTimeoutMs, - maxStuckDutiesAgeMs: databaseConfig.maxStuckDutiesAgeMs, - haSigningEnabled: true, - disableValidator: false, - // Enable P2P for transaction and block gossip - p2pEnabled: true, - // Each HA node gets its own free port above the ephemeral range. Override the broadcast port too: - // `...config` carries the bootstrap node's broadcast port (discv5 sets it in place), which would - // otherwise make every HA node advertise the bootstrap's port instead of its own. - p2pPort: nodeP2PPort, - p2pBroadcastPort: nodeP2PPort, - // Connect to bootstrap node for tx gossip - bootstrapNodes: [bootstrapNodeEnr], - web3SignerUrl, - }; - - const nodeService = await withLoggerBindings({ actor: `HA-${i}` }, async () => { - return await createAztecNodeService(nodeConfig, { dateProvider }, { genesis, dontStartSequencer: true }); - }); - - haNodeServices.push(nodeService); - logger.info(`HA peer node ${i} started successfully`); - } - - logger.info(`All ${NODE_COUNT} HA peer nodes started and coordinating via PostgreSQL database`); - logger.info('Waiting for HA peer nodes to join the tx gossip mesh'); - await retryUntil( - async () => { - const meshStates = await Promise.all( - haNodeServices.map(async (service, nodeIndex) => { - const p2p = service.getP2P(); - const [peers, txMeshPeerCount] = await Promise.all([ - p2p.getPeers(), - p2p.getGossipMeshPeerCount(TopicType.tx), - ]); - - return { nodeIndex, peerCount: peers.length, txMeshPeerCount }; - }), - ); - - logger.debug('HA tx gossip mesh status', { meshStates }); - return meshStates.every(({ peerCount, txMeshPeerCount }) => peerCount > 0 && txMeshPeerCount > 0) - ? true - : undefined; - }, - 'HA tx gossip mesh readiness', - 60, - 1, - ); - - // The owner is an initializerless account, so it needs no deployment tx -- it was funded at genesis - // and registered during setup, and is ready to transact as soon as the HA nodes start building blocks. - logger.info(`Test account ready at ${ownerAddress}`); + await t.setup(); }); afterAll(async () => { - // Stop all sequencers before tearing down the nodes: a sequencer stop awaits its in-flight - // iteration, which can spend tens of seconds finishing a vote or checkpoint publish on L1. - // Stops must be awaited fully — jest runs without forceExit, so a node abandoned mid-stop - // outlives the test environment and keeps the worker process alive until the CI job timeout. - // The dateProvider reset must wait until nodes are stopped: it rewinds the shared clock from - // chain time to wall time (minutes apart after the automine deploy burst), and any publisher - // deadline armed against the rewound clock would block shutdown until wall time catches up. - if (haNodeServices) { - await Promise.allSettled( - haNodeServices.map(async (service, i) => { - try { - await service.getSequencer()?.stop(); - } catch (error) { - logger.error(`Failed to stop sequencer of HA peer node ${i}: ${error}`); - } - }), - ); - await Promise.allSettled( - haNodeServices.map((_, i) => - stopHANode(i).catch(error => { - logger.error(`Failed to stop HA peer node ${i}: ${error}`); - }), - ), - ); - } - - dateProvider?.reset(); - - // Cleanup HA keystore temp directories - if (haKeystoreDirs) { - for (let i = 0; i < haKeystoreDirs.length; i++) { - try { - await rm(haKeystoreDirs[i], { recursive: true }); - } catch (error) { - logger.error(`Failed to remove HA keystore dir ${i}: ${error}`); - } - } - } - - // Cleanup HA resources (database pools, etc.) - if (haNodePools) { - for (const pool of haNodePools) { - try { - await pool.end(); - } catch (error) { - logger.error(`Failed to close HA node pool: ${error}`); - } - } - } - await cleanupHADatabase(mainPool, logger); - await mainPool.end(); - - // Cleanup bootstrap node and test infrastructure (this cleans up the shared data directory) - await teardown(); + await t.teardown(); }); afterEach(async () => { @@ -443,15 +49,12 @@ describe('HA Full Setup', () => { jest.restoreAllMocks(); // Clean up database state between tests - try { - await mainPool.query('DELETE FROM validator_duties'); - } catch (error) { - // Ignore cleanup errors (table might not exist on first run failure) - logger?.warn(`Failed to clean up validator_duties: ${error}`); - } + await t.resetDutiesTable(); }); it('should produce blocks with HA coordination and attestations', async () => { + const { logger, wallet, testContract, ownerAddress, aztecNode, mainPool, haNodeServices, startHASequencers } = t; + logger.info('Testing full HA setup: block production, attestations, and coordination'); // Send a tx to trigger block building. The account and contract are funded/registered at genesis, @@ -559,6 +162,9 @@ describe('HA Full Setup', () => { }); it('should coordinate governance voting across HA nodes', async () => { + const { logger, deployL1ContractsValues, haNodeServices, sendTriggerTx, aztecNode, governanceProposer, mainPool } = + t; + logger.info('Testing real governance voting with HA coordination'); const mockGovernancePayload = deployL1ContractsValues.l1ContractAddresses.governanceAddress; @@ -703,6 +309,18 @@ describe('HA Full Setup', () => { }); it('should reload keystore via admin API and keep building blocks after swapping attesters', async () => { + const { + logger, + attesterAddresses, + haKeystoreDirs, + web3SignerUrl, + publisherAddresses, + initialKeystoreJsons, + haNodeServices, + sendTriggerTx, + aztecNode, + } = t; + logger.info('Testing reloadKeystore: swap all attesters across HA nodes'); const groupA = attesterAddresses.slice(0, 2); @@ -777,423 +395,4 @@ describe('HA Full Setup', () => { } } }); - - // NOTE: this test needs to run last - it('should distribute work across multiple HA nodes', async () => { - logger.info('Testing HA resilience by killing nodes after they produce blocks'); - - // We'll produce NODE_COUNT blocks (5 total with NODE_COUNT=5) - // Each node produces exactly 1 block, and we kill it after it produces - // The last remaining node will produce the final block - const blockCount = NODE_COUNT; - const receipts = []; - const killedNodes: number[] = []; // Track indices of killed nodes - const blockProducers = new Map(); // Map block index to node ID - let previousBlockNumber: number | undefined; - - const nodeIds: string[] = []; - for (const service of haNodeServices) { - nodeIds.push((await service.getConfig()).nodeId); - } - - for (let i = 0; i < blockCount; i++) { - logger.info(`\n=== Producing block ${i + 1}/${blockCount} ===`); - logger.info(`Active nodes: ${haNodeServices.length - killedNodes.length}/${NODE_COUNT}`); - - const receipt = await sendTriggerTx(); - - expect(receipt.blockNumber).toBeDefined(); - - // Verify this transaction is in a different block than the previous one - if (previousBlockNumber !== undefined) { - expect(receipt.blockNumber).toBeGreaterThan(previousBlockNumber); - } - - previousBlockNumber = receipt.blockNumber; - receipts.push(receipt); - - // Find which node produced this block - const [block] = await aztecNode.getBlocks(receipt.blockNumber!, 1, { - includeL1PublishInfo: true, - includeAttestations: true, - includeTransactions: true, - onlyCheckpointed: true, - }); - if (!block) { - throw new Error(`Block ${receipt.blockNumber} not found`); - } - const slotNumber = BigInt(block.header.globalVariables.slotNumber); - const duties = await getValidatorDuties(mainPool, slotNumber); - const blockProposalDuty = duties.find(d => d.dutyType === 'BLOCK_PROPOSAL'); - - if (!blockProposalDuty) { - throw new Error(`No block proposal duty found for slot ${slotNumber}`); - } - - blockProducers.set(i, blockProposalDuty.nodeId); - logger.info(`Block ${receipt.blockNumber} produced by node ${blockProposalDuty.nodeId}`); - - const producerNodeId = blockProposalDuty.nodeId; - const producerNodeIndex = nodeIds.findIndex(nodeId => nodeId === producerNodeId); - - if (producerNodeIndex === -1) { - throw new Error(`Could not find active node with ID ${producerNodeId}`); - } - - // Kill the node that produced this block, unless it's the last block - if (i < blockCount - 1) { - logger.info(`Killing node ${producerNodeId} that produced this block`); - await stopHANode(producerNodeIndex); - killedNodes.push(producerNodeIndex); - } else { - // The final survivor is kept online for the slash-offense assertion below, but its sequencer - // is no longer needed. Stop it before running the remaining assertions so it cannot start a - // new empty checkpoint and then block service shutdown while awaiting a delayed L1 publish. - logger.info(`Last block produced; stopping sequencer for survivor ${producerNodeId}`); - await haNodeServices[producerNodeIndex].getSequencer()?.stop(); - } - - logger.info(`Block ${i + 1}/${blockCount} completed. Killed nodes: ${killedNodes.length}/${NODE_COUNT}`); - } - - // Verify we got the expected number of distinct blocks - const blockNumbers = receipts.map(r => r.blockNumber!).sort((a, b) => a - b); - const uniqueBlockNumbers = new Set(blockNumbers); - expect(uniqueBlockNumbers.size).toBe(blockCount); - logger.info(`Created ${uniqueBlockNumbers.size} distinct blocks: ${Array.from(uniqueBlockNumbers).join(', ')}`); - - // Verify each node produced at least 1 block - const nodeBlockCounts = new Map(); - for (const nodeId of blockProducers.values()) { - const count = nodeBlockCounts.get(nodeId) || 0; - nodeBlockCounts.set(nodeId, count + 1); - } - - logger.info(`Block production by node: ${JSON.stringify(Array.from(nodeBlockCounts.entries()))}`); - - // Verify: each node should have produced at least 1 block - // (there may be empty blocks produced during node transitions) - for (const [nodeId, count] of nodeBlockCounts.entries()) { - expect(count).toBeGreaterThanOrEqual(1); - logger.info(`Node ${nodeId} produced ${count} block(s) as expected`); - } - - // Verify all nodes participated (NODE_COUNT nodes total) - expect(nodeBlockCounts.size).toBe(NODE_COUNT); - logger.info(`All ${NODE_COUNT} nodes participated in block production`); - - // Verify no double-signing occurred across all blocks - const quorum = Math.floor((COMMITTEE_SIZE * 2) / 3) + 1; - for (const receipt of receipts) { - const [block] = await aztecNode.getBlocks(receipt.blockNumber!, 1, { - includeL1PublishInfo: true, - includeAttestations: true, - includeTransactions: true, - onlyCheckpointed: true, - }); - if (!block) { - throw new Error(`Block ${receipt.blockNumber} not found`); - } - const slotNumber = BigInt(block.header.globalVariables.slotNumber); - - // PRIMARY CHECK: Database records show all attestation duties attempted/completed - const duties = await getValidatorDuties(mainPool, slotNumber); - const attestationDuties = duties.filter(d => d.dutyType === 'ATTESTATION'); - - // Verify no duplicate attestation duties per validator (HA protection ensures 1 per validator) - const dutiesByValidator = verifyNoDuplicateAttestations(attestationDuties, logger); - expect(dutiesByValidator.size).toBeGreaterThanOrEqual(quorum); - logger.info( - `Block ${receipt.blockNumber}: Database shows ${dutiesByValidator.size} unique validators attested (quorum: ${quorum}), no double-signing detected in DB`, - ); - - // SECONDARY CHECK: Verify checkpoint attestations match database records - const [publishedCheckpoint] = await aztecNode.getCheckpoints(block.checkpointNumber, 1, { - includeAttestations: true, - }); - const attestationInfos = getAttestationInfoFromPublishedCheckpoint( - { - attestations: publishedCheckpoint.attestations ?? [], - checkpoint: new Checkpoint( - publishedCheckpoint.archive, - publishedCheckpoint.header, - [], - publishedCheckpoint.number, - publishedCheckpoint.feeAssetPriceModifier, - ), - }, - getSignatureContext(), - ); - - // Filter to only valid attestations with recovered addresses - const validAttestations = attestationInfos.filter( - (info: AttestationInfo) => info.status === 'recovered-from-signature' && info.address !== undefined, - ); - - // Verify checkpoint has exactly quorum attestations (trimmed to minimum required) - const checkpointValidatorAddresses = new Set(validAttestations.map(info => info.address!.toString())); - expect(checkpointValidatorAddresses.size).toBe(quorum); - - // Verify every validator in the checkpoint has a corresponding DB duty record - // (checkpoint is trimmed to quorum, so it's a subset of DB records) - for (const validatorAddress of checkpointValidatorAddresses) { - expect(dutiesByValidator.has(validatorAddress)).toBe(true); - } - } - - // GOSSIP-LAYER CHECK: each HA node's libp2p service detects when a signer attests to two - // distinct payloads at the same slot and fires `duplicateAttestationCallback` -> validator - // client emits WANT_TO_SLASH_EVENT -> SlashOffensesCollector persists a DUPLICATE_ATTESTATION - // offense. We assert no such offense (or DUPLICATE_PROPOSAL) was collected on any surviving - // HA node. Killed nodes are unreachable, but the surviving node — which has been alive the - // whole test — has observed all gossiped attestations and proposals across every slot. - const aliveNodes = haNodeServices.filter((_, idx) => !killedNodes.includes(idx)); - const allOffenses = (await Promise.all(aliveNodes.map(n => n.getSlashOffenses('all')))).flat(); - const equivocationOffenses = allOffenses.filter( - o => o.offenseType === OffenseType.DUPLICATE_ATTESTATION || o.offenseType === OffenseType.DUPLICATE_PROPOSAL, - ); - expect(equivocationOffenses).toEqual([]); - - await Promise.all(haNodeServices.map((_, nodeIndex) => stopHANode(nodeIndex))); - }); - - describe('Clock Skew and Timezone Safety', () => { - const rollupAddress = EthAddress.random(); - const validatorAddress = EthAddress.random(); - it('should not be affected by process.env.TZ changes', async () => { - const spDb = new PostgresSlashingProtectionDatabase(mainPool); - const originalTZ = process.env.TZ; - - try { - // Node 1 in UTC creates and signs a duty - process.env.TZ = 'UTC'; - const duty1 = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(100), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'node-utc', - }); - expect(duty1.isNew).toBe(true); - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(100), - DutyType.ATTESTATION, - '0xsig', - duty1.record.lockToken, - -1, - ); - - // Wait for real database time to pass (duties need different timestamps in PostgreSQL) - await sleep(100); - - // Node 2 in Tokyo creates and signs a duty at approximately the same time - process.env.TZ = 'Asia/Tokyo'; - const duty2 = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(101), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'node-tokyo', - }); - expect(duty2.isNew).toBe(true); - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(101), - DutyType.ATTESTATION, - '0xsig', - duty2.record.lockToken, - -1, - ); - - // Verify both duties were stored at correct absolute times (seconds apart, not hours) - const result = await mainPool.query<{ slot: string; unix_timestamp: string }>( - `SELECT slot, EXTRACT(EPOCH FROM started_at) as unix_timestamp - FROM validator_duties - WHERE slot IN ('100', '101') - ORDER BY slot DESC`, - ); - - const timestamp1 = parseFloat(result.rows[0].unix_timestamp); - const timestamp2 = parseFloat(result.rows[1].unix_timestamp); - const diffSeconds = Math.abs(timestamp1 - timestamp2); - - // Should be less than 10 seconds apart (not hours due to timezone interpretation) - expect(diffSeconds).toBeLessThan(10); - } finally { - process.env.TZ = originalTZ; - } - }); - - it('should not delete recent duties via cleanupOldDuties when node clock is ahead', async () => { - const spDb = new PostgresSlashingProtectionDatabase(mainPool); - - // Ensure clean slate for this test - await mainPool.query('DELETE FROM validator_duties WHERE slot = $1', ['200']); - - // Create and sign a duty using our actual methods - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(200), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'test-node', - }); - expect(duty.isNew).toBe(true); - - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(200), - DutyType.ATTESTATION, - '0xsig', - duty.record.lockToken, - -1, - ); - - // Verify duty exists before cleanup - const beforeCleanup = await mainPool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['200', validatorAddress.toString().toLowerCase()], - ); - expect(beforeCleanup.rows.length).toBe(1); - expect(beforeCleanup.rows[0].status).toBe('signed'); - - // Simulate node with clock 2 hours ahead using dateProvider - // NOTE: Database cleanup uses PostgreSQL's CURRENT_TIMESTAMP, not application time - // This test verifies that even if the application clock is skewed, cleanup - // correctly uses database time to determine duty age - dateProvider.setTime(Date.now() + 2 * 60 * 60 * 1000); // 2 hours ahead - - try { - // Use our actual cleanupOldDuties method - const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour - - // Should NOT delete the duty we just created (it uses DB's clock, not node's) - expect(numCleaned).toBe(0); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - - // Verify duty still exists - const result = await mainPool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['200', validatorAddress.toString().toLowerCase()], - ); - expect(result.rows.length).toBe(1); - }); - - it('should delete old duties via cleanupOldDuties based on DB time, not node time', async () => { - const spDb = new PostgresSlashingProtectionDatabase(mainPool); - - // Ensure clean slate for this test - await mainPool.query('DELETE FROM validator_duties WHERE slot = $1', ['300']); - - // Create and sign a duty using our actual methods - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(300), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'test-node', - }); - expect(duty.isNew).toBe(true); - - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(300), - DutyType.ATTESTATION, - '0xsig', - duty.record.lockToken, - -1, - ); - - // Manually backdate the duty to 2 hours old (simulating an old duty from DB's perspective) - const updateResult = await mainPool.query( - `UPDATE validator_duties - SET started_at = CURRENT_TIMESTAMP - INTERVAL '2 hours', - completed_at = CURRENT_TIMESTAMP - INTERVAL '2 hours' - WHERE slot = $1 AND validator_address = $2`, - ['300', validatorAddress.toString().toLowerCase()], - ); - expect(updateResult.rowCount).toBe(1); - - // Verify duty is backdated (should be ~2 hours old) - const beforeCleanup = await mainPool.query( - `SELECT *, EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - started_at)) as age_seconds - FROM validator_duties WHERE slot = $1`, - ['300'], - ); - expect(beforeCleanup.rows.length).toBe(1); - expect(beforeCleanup.rows[0].status).toBe('signed'); - expect(parseFloat(beforeCleanup.rows[0].age_seconds)).toBeGreaterThan(7000); // ~2 hours in seconds - - // Simulate node with clock 1 hour behind using - dateProvider.setTime(Date.now() - 1 * 60 * 60 * 1000); // 1 hour behind - - try { - // Use our actual cleanupOldDuties method - should delete based on DB time - const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour - expect(numCleaned).toBeGreaterThanOrEqual(1); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - - // Verify duty was deleted - const result = await mainPool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['300', validatorAddress.toString().toLowerCase()], - ); - expect(result.rows.length).toBe(0); - }); - - it('should not delete recent stuck duties via cleanupOwnStuckDuties when node clock is ahead', async () => { - const spDb = new PostgresSlashingProtectionDatabase(mainPool); - - // Create a signing duty (stuck, not completed) using our actual method - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(400), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'stuck-node', - }); - expect(duty.isNew).toBe(true); - // Don't call updateDutySigned - leave it in 'signing' state (stuck) - - // Simulate node with clock 3 hours ahead - dateProvider.setTime(Date.now() + 3 * 60 * 60 * 1000); // 3 hours ahead - - try { - // Use our actual cleanupOwnStuckDuties method - const numCleaned = await spDb.cleanupOwnStuckDuties('stuck-node', 60 * 60 * 1000); // 1 hour - - // Should NOT delete the duty (it uses DB's clock, not node's) - expect(numCleaned).toBe(0); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - }); - }); }); diff --git a/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts b/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts new file mode 100644 index 000000000000..02c9218e52da --- /dev/null +++ b/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts @@ -0,0 +1,463 @@ +/** + * Shared setup for the docker-compose HA full suite. + * + * Stands up the complete HA cluster used by `e2e_ha_full.parallel.test.ts` and + * `e2e_ha_distribute_work.parallel.test.ts`: a bootstrap RPC/P2P node plus NODE_COUNT in-proc + * `AztecNodeService` HA peers that share one PostgreSQL slashing-protection DB and a Web3Signer keystore. + * Requires the docker-compose HA suite (run_test.sh ha): live Postgres (DATABASE_URL) and Web3Signer + * sidecar. + * + * The suite is split across two files because the "distribute work" test kills nodes as it runs, leaving + * the cluster unusable; giving it its own file (its own cluster) removes the previous "must run last" + * ordering contract without changing what any test asserts. + */ +import { type AztecNodeConfig, AztecNodeService, createAztecNodeService } from '@aztec/aztec-node'; +import { AztecAddress, EthAddress } from '@aztec/aztec.js/addresses'; +import { NO_WAIT, getContractInstanceFromInstantiationParams } from '@aztec/aztec.js/contracts'; +import { Fr } from '@aztec/aztec.js/fields'; +import type { Logger } from '@aztec/aztec.js/log'; +import { type AztecNode, waitForTx } from '@aztec/aztec.js/node'; +import { GovernanceProposerContract } from '@aztec/ethereum/contracts'; +import type { DeployAztecL1ContractsReturnType } from '@aztec/ethereum/deploy-aztec-l1-contracts'; +import { SecretValue } from '@aztec/foundation/config'; +import { withLoggerBindings } from '@aztec/foundation/log/server'; +import { retryUntil } from '@aztec/foundation/retry'; +import type { TestDateProvider } from '@aztec/foundation/timer'; +import { TestContract } from '@aztec/noir-test-contracts.js/Test'; +import { TopicType } from '@aztec/stdlib/p2p'; +import { TxHash, type TxReceipt, TxStatus } from '@aztec/stdlib/tx'; +import type { GenesisData } from '@aztec/stdlib/world-state'; + +import getPort, { portNumbers } from 'get-port'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { Pool } from 'pg'; + +import { PIPELINING_SETUP_OPTS } from '../../fixtures/fixtures.js'; +import { + type HADatabaseConfig, + cleanupHADatabase, + createHADatabaseConfig, + createInitialValidatorsFromPrivateKeys, + getAddressesFromPrivateKeys, + setupHADatabase, +} from '../../fixtures/ha_setup.js'; +import { getPrivateKeyFromIndex, setup } from '../../fixtures/utils.js'; +import { + createWeb3SignerKeystore, + getWeb3SignerTestKeystoreDir, + getWeb3SignerUrl, + refreshWeb3Signer, +} from '../../fixtures/web3signer.js'; +import type { TestWallet } from '../../test-wallet/test_wallet.js'; +import { proveInteraction } from '../../test-wallet/utils.js'; + +export const NODE_COUNT = 5; +export const VALIDATOR_COUNT = 4; +export const COMMITTEE_SIZE = 4; + +// Allocate p2p listen ports from above the OS ephemeral range (Linux default tops out at 60999) so they +// never collide with an ephemeral socket the OS may already have handed out -- e.g. the in-process prover +// node (which listens on p2pPort 0) or any outbound connection. The previous fixed 4040x ports sat inside +// the ephemeral range, so an ephemeral socket occasionally held a node's port at bind time, surfacing as +// libp2p ERR_NO_VALID_ADDRESSES and aborting beforeAll. get-port also locks each returned port briefly, so +// concurrent calls within this process never hand back the same one. +const getFreeP2PPort = () => getPort({ port: portNumbers(61000, 65535) }); + +export async function registerTestContract(wallet: TestWallet): Promise { + const instance = await getContractInstanceFromInstantiationParams(TestContract.artifact, { + constructorArgs: [], + constructorArtifact: undefined, + salt: Fr.ZERO, + publicKeys: undefined, + deployer: undefined, + }); + await wallet.registerContract(instance, TestContract.artifact); + return TestContract.at(instance.address, wallet); +} + +export async function submitTriggerTx( + wallet: TestWallet, + testContract: TestContract, + from: AztecAddress, +): Promise { + const tx = await proveInteraction(wallet, testContract.methods.emit_nullifier(Fr.random()), { from }); + return await tx.send({ wait: NO_WAIT }); +} + +export async function waitForTriggerTx(node: AztecNode, txHash: TxHash): Promise { + const receipt = await waitForTx(node, txHash, { waitForStatus: TxStatus.CHECKPOINTED }); + if (!receipt.blockNumber) { + throw new Error('Trigger tx was checkpointed without a block number'); + } + return receipt; +} + +/** + * Owns the full HA cluster lifecycle and the shared helpers the two HA test files drive it with. Each + * test file constructs one instance and calls {@link setup} in `beforeAll` / {@link teardown} in + * `afterAll`; state fields are populated by {@link setup} and read directly by the tests. + */ +export class HaFullTestContext { + logger!: Logger; + wallet!: TestWallet; + ownerAddress!: AztecAddress; + testContract!: TestContract; + aztecNode!: AztecNode; + config!: AztecNodeConfig; + accounts!: AztecAddress[]; + dateProvider!: TestDateProvider; + genesis: GenesisData | undefined; + + haNodePools!: Pool[]; // Database pools for HA nodes (for cleanup) + haNodeServices!: AztecNodeService[]; // All N HA peer nodes + haKeystoreDirs!: string[]; + mainPool!: Pool; + databaseConfig!: HADatabaseConfig; + attesterPrivateKeys!: `0x${string}`[]; + attesterAddresses!: string[]; + publisherPrivateKeys!: `0x${string}`[]; + publisherAddresses!: string[]; + web3SignerUrl!: string; + deployL1ContractsValues!: DeployAztecL1ContractsReturnType; + governanceProposer!: GovernanceProposerContract; + /** Per-node initial keystore JSON (all 4 attesters, node's own publisher) for restore after reload test */ + initialKeystoreJsons!: string[]; + + private teardownBootstrap: () => Promise = async () => {}; + private haSequencersStarted = false; + private readonly stoppedHANodeIndexes = new Set(); + + getSignatureContext = () => ({ + chainId: this.config.l1ChainId, + rollupAddress: this.deployL1ContractsValues.l1ContractAddresses.rollupAddress, + }); + + startHASequencers = async () => { + if (this.haSequencersStarted) { + return; + } + + await Promise.all( + this.haNodeServices.map(async (service, i) => { + this.logger.info(`Starting HA peer node ${i} sequencer`); + await service.getSequencer()?.start(); + }), + ); + this.haSequencersStarted = true; + this.logger.info('All HA peer sequencers started'); + }; + + sendTriggerTx = async (): Promise => { + await this.startHASequencers(); + const txHash = await submitTriggerTx(this.wallet, this.testContract, this.ownerAddress); + return await waitForTriggerTx(this.aztecNode, txHash); + }; + + stopHANode = async (nodeIndex: number) => { + if (this.stoppedHANodeIndexes.has(nodeIndex)) { + return; + } + + this.logger.info(`Stopping HA peer node ${nodeIndex}`); + await this.haNodeServices[nodeIndex].stop(); + this.stoppedHANodeIndexes.add(nodeIndex); + }; + + async setup(): Promise { + // Check required environment variables + if (!process.env.DATABASE_URL) { + throw new Error('DATABASE_URL environment variable must be set for HA tests'); + } + + this.web3SignerUrl = getWeb3SignerUrl(); + if (!this.web3SignerUrl) { + throw new Error('WEB3_SIGNER_URL environment variable must be set for HA tests'); + } + + // Setup database configuration + this.databaseConfig = createHADatabaseConfig('ha-full-test'); + + // Connect to database (migrations already run by docker-compose entrypoint) + this.mainPool = setupHADatabase(this.databaseConfig.databaseUrl.getValue()!); + + this.attesterPrivateKeys = Array.from( + { length: VALIDATOR_COUNT }, + (_, i) => `0x${getPrivateKeyFromIndex(i)!.toString('hex')}` as `0x${string}`, + ); + + this.publisherPrivateKeys = Array.from( + { length: NODE_COUNT }, + (_, i) => `0x${getPrivateKeyFromIndex(i + VALIDATOR_COUNT)!.toString('hex')}` as `0x${string}`, + ); + + const web3SignerDir = getWeb3SignerTestKeystoreDir(); + const allKeys = [...this.attesterPrivateKeys, ...this.publisherPrivateKeys]; + for (const key of allKeys) { + await createWeb3SignerKeystore(web3SignerDir, key); + } + + this.attesterAddresses = getAddressesFromPrivateKeys(this.attesterPrivateKeys); + + this.publisherAddresses = getAddressesFromPrivateKeys(this.publisherPrivateKeys); + + // Refresh Web3Signer to load all the keys (attesters + publishers) + await refreshWeb3Signer(this.web3SignerUrl, ...this.attesterAddresses, ...this.publisherAddresses); + + // Create database pools for HA nodes + this.haNodePools = Array.from( + { length: NODE_COUNT }, + () => new Pool({ connectionString: this.databaseConfig.databaseUrl.getValue()! }), + ); + + const initialValidators = createInitialValidatorsFromPrivateKeys(this.attesterPrivateKeys); + + const bootstrapP2PPort = await getFreeP2PPort(); + + ({ + teardown: this.teardownBootstrap, + logger: this.logger, + wallet: this.wallet, + aztecNode: this.aztecNode, + config: this.config, + accounts: this.accounts, + dateProvider: this.dateProvider, + deployL1ContractsValues: this.deployL1ContractsValues, + genesis: this.genesis, + } = await setup( + // A single default initializerless account, created/funded/registered by setup with no on-chain + // deploy tx -- the bootstrap node can't build blocks (disableValidator), so the owner must be usable + // without one. + 1, + { + ...PIPELINING_SETUP_OPTS, + automineL1Setup: true, + initialValidators, + sequencerPublisherPrivateKeys: [new SecretValue(this.publisherPrivateKeys[0])], + aztecTargetCommitteeSize: COMMITTEE_SIZE, + // The full HA docker/Web3Signer stack can still be joining and syncing after the shared + // 12s pipelining preset's 2.5s start window has closed. Keep real sequencing, but give + // HA validators enough time to pass the enforced build-start gate in CI. + aztecSlotDuration: 16, + // This suite validates HA coordination on tx-bearing checkpoints. Requiring one tx avoids a startup empty + // checkpoint from occupying the shared HA publisher while the trigger tx is still being prepared. + minTxsPerBlock: 1, + archiverPollingIntervalMS: 200, + sequencerPollingIntervalMS: 200, + worldStateBlockCheckIntervalMS: 200, + blockCheckIntervalMS: 200, + startProverNode: true, + // The bootstrap node is only an RPC/P2P anchor. HA validators are the first block producers in this suite. + disableValidator: true, + // Enable P2P for transaction gossip + p2pEnabled: true, + // Bind the bootstrap node above the ephemeral range too (see getFreeP2PPort), so it can't lose + // its port to an ephemeral socket and abort the whole suite before any HA node is created. Set + // the broadcast port explicitly to the same value: discv5 otherwise defaults p2pBroadcastPort to + // p2pPort by mutating this config object in place, and that mutated value would then leak into the + // HA nodes' configs below (built by spreading `config`), making them advertise the wrong port. + p2pPort: bootstrapP2PPort, + p2pBroadcastPort: bootstrapP2PPort, + // Enable slashing for testing governance + slashing vote coordination + slasherEnabled: true, + slashingRoundSizeInEpochs: 1, // 32 slots (1 epoch) + slashingQuorum: 17, // >50% of 32 slots for tally quorum, + }, + { syncChainTip: 'proven' }, + )); + + this.ownerAddress = this.accounts[0]; + this.testContract = await registerTestContract(this.wallet); + + if (!this.dateProvider) { + throw new Error('dateProvider must be provided by setup for HA tests'); + } + + this.logger.info( + 'Bootstrap node setup complete; funded initializerless account and test contract registered locally', + ); + + // Get bootstrap node's P2P ENR for HA nodes to connect to + const bootstrapNodeEnr = await this.aztecNode.getEncodedEnr(); + if (!bootstrapNodeEnr) { + throw new Error('Failed to get bootstrap node ENR - P2P may not be enabled'); + } + this.logger.info(`Bootstrap node ENR: ${bootstrapNodeEnr}`); + + // L1 contract wrappers for querying votes + this.governanceProposer = new GovernanceProposerContract( + this.deployL1ContractsValues.l1Client, + this.deployL1ContractsValues.l1ContractAddresses.governanceProposerAddress.toString(), + ); + this.logger.info('L1 contract wrappers initialized'); + + this.haNodeServices = []; + this.haKeystoreDirs = []; + this.logger.info(`Starting ${NODE_COUNT} HA peer nodes...`); + + // Per-node keystore: all attesters but only this node's publisher to avoid nonce conflicts. + // When keyStoreDirectory is set the node loads validators/publishers from file only, so we omit them from config. + this.initialKeystoreJsons = []; + + for (let i = 0; i < NODE_COUNT; i++) { + const nodeId = `${this.databaseConfig.nodeId}-${i + 1}`; + this.logger.info(`Starting HA peer node ${i} with nodeId: ${nodeId}`); + + const keystoreContent = { + schemaVersion: 1, + validators: [ + { + attester: this.attesterAddresses, + feeRecipient: AztecAddress.ZERO.toString(), + coinbase: EthAddress.fromString(this.attesterAddresses[0]).toChecksumString(), + remoteSigner: this.web3SignerUrl, + publisher: [this.publisherAddresses[i]], + }, + ], + }; + const keystoreJson = JSON.stringify(keystoreContent, null, 2); + this.initialKeystoreJsons.push(keystoreJson); + + const keystoreDir = await mkdtemp(join(tmpdir(), `ha-keystore-${i}-`)); + this.haKeystoreDirs.push(keystoreDir); + await writeFile(join(keystoreDir, 'keystore.json'), keystoreJson); + + const dataDirectory = this.config.dataDirectory ? `${this.config.dataDirectory}-${i}` : undefined; + + const nodeP2PPort = await getFreeP2PPort(); + const nodeConfig: AztecNodeConfig = { + ...this.config, + nodeId, + keyStoreDirectory: keystoreDir, + // Ensure txs are included in proposals to test full signing path + publishTxsWithProposals: true, + dataDirectory, + databaseUrl: this.databaseConfig.databaseUrl, + pollingIntervalMs: this.databaseConfig.pollingIntervalMs, + signingTimeoutMs: this.databaseConfig.signingTimeoutMs, + maxStuckDutiesAgeMs: this.databaseConfig.maxStuckDutiesAgeMs, + haSigningEnabled: true, + disableValidator: false, + // Enable P2P for transaction and block gossip + p2pEnabled: true, + // Each HA node gets its own free port above the ephemeral range. Override the broadcast port too: + // `...config` carries the bootstrap node's broadcast port (discv5 sets it in place), which would + // otherwise make every HA node advertise the bootstrap's port instead of its own. + p2pPort: nodeP2PPort, + p2pBroadcastPort: nodeP2PPort, + // Connect to bootstrap node for tx gossip + bootstrapNodes: [bootstrapNodeEnr], + web3SignerUrl: this.web3SignerUrl, + }; + + const nodeService = await withLoggerBindings({ actor: `HA-${i}` }, async () => { + return await createAztecNodeService( + nodeConfig, + { dateProvider: this.dateProvider }, + { genesis: this.genesis, dontStartSequencer: true }, + ); + }); + + this.haNodeServices.push(nodeService); + this.logger.info(`HA peer node ${i} started successfully`); + } + + this.logger.info(`All ${NODE_COUNT} HA peer nodes started and coordinating via PostgreSQL database`); + this.logger.info('Waiting for HA peer nodes to join the tx gossip mesh'); + await retryUntil( + async () => { + const meshStates = await Promise.all( + this.haNodeServices.map(async (service, nodeIndex) => { + const p2p = service.getP2P(); + const [peers, txMeshPeerCount] = await Promise.all([ + p2p.getPeers(), + p2p.getGossipMeshPeerCount(TopicType.tx), + ]); + + return { nodeIndex, peerCount: peers.length, txMeshPeerCount }; + }), + ); + + this.logger.debug('HA tx gossip mesh status', { meshStates }); + return meshStates.every(({ peerCount, txMeshPeerCount }) => peerCount > 0 && txMeshPeerCount > 0) + ? true + : undefined; + }, + 'HA tx gossip mesh readiness', + 60, + 1, + ); + + // The owner is an initializerless account, so it needs no deployment tx -- it was funded at genesis + // and registered during setup, and is ready to transact as soon as the HA nodes start building blocks. + this.logger.info(`Test account ready at ${this.ownerAddress}`); + } + + async teardown(): Promise { + // Stop all sequencers before tearing down the nodes: a sequencer stop awaits its in-flight + // iteration, which can spend tens of seconds finishing a vote or checkpoint publish on L1. + // Stops must be awaited fully — jest runs without forceExit, so a node abandoned mid-stop + // outlives the test environment and keeps the worker process alive until the CI job timeout. + // The dateProvider reset must wait until nodes are stopped: it rewinds the shared clock from + // chain time to wall time (minutes apart after the automine deploy burst), and any publisher + // deadline armed against the rewound clock would block shutdown until wall time catches up. + if (this.haNodeServices) { + await Promise.allSettled( + this.haNodeServices.map(async (service, i) => { + try { + await service.getSequencer()?.stop(); + } catch (error) { + this.logger.error(`Failed to stop sequencer of HA peer node ${i}: ${error}`); + } + }), + ); + await Promise.allSettled( + this.haNodeServices.map((_, i) => + this.stopHANode(i).catch(error => { + this.logger.error(`Failed to stop HA peer node ${i}: ${error}`); + }), + ), + ); + } + + this.dateProvider?.reset(); + + // Cleanup HA keystore temp directories + if (this.haKeystoreDirs) { + for (let i = 0; i < this.haKeystoreDirs.length; i++) { + try { + await rm(this.haKeystoreDirs[i], { recursive: true }); + } catch (error) { + this.logger.error(`Failed to remove HA keystore dir ${i}: ${error}`); + } + } + } + + // Cleanup HA resources (database pools, etc.) + if (this.haNodePools) { + for (const pool of this.haNodePools) { + try { + await pool.end(); + } catch (error) { + this.logger.error(`Failed to close HA node pool: ${error}`); + } + } + } + await cleanupHADatabase(this.mainPool, this.logger); + await this.mainPool.end(); + + // Cleanup bootstrap node and test infrastructure (this cleans up the shared data directory) + await this.teardownBootstrap(); + } + + /** Clean up database state between tests. */ + async resetDutiesTable(): Promise { + try { + await this.mainPool.query('DELETE FROM validator_duties'); + } catch (error) { + // Ignore cleanup errors (table might not exist on first run failure) + this.logger?.warn(`Failed to clean up validator_duties: ${error}`); + } + } +} diff --git a/yarn-project/end-to-end/src/composed/integration_proof_verification.test.ts b/yarn-project/end-to-end/src/composed/integration_proof_verification.test.ts index f682cba794fc..64008d29bf9c 100644 --- a/yarn-project/end-to-end/src/composed/integration_proof_verification.test.ts +++ b/yarn-project/end-to-end/src/composed/integration_proof_verification.test.ts @@ -26,7 +26,14 @@ import { getLogger, startAnvil } from '../fixtures/utils.js'; */ // Standalone Honk proof verifier integration test. Starts its own anvil, deploys a HonkVerifier contract, // loads a serialised RootRollupPublicInputs fixture, and verifies the proof on-chain via BBCircuitVerifier. -// No Aztec node. Excluded from compose glob; requires a pre-generated proof fixture (AZTEC_GENERATE_TEST_DATA). +// No Aztec node. +// +// EXCLUDED from every CI test list (see bootstrap.sh) and does NOT run anywhere. The committed +// fixtures/dumps/epoch_proof_result.json is stale: it was last regenerated in Feb 2026, but the rollup +// circuits and verification key have changed since, so bb and the on-chain HonkVerifier both reject the +// proof ("Failed to verify RootRollupArtifact proof!"). Re-enabling it needs the fixture regenerated +// against the current circuits (see the command above) and is better relocated alongside the bb-prover +// circuit tests than kept here. describe('proof_verification', () => { let proof: Proof; let publicInputs: RootRollupPublicInputs; diff --git a/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts b/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts new file mode 100644 index 000000000000..33a6fee464a7 --- /dev/null +++ b/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts @@ -0,0 +1,285 @@ +/** + * Clock-skew and timezone safety for the HA slashing-protection database. + * + * These assertions were extracted from `composed/ha/e2e_ha_full.parallel.test.ts`, where each one used to + * ride the full 5-node Postgres/Web3Signer HA cluster just to poke the slashing-protection database + * directly. They do not need any node cluster — only a PostgreSQL-semantics database and a skewable clock. + * + * The properties under test are specific to `PostgresSlashingProtectionDatabase`: duty timestamps are + * stored in absolute time (immune to `process.env.TZ`), and the cleanup queries use the database clock + * (`CURRENT_TIMESTAMP`) rather than the node's application clock, so a node whose wall clock is skewed + * cannot delete duties it should keep or keep duties it should delete. Because the property is a + * PostgreSQL behavior, they cannot be recreated on the mock-gossip HA stack, whose shared + * slashing-protection DB is driven by the node's `dateProvider`. Instead we run against PGlite — real + * PostgreSQL compiled to WASM, running in-process — so the exact database semantics are preserved with no + * docker Postgres. The `dateProvider` here only simulates a skewed node clock; the database never reads + * it. + */ +import { BlockNumber, CheckpointNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { Buffer32 } from '@aztec/foundation/buffer'; +import { EthAddress } from '@aztec/foundation/eth-address'; +import { sleep } from '@aztec/foundation/sleep'; +import { TestDateProvider } from '@aztec/foundation/timer'; +import { + INSERT_SCHEMA_VERSION, + PostgresSlashingProtectionDatabase, + SCHEMA_SETUP, + SCHEMA_VERSION, +} from '@aztec/validator-ha-signer/db'; +import { Pool } from '@aztec/validator-ha-signer/test'; +import { type DutyRow, DutyType } from '@aztec/validator-ha-signer/types'; + +import { PGlite } from '@electric-sql/pglite'; +import { jest } from '@jest/globals'; + +describe('multi-node/high-availability/clock_skew', () => { + jest.setTimeout(60 * 1000); + + const rollupAddress = EthAddress.random(); + const validatorAddress = EthAddress.random(); + + let pglite: PGlite; + let pool: Pool; + let spDb: PostgresSlashingProtectionDatabase; + let dateProvider: TestDateProvider; + + beforeEach(async () => { + pglite = new PGlite(); + pool = new Pool({ pglite }); + for (const statement of SCHEMA_SETUP) { + await pglite.query(statement); + } + await pglite.query(INSERT_SCHEMA_VERSION, [SCHEMA_VERSION]); + spDb = new PostgresSlashingProtectionDatabase(pool); + dateProvider = new TestDateProvider(); + }); + + afterEach(async () => { + dateProvider.reset(); + await pool.end(); + await pglite.close(); + }); + + it('should not be affected by process.env.TZ changes', async () => { + const originalTZ = process.env.TZ; + + try { + // Node 1 in UTC creates and signs a duty + process.env.TZ = 'UTC'; + const duty1 = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(100), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'node-utc', + }); + expect(duty1.isNew).toBe(true); + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(100), + DutyType.ATTESTATION, + '0xsig', + duty1.record.lockToken, + -1, + ); + + // Wait for real database time to pass (duties need different timestamps in PostgreSQL) + await sleep(100); + + // Node 2 in Tokyo creates and signs a duty at approximately the same time + process.env.TZ = 'Asia/Tokyo'; + const duty2 = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(101), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'node-tokyo', + }); + expect(duty2.isNew).toBe(true); + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(101), + DutyType.ATTESTATION, + '0xsig', + duty2.record.lockToken, + -1, + ); + + // Verify both duties were stored at correct absolute times (seconds apart, not hours) + const result = await pool.query<{ slot: string; unix_timestamp: string }>( + `SELECT slot, EXTRACT(EPOCH FROM started_at) as unix_timestamp + FROM validator_duties + WHERE slot IN ('100', '101') + ORDER BY slot DESC`, + ); + + const timestamp1 = parseFloat(result.rows[0].unix_timestamp); + const timestamp2 = parseFloat(result.rows[1].unix_timestamp); + const diffSeconds = Math.abs(timestamp1 - timestamp2); + + // Should be less than 10 seconds apart (not hours due to timezone interpretation) + expect(diffSeconds).toBeLessThan(10); + } finally { + process.env.TZ = originalTZ; + } + }); + + it('should not delete recent duties via cleanupOldDuties when node clock is ahead', async () => { + // Create and sign a duty using our actual methods + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(200), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'test-node', + }); + expect(duty.isNew).toBe(true); + + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(200), + DutyType.ATTESTATION, + '0xsig', + duty.record.lockToken, + -1, + ); + + // Verify duty exists before cleanup + const beforeCleanup = await pool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['200', validatorAddress.toString().toLowerCase()], + ); + expect(beforeCleanup.rows.length).toBe(1); + expect(beforeCleanup.rows[0].status).toBe('signed'); + + // Simulate node with clock 2 hours ahead using dateProvider + // NOTE: Database cleanup uses PostgreSQL's CURRENT_TIMESTAMP, not application time + // This test verifies that even if the application clock is skewed, cleanup + // correctly uses database time to determine duty age + dateProvider.setTime(Date.now() + 2 * 60 * 60 * 1000); // 2 hours ahead + + try { + // Use our actual cleanupOldDuties method + const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour + + // Should NOT delete the duty we just created (it uses DB's clock, not node's) + expect(numCleaned).toBe(0); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + + // Verify duty still exists + const result = await pool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['200', validatorAddress.toString().toLowerCase()], + ); + expect(result.rows.length).toBe(1); + }); + + it('should delete old duties via cleanupOldDuties based on DB time, not node time', async () => { + // Create and sign a duty using our actual methods + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(300), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'test-node', + }); + expect(duty.isNew).toBe(true); + + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(300), + DutyType.ATTESTATION, + '0xsig', + duty.record.lockToken, + -1, + ); + + // Manually backdate the duty to 2 hours old (simulating an old duty from DB's perspective) + const updateResult = await pool.query( + `UPDATE validator_duties + SET started_at = CURRENT_TIMESTAMP - INTERVAL '2 hours', + completed_at = CURRENT_TIMESTAMP - INTERVAL '2 hours' + WHERE slot = $1 AND validator_address = $2`, + ['300', validatorAddress.toString().toLowerCase()], + ); + expect(updateResult.rowCount).toBe(1); + + // Verify duty is backdated (should be ~2 hours old) + const beforeCleanup = await pool.query( + `SELECT *, EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - started_at)) as age_seconds + FROM validator_duties WHERE slot = $1`, + ['300'], + ); + expect(beforeCleanup.rows.length).toBe(1); + expect(beforeCleanup.rows[0].status).toBe('signed'); + expect(parseFloat(beforeCleanup.rows[0].age_seconds)).toBeGreaterThan(7000); // ~2 hours in seconds + + // Simulate node with clock 1 hour behind using + dateProvider.setTime(Date.now() - 1 * 60 * 60 * 1000); // 1 hour behind + + try { + // Use our actual cleanupOldDuties method - should delete based on DB time + const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour + expect(numCleaned).toBeGreaterThanOrEqual(1); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + + // Verify duty was deleted + const result = await pool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['300', validatorAddress.toString().toLowerCase()], + ); + expect(result.rows.length).toBe(0); + }); + + it('should not delete recent stuck duties via cleanupOwnStuckDuties when node clock is ahead', async () => { + // Create a signing duty (stuck, not completed) using our actual method + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(400), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'stuck-node', + }); + expect(duty.isNew).toBe(true); + // Don't call updateDutySigned - leave it in 'signing' state (stuck) + + // Simulate node with clock 3 hours ahead + dateProvider.setTime(Date.now() + 3 * 60 * 60 * 1000); // 3 hours ahead + + try { + // Use our actual cleanupOwnStuckDuties method + const numCleaned = await spDb.cleanupOwnStuckDuties('stuck-node', 60 * 60 * 1000); // 1 hour + + // Should NOT delete the duty (it uses DB's clock, not node's) + expect(numCleaned).toBe(0); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + }); +}); diff --git a/yarn-project/yarn.lock b/yarn-project/yarn.lock index e29c09aba20b..7f2d91855403 100644 --- a/yarn-project/yarn.lock +++ b/yarn-project/yarn.lock @@ -1233,6 +1233,7 @@ __metadata: "@aztec/wallet-sdk": "workspace:^" "@aztec/wallets": "workspace:^" "@aztec/world-state": "workspace:^" + "@electric-sql/pglite": "npm:^0.3.14" "@iarna/toml": "npm:^2.2.5" "@jest/globals": "npm:^30.0.0" "@noble/curves": "npm:=1.0.0" From cfbcb0f7c578406b9f59386a36143dd1b95ee7c2 Mon Sep 17 00:00:00 2001 From: Santiago Palladino Date: Fri, 3 Jul 2026 19:52:37 -0300 Subject: [PATCH 2/2] test(e2e): keep clock-skew on composed HA postgres suite, drop distribute-work .parallel Address review feedback on the HA suite split: - Restore the "Clock Skew and Timezone Safety" describe into the composed e2e_ha_full.parallel.test.ts suite, where it runs against the cluster's real dockerized PostgreSQL slashing-protection DB. Timezone/clock divergence between the node and its database can only be reproduced against a genuine, separate Postgres process, so the in-process PGlite variant is removed and the @electric-sql/pglite dependency dropped from end-to-end. - Rename e2e_ha_distribute_work.parallel.test.ts to e2e_ha_distribute_work.test.ts: it has a single it(), so it needs no per-test .parallel container splitting. --- .test_patterns.yml | 2 +- yarn-project/end-to-end/package.json | 1 - ...test.ts => e2e_ha_distribute_work.test.ts} | 0 .../composed/ha/e2e_ha_full.parallel.test.ts | 261 +++++++++++++++- .../src/composed/ha/ha_full_setup.ts | 2 +- .../high-availability/clock_skew.test.ts | 285 ------------------ yarn-project/yarn.lock | 1 - 7 files changed, 257 insertions(+), 295 deletions(-) rename yarn-project/end-to-end/src/composed/ha/{e2e_ha_distribute_work.parallel.test.ts => e2e_ha_distribute_work.test.ts} (100%) delete mode 100644 yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts diff --git a/.test_patterns.yml b/.test_patterns.yml index e1ee02f76c9c..449f40c5cbc5 100644 --- a/.test_patterns.yml +++ b/.test_patterns.yml @@ -397,7 +397,7 @@ tests: owners: - *spyros - - regex: "yarn-project/end-to-end/scripts/run_test.sh ha src/composed/ha/e2e_ha_distribute_work.parallel.test.ts" + - regex: "yarn-project/end-to-end/scripts/run_test.sh ha src/composed/ha/e2e_ha_distribute_work.test.ts" owners: - *spyros diff --git a/yarn-project/end-to-end/package.json b/yarn-project/end-to-end/package.json index cb01ef53320d..6b21639a228f 100644 --- a/yarn-project/end-to-end/package.json +++ b/yarn-project/end-to-end/package.json @@ -68,7 +68,6 @@ "@aztec/wallet-sdk": "workspace:^", "@aztec/wallets": "workspace:^", "@aztec/world-state": "workspace:^", - "@electric-sql/pglite": "^0.3.14", "@iarna/toml": "^2.2.5", "@jest/globals": "^30.0.0", "@noble/curves": "=1.0.0", diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.test.ts similarity index 100% rename from yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.parallel.test.ts rename to yarn-project/end-to-end/src/composed/ha/e2e_ha_distribute_work.test.ts diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts index 52df5c030842..da7a812b2e07 100644 --- a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts +++ b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts @@ -5,17 +5,22 @@ * and Web3Signer for remote signing. Verifies that blocks are produced, * attestations are signed, and no double-signing occurs. * - * The cluster setup lives in `ha_full_setup.ts` and is shared with - * `e2e_ha_distribute_work.parallel.test.ts`. The node-killing "distribute work" resilience test lives in - * that separate file so it gets its own cluster instead of relying on running last, and the clock-skew / - * timezone DB assertions moved to the cheap in-process `multi-node/high-availability/clock_skew.test.ts`. + * The cluster setup lives in `ha_full_setup.ts` and is shared with `e2e_ha_distribute_work.test.ts`. The + * node-killing "distribute work" resilience test lives in that separate file so it gets its own cluster + * instead of relying on running last. The clock-skew / timezone DB assertions stay here in the nested + * `Clock Skew and Timezone Safety` describe: they exercise the cluster's real dockerized PostgreSQL + * slashing-protection database, and timezone/clock divergence between the node and its database can only + * be reproduced against a genuine, separate Postgres process (not an in-process one). */ import { AztecAddress, EthAddress } from '@aztec/aztec.js/addresses'; -import { SlotNumber } from '@aztec/foundation/branded-types'; +import { BlockNumber, CheckpointNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { Buffer32 } from '@aztec/foundation/buffer'; import { retryUntil } from '@aztec/foundation/retry'; +import { sleep } from '@aztec/foundation/sleep'; import { GovernanceProposerAbi } from '@aztec/l1-artifacts/GovernanceProposerAbi'; import type { ValidatorClient } from '@aztec/validator-client'; -import { type DutyRow, DutyStatus } from '@aztec/validator-ha-signer/types'; +import { PostgresSlashingProtectionDatabase } from '@aztec/validator-ha-signer/db'; +import { type DutyRow, DutyStatus, DutyType } from '@aztec/validator-ha-signer/types'; import { jest } from '@jest/globals'; import { writeFile } from 'node:fs/promises'; @@ -395,4 +400,248 @@ describe('HA Full Setup', () => { } } }); + + describe('Clock Skew and Timezone Safety', () => { + const rollupAddress = EthAddress.random(); + const validatorAddress = EthAddress.random(); + it('should not be affected by process.env.TZ changes', async () => { + const { mainPool } = t; + const spDb = new PostgresSlashingProtectionDatabase(mainPool); + const originalTZ = process.env.TZ; + + try { + // Node 1 in UTC creates and signs a duty + process.env.TZ = 'UTC'; + const duty1 = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(100), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'node-utc', + }); + expect(duty1.isNew).toBe(true); + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(100), + DutyType.ATTESTATION, + '0xsig', + duty1.record.lockToken, + -1, + ); + + // Wait for real database time to pass (duties need different timestamps in PostgreSQL) + await sleep(100); + + // Node 2 in Tokyo creates and signs a duty at approximately the same time + process.env.TZ = 'Asia/Tokyo'; + const duty2 = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(101), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'node-tokyo', + }); + expect(duty2.isNew).toBe(true); + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(101), + DutyType.ATTESTATION, + '0xsig', + duty2.record.lockToken, + -1, + ); + + // Verify both duties were stored at correct absolute times (seconds apart, not hours) + const result = await mainPool.query<{ slot: string; unix_timestamp: string }>( + `SELECT slot, EXTRACT(EPOCH FROM started_at) as unix_timestamp + FROM validator_duties + WHERE slot IN ('100', '101') + ORDER BY slot DESC`, + ); + + const timestamp1 = parseFloat(result.rows[0].unix_timestamp); + const timestamp2 = parseFloat(result.rows[1].unix_timestamp); + const diffSeconds = Math.abs(timestamp1 - timestamp2); + + // Should be less than 10 seconds apart (not hours due to timezone interpretation) + expect(diffSeconds).toBeLessThan(10); + } finally { + process.env.TZ = originalTZ; + } + }); + + it('should not delete recent duties via cleanupOldDuties when node clock is ahead', async () => { + const { mainPool, dateProvider } = t; + const spDb = new PostgresSlashingProtectionDatabase(mainPool); + + // Ensure clean slate for this test + await mainPool.query('DELETE FROM validator_duties WHERE slot = $1', ['200']); + + // Create and sign a duty using our actual methods + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(200), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'test-node', + }); + expect(duty.isNew).toBe(true); + + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(200), + DutyType.ATTESTATION, + '0xsig', + duty.record.lockToken, + -1, + ); + + // Verify duty exists before cleanup + const beforeCleanup = await mainPool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['200', validatorAddress.toString().toLowerCase()], + ); + expect(beforeCleanup.rows.length).toBe(1); + expect(beforeCleanup.rows[0].status).toBe('signed'); + + // Simulate node with clock 2 hours ahead using dateProvider + // NOTE: Database cleanup uses PostgreSQL's CURRENT_TIMESTAMP, not application time + // This test verifies that even if the application clock is skewed, cleanup + // correctly uses database time to determine duty age + dateProvider.setTime(Date.now() + 2 * 60 * 60 * 1000); // 2 hours ahead + + try { + // Use our actual cleanupOldDuties method + const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour + + // Should NOT delete the duty we just created (it uses DB's clock, not node's) + expect(numCleaned).toBe(0); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + + // Verify duty still exists + const result = await mainPool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['200', validatorAddress.toString().toLowerCase()], + ); + expect(result.rows.length).toBe(1); + }); + + it('should delete old duties via cleanupOldDuties based on DB time, not node time', async () => { + const { mainPool, dateProvider } = t; + const spDb = new PostgresSlashingProtectionDatabase(mainPool); + + // Ensure clean slate for this test + await mainPool.query('DELETE FROM validator_duties WHERE slot = $1', ['300']); + + // Create and sign a duty using our actual methods + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(300), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'test-node', + }); + expect(duty.isNew).toBe(true); + + await spDb.updateDutySigned( + rollupAddress, + validatorAddress, + SlotNumber(300), + DutyType.ATTESTATION, + '0xsig', + duty.record.lockToken, + -1, + ); + + // Manually backdate the duty to 2 hours old (simulating an old duty from DB's perspective) + const updateResult = await mainPool.query( + `UPDATE validator_duties + SET started_at = CURRENT_TIMESTAMP - INTERVAL '2 hours', + completed_at = CURRENT_TIMESTAMP - INTERVAL '2 hours' + WHERE slot = $1 AND validator_address = $2`, + ['300', validatorAddress.toString().toLowerCase()], + ); + expect(updateResult.rowCount).toBe(1); + + // Verify duty is backdated (should be ~2 hours old) + const beforeCleanup = await mainPool.query( + `SELECT *, EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - started_at)) as age_seconds + FROM validator_duties WHERE slot = $1`, + ['300'], + ); + expect(beforeCleanup.rows.length).toBe(1); + expect(beforeCleanup.rows[0].status).toBe('signed'); + expect(parseFloat(beforeCleanup.rows[0].age_seconds)).toBeGreaterThan(7000); // ~2 hours in seconds + + // Simulate node with clock 1 hour behind using + dateProvider.setTime(Date.now() - 1 * 60 * 60 * 1000); // 1 hour behind + + try { + // Use our actual cleanupOldDuties method - should delete based on DB time + const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour + expect(numCleaned).toBeGreaterThanOrEqual(1); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + + // Verify duty was deleted + const result = await mainPool.query( + `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, + ['300', validatorAddress.toString().toLowerCase()], + ); + expect(result.rows.length).toBe(0); + }); + + it('should not delete recent stuck duties via cleanupOwnStuckDuties when node clock is ahead', async () => { + const { mainPool, dateProvider } = t; + const spDb = new PostgresSlashingProtectionDatabase(mainPool); + + // Create a signing duty (stuck, not completed) using our actual method + const duty = await spDb.tryInsertOrGetExisting({ + rollupAddress, + validatorAddress, + slot: SlotNumber(400), + blockNumber: BlockNumber(0), + checkpointNumber: CheckpointNumber(0), + dutyType: DutyType.ATTESTATION, + messageHash: Buffer32.random().toString(), + nodeId: 'stuck-node', + }); + expect(duty.isNew).toBe(true); + // Don't call updateDutySigned - leave it in 'signing' state (stuck) + + // Simulate node with clock 3 hours ahead + dateProvider.setTime(Date.now() + 3 * 60 * 60 * 1000); // 3 hours ahead + + try { + // Use our actual cleanupOwnStuckDuties method + const numCleaned = await spDb.cleanupOwnStuckDuties('stuck-node', 60 * 60 * 1000); // 1 hour + + // Should NOT delete the duty (it uses DB's clock, not node's) + expect(numCleaned).toBe(0); + } finally { + // Reset dateProvider back to real time + dateProvider.reset(); + } + }); + }); }); diff --git a/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts b/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts index 02c9218e52da..78e6a4090283 100644 --- a/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts +++ b/yarn-project/end-to-end/src/composed/ha/ha_full_setup.ts @@ -2,7 +2,7 @@ * Shared setup for the docker-compose HA full suite. * * Stands up the complete HA cluster used by `e2e_ha_full.parallel.test.ts` and - * `e2e_ha_distribute_work.parallel.test.ts`: a bootstrap RPC/P2P node plus NODE_COUNT in-proc + * `e2e_ha_distribute_work.test.ts`: a bootstrap RPC/P2P node plus NODE_COUNT in-proc * `AztecNodeService` HA peers that share one PostgreSQL slashing-protection DB and a Web3Signer keystore. * Requires the docker-compose HA suite (run_test.sh ha): live Postgres (DATABASE_URL) and Web3Signer * sidecar. diff --git a/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts b/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts deleted file mode 100644 index 33a6fee464a7..000000000000 --- a/yarn-project/end-to-end/src/multi-node/high-availability/clock_skew.test.ts +++ /dev/null @@ -1,285 +0,0 @@ -/** - * Clock-skew and timezone safety for the HA slashing-protection database. - * - * These assertions were extracted from `composed/ha/e2e_ha_full.parallel.test.ts`, where each one used to - * ride the full 5-node Postgres/Web3Signer HA cluster just to poke the slashing-protection database - * directly. They do not need any node cluster — only a PostgreSQL-semantics database and a skewable clock. - * - * The properties under test are specific to `PostgresSlashingProtectionDatabase`: duty timestamps are - * stored in absolute time (immune to `process.env.TZ`), and the cleanup queries use the database clock - * (`CURRENT_TIMESTAMP`) rather than the node's application clock, so a node whose wall clock is skewed - * cannot delete duties it should keep or keep duties it should delete. Because the property is a - * PostgreSQL behavior, they cannot be recreated on the mock-gossip HA stack, whose shared - * slashing-protection DB is driven by the node's `dateProvider`. Instead we run against PGlite — real - * PostgreSQL compiled to WASM, running in-process — so the exact database semantics are preserved with no - * docker Postgres. The `dateProvider` here only simulates a skewed node clock; the database never reads - * it. - */ -import { BlockNumber, CheckpointNumber, SlotNumber } from '@aztec/foundation/branded-types'; -import { Buffer32 } from '@aztec/foundation/buffer'; -import { EthAddress } from '@aztec/foundation/eth-address'; -import { sleep } from '@aztec/foundation/sleep'; -import { TestDateProvider } from '@aztec/foundation/timer'; -import { - INSERT_SCHEMA_VERSION, - PostgresSlashingProtectionDatabase, - SCHEMA_SETUP, - SCHEMA_VERSION, -} from '@aztec/validator-ha-signer/db'; -import { Pool } from '@aztec/validator-ha-signer/test'; -import { type DutyRow, DutyType } from '@aztec/validator-ha-signer/types'; - -import { PGlite } from '@electric-sql/pglite'; -import { jest } from '@jest/globals'; - -describe('multi-node/high-availability/clock_skew', () => { - jest.setTimeout(60 * 1000); - - const rollupAddress = EthAddress.random(); - const validatorAddress = EthAddress.random(); - - let pglite: PGlite; - let pool: Pool; - let spDb: PostgresSlashingProtectionDatabase; - let dateProvider: TestDateProvider; - - beforeEach(async () => { - pglite = new PGlite(); - pool = new Pool({ pglite }); - for (const statement of SCHEMA_SETUP) { - await pglite.query(statement); - } - await pglite.query(INSERT_SCHEMA_VERSION, [SCHEMA_VERSION]); - spDb = new PostgresSlashingProtectionDatabase(pool); - dateProvider = new TestDateProvider(); - }); - - afterEach(async () => { - dateProvider.reset(); - await pool.end(); - await pglite.close(); - }); - - it('should not be affected by process.env.TZ changes', async () => { - const originalTZ = process.env.TZ; - - try { - // Node 1 in UTC creates and signs a duty - process.env.TZ = 'UTC'; - const duty1 = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(100), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'node-utc', - }); - expect(duty1.isNew).toBe(true); - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(100), - DutyType.ATTESTATION, - '0xsig', - duty1.record.lockToken, - -1, - ); - - // Wait for real database time to pass (duties need different timestamps in PostgreSQL) - await sleep(100); - - // Node 2 in Tokyo creates and signs a duty at approximately the same time - process.env.TZ = 'Asia/Tokyo'; - const duty2 = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(101), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'node-tokyo', - }); - expect(duty2.isNew).toBe(true); - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(101), - DutyType.ATTESTATION, - '0xsig', - duty2.record.lockToken, - -1, - ); - - // Verify both duties were stored at correct absolute times (seconds apart, not hours) - const result = await pool.query<{ slot: string; unix_timestamp: string }>( - `SELECT slot, EXTRACT(EPOCH FROM started_at) as unix_timestamp - FROM validator_duties - WHERE slot IN ('100', '101') - ORDER BY slot DESC`, - ); - - const timestamp1 = parseFloat(result.rows[0].unix_timestamp); - const timestamp2 = parseFloat(result.rows[1].unix_timestamp); - const diffSeconds = Math.abs(timestamp1 - timestamp2); - - // Should be less than 10 seconds apart (not hours due to timezone interpretation) - expect(diffSeconds).toBeLessThan(10); - } finally { - process.env.TZ = originalTZ; - } - }); - - it('should not delete recent duties via cleanupOldDuties when node clock is ahead', async () => { - // Create and sign a duty using our actual methods - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(200), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'test-node', - }); - expect(duty.isNew).toBe(true); - - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(200), - DutyType.ATTESTATION, - '0xsig', - duty.record.lockToken, - -1, - ); - - // Verify duty exists before cleanup - const beforeCleanup = await pool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['200', validatorAddress.toString().toLowerCase()], - ); - expect(beforeCleanup.rows.length).toBe(1); - expect(beforeCleanup.rows[0].status).toBe('signed'); - - // Simulate node with clock 2 hours ahead using dateProvider - // NOTE: Database cleanup uses PostgreSQL's CURRENT_TIMESTAMP, not application time - // This test verifies that even if the application clock is skewed, cleanup - // correctly uses database time to determine duty age - dateProvider.setTime(Date.now() + 2 * 60 * 60 * 1000); // 2 hours ahead - - try { - // Use our actual cleanupOldDuties method - const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour - - // Should NOT delete the duty we just created (it uses DB's clock, not node's) - expect(numCleaned).toBe(0); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - - // Verify duty still exists - const result = await pool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['200', validatorAddress.toString().toLowerCase()], - ); - expect(result.rows.length).toBe(1); - }); - - it('should delete old duties via cleanupOldDuties based on DB time, not node time', async () => { - // Create and sign a duty using our actual methods - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(300), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'test-node', - }); - expect(duty.isNew).toBe(true); - - await spDb.updateDutySigned( - rollupAddress, - validatorAddress, - SlotNumber(300), - DutyType.ATTESTATION, - '0xsig', - duty.record.lockToken, - -1, - ); - - // Manually backdate the duty to 2 hours old (simulating an old duty from DB's perspective) - const updateResult = await pool.query( - `UPDATE validator_duties - SET started_at = CURRENT_TIMESTAMP - INTERVAL '2 hours', - completed_at = CURRENT_TIMESTAMP - INTERVAL '2 hours' - WHERE slot = $1 AND validator_address = $2`, - ['300', validatorAddress.toString().toLowerCase()], - ); - expect(updateResult.rowCount).toBe(1); - - // Verify duty is backdated (should be ~2 hours old) - const beforeCleanup = await pool.query( - `SELECT *, EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - started_at)) as age_seconds - FROM validator_duties WHERE slot = $1`, - ['300'], - ); - expect(beforeCleanup.rows.length).toBe(1); - expect(beforeCleanup.rows[0].status).toBe('signed'); - expect(parseFloat(beforeCleanup.rows[0].age_seconds)).toBeGreaterThan(7000); // ~2 hours in seconds - - // Simulate node with clock 1 hour behind using - dateProvider.setTime(Date.now() - 1 * 60 * 60 * 1000); // 1 hour behind - - try { - // Use our actual cleanupOldDuties method - should delete based on DB time - const numCleaned = await spDb.cleanupOldDuties(60 * 60 * 1000); // 1 hour - expect(numCleaned).toBeGreaterThanOrEqual(1); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - - // Verify duty was deleted - const result = await pool.query( - `SELECT * FROM validator_duties WHERE slot = $1 AND validator_address = $2`, - ['300', validatorAddress.toString().toLowerCase()], - ); - expect(result.rows.length).toBe(0); - }); - - it('should not delete recent stuck duties via cleanupOwnStuckDuties when node clock is ahead', async () => { - // Create a signing duty (stuck, not completed) using our actual method - const duty = await spDb.tryInsertOrGetExisting({ - rollupAddress, - validatorAddress, - slot: SlotNumber(400), - blockNumber: BlockNumber(0), - checkpointNumber: CheckpointNumber(0), - dutyType: DutyType.ATTESTATION, - messageHash: Buffer32.random().toString(), - nodeId: 'stuck-node', - }); - expect(duty.isNew).toBe(true); - // Don't call updateDutySigned - leave it in 'signing' state (stuck) - - // Simulate node with clock 3 hours ahead - dateProvider.setTime(Date.now() + 3 * 60 * 60 * 1000); // 3 hours ahead - - try { - // Use our actual cleanupOwnStuckDuties method - const numCleaned = await spDb.cleanupOwnStuckDuties('stuck-node', 60 * 60 * 1000); // 1 hour - - // Should NOT delete the duty (it uses DB's clock, not node's) - expect(numCleaned).toBe(0); - } finally { - // Reset dateProvider back to real time - dateProvider.reset(); - } - }); -}); diff --git a/yarn-project/yarn.lock b/yarn-project/yarn.lock index 7f2d91855403..e29c09aba20b 100644 --- a/yarn-project/yarn.lock +++ b/yarn-project/yarn.lock @@ -1233,7 +1233,6 @@ __metadata: "@aztec/wallet-sdk": "workspace:^" "@aztec/wallets": "workspace:^" "@aztec/world-state": "workspace:^" - "@electric-sql/pglite": "npm:^0.3.14" "@iarna/toml": "npm:^2.2.5" "@jest/globals": "npm:^30.0.0" "@noble/curves": "npm:=1.0.0"