diff --git a/ci3/run_compose_test b/ci3/run_compose_test index 105584084376..c257062ff46d 100755 --- a/ci3/run_compose_test +++ b/ci3/run_compose_test @@ -45,4 +45,8 @@ docker logs -f "$cid" & # Block until the target container exits; exit status is the container's exit code. rc=$(docker wait "$cid") +# Give the backgrounded `docker logs -f` a moment to flush the container's final output +# (e.g. the trailing local-network logs) before the EXIT trap tears the container down. +sleep 1 + exit "$rc" diff --git a/yarn-project/end-to-end/bootstrap.sh b/yarn-project/end-to-end/bootstrap.sh index ec37e3445067..18fd910e9372 100755 --- a/yarn-project/end-to-end/bootstrap.sh +++ b/yarn-project/end-to-end/bootstrap.sh @@ -34,12 +34,12 @@ function test_cmds { fi echo "$prefix:TIMEOUT=25m:NAME=e2e_block_building $(set_dump_avm e2e_block_building) $run_test_script simple e2e_block_building" echo "$prefix:TIMEOUT=30m:NAME=e2e_avm_simulator $(set_dump_avm e2e_avm_simulator) $run_test_script simple src/e2e_avm_simulator.test.ts" - - + echo "$prefix:TIMEOUT=15m:NAME=e2e_epochs/epochs_long_proving_time $run_test_script simple src/e2e_epochs/epochs_long_proving_time.test.ts" local tests=( # List all standalone and nested tests, except for the ones listed above. - src/e2e_!(prover)/*.test.ts + src/e2e_!(prover|epochs)/*.test.ts + src/e2e_epochs/!(epochs_long_proving_time).test.ts src/e2e_p2p/reqresp/*.test.ts src/e2e_!(block_building|avm_simulator).test.ts ) @@ -106,12 +106,8 @@ function test_cmds { # compose-based tests with custom scripts for flow in ../cli-wallet/test/flows/*.sh; do # Note these scripts are ran directly by docker-compose.yml because it ends in '.sh'. - # Set LOG_LEVEL=info for a better output experience. Deeper debugging should happen with other e2e tests. 
- if [[ "$flow" == *private_transfer.sh ]]; then - echo "$hash:ONLY_TERM_PARENT=1 LOG_LEVEL=info LOCAL_NETWORK_LOG_LEVEL='info; debug:p2p,sequencer,archiver,world-state,aztec-node' $run_test_script compose $flow" - else - echo "$hash:ONLY_TERM_PARENT=1 LOG_LEVEL=info $run_test_script compose $flow" - fi + # Run at LOG_LEVEL=verbose so the captured local-network logs are detailed enough for diagnostics. + echo "$hash:ONLY_TERM_PARENT=1 LOG_LEVEL=verbose $run_test_script compose $flow" done } diff --git a/yarn-project/end-to-end/scripts/docker-compose.yml b/yarn-project/end-to-end/scripts/docker-compose.yml index 5c11c48d26c5..ec34d2d12f15 100644 --- a/yarn-project/end-to-end/scripts/docker-compose.yml +++ b/yarn-project/end-to-end/scripts/docker-compose.yml @@ -19,13 +19,11 @@ services: working_dir: /root/aztec-packages/yarn-project/aztec entrypoint: > bash -c ' - export LOG_LEVEL="$${LOCAL_NETWORK_LOG_LEVEL:-$${LOG_LEVEL:-verbose}}" set -o pipefail node ./dest/bin start --local-network 2>&1 | tee /logs/local-network.log ' environment: LOG_LEVEL: ${LOG_LEVEL:-verbose} - LOCAL_NETWORK_LOG_LEVEL: ${LOCAL_NETWORK_LOG_LEVEL:-} ETHEREUM_HOSTS: http://fork:8545 L1_CHAIN_ID: 31337 FORCE_COLOR: ${FORCE_COLOR:-1} @@ -87,11 +85,9 @@ services: while kill -0 -$$pgid 2>/dev/null; do sleep 0.1; done wait $$pid rc=$$? 
- if [ $$rc -ne 0 ]; then - echo "===== local-network logs =====" - cat /logs/local-network.log || true - echo "===== end local-network logs =====" - fi + echo "===== local-network logs =====" + cat /logs/local-network.log || true + echo "===== end local-network logs =====" exit $$rc ' depends_on: diff --git a/yarn-project/end-to-end/scripts/run_test.sh b/yarn-project/end-to-end/scripts/run_test.sh index f466b3d65a5e..7f5ca7b8219a 100755 --- a/yarn-project/end-to-end/scripts/run_test.sh +++ b/yarn-project/end-to-end/scripts/run_test.sh @@ -18,7 +18,8 @@ case "$type" in ;; "compose") # TODO: Replace this file with test_simple.sh, and just emit the below as part of test_cmds. - TEST=$test exec run_compose_test $test end-to-end $PWD + # Remove volumes on cleanup so the local-network logs volume doesn't persist across runs. + TEST=$test REMOVE_COMPOSE_VOLUMES=1 exec run_compose_test $test end-to-end $PWD ;; "web3signer") TEST=$test exec run_compose_test $test end-to-end $PWD/web3signer diff --git a/yarn-project/end-to-end/src/e2e_cross_chain_messaging/cross_chain_messaging_test.ts b/yarn-project/end-to-end/src/e2e_cross_chain_messaging/cross_chain_messaging_test.ts index df33b44eebab..8b1555a732f3 100644 --- a/yarn-project/end-to-end/src/e2e_cross_chain_messaging/cross_chain_messaging_test.ts +++ b/yarn-project/end-to-end/src/e2e_cross_chain_messaging/cross_chain_messaging_test.ts @@ -15,11 +15,13 @@ import { deployL1Contract } from '@aztec/ethereum/deploy-l1-contract'; import { pickL1ContractAddresses } from '@aztec/ethereum/l1-contract-addresses'; import type { ExtendedViemWalletClient } from '@aztec/ethereum/types'; import { EpochNumber } from '@aztec/foundation/branded-types'; +import { retryUntil } from '@aztec/foundation/retry'; import { sleep } from '@aztec/foundation/sleep'; import { TestERC20Abi, TestERC20Bytecode } from '@aztec/l1-artifacts'; import { TokenContract } from '@aztec/noir-contracts.js/Token'; import { TokenBridgeContract } from 
'@aztec/noir-contracts.js/TokenBridge'; import type { PXEConfig } from '@aztec/pxe/server'; +import { getEpochAtSlot } from '@aztec/stdlib/epoch-helpers'; import type { AztecNodeAdmin } from '@aztec/stdlib/interfaces/client'; import { MNEMONIC } from '../fixtures/fixtures.js'; @@ -102,6 +104,15 @@ export class CrossChainMessagingTest { fundSponsoredFPC: true, skipAccountDeployment: true, l1ContractsArgs: { ...this.deployL1ContractsArgs, ...opts.l1ContractsArgs }, + // `advanceToEpochProven` warps anvil's L1 clock forward by up to a full epoch in one + // step. The prover-node tracks L1 time via `dateProvider.setTime(...)`, so any + // in-flight tx-gather sees its deadline jump into the past and short-circuits. Use + // a generous gather window so the deadline survives the warp. + proverNodeConfig: { + ...this.setupOptions.proverNodeConfig, + ...opts.proverNodeConfig, + txGatheringTimeoutMs: opts.proverNodeConfig?.txGatheringTimeoutMs ?? 10 * 60 * 1000, + }, }, { ...this.pxeOpts, ...pxeOpts }, ); @@ -110,7 +121,14 @@ export class CrossChainMessagingTest { async advanceToEpochProven(l2TxReceipt: TxReceipt): Promise { const block = await this.aztecNode.getBlock(l2TxReceipt.blockNumber!); - const epoch = await this.rollup.getEpochNumberForCheckpoint(block!.checkpointNumber); + const cp = await retryUntil( + async () => (await this.aztecNode.getCheckpoints(block!.checkpointNumber, 1))[0], + `archiver indexes checkpoint ${block!.checkpointNumber}`, + 120, + 0.5, + ); + const epochDuration = await this.rollup.getEpochDuration(); + const epoch = getEpochAtSlot(cp.header.slotNumber, { epochDuration }); // Warp to the next epoch. await this.cheatCodes.rollup.advanceToEpoch(EpochNumber(epoch + 1)); // Wait for the tx to be proven. 
diff --git a/yarn-project/end-to-end/src/e2e_cross_chain_messaging/l2_to_l1.test.ts b/yarn-project/end-to-end/src/e2e_cross_chain_messaging/l2_to_l1.test.ts index e423ca41a5c0..36a8864f0f5b 100644 --- a/yarn-project/end-to-end/src/e2e_cross_chain_messaging/l2_to_l1.test.ts +++ b/yarn-project/end-to-end/src/e2e_cross_chain_messaging/l2_to_l1.test.ts @@ -10,7 +10,7 @@ import { type Sequencer, type SequencerEvents, SequencerState } from '@aztec/seq import { computeL2ToL1MessageHash } from '@aztec/stdlib/hash'; import type { AztecNode, AztecNodeAdmin } from '@aztec/stdlib/interfaces/client'; import { type L2ToL1MembershipWitness, getL2ToL1MessageLeafId } from '@aztec/stdlib/messaging'; -import { type TxHash, TxStatus } from '@aztec/stdlib/tx'; +import { TxExecutionResult, type TxHash, TxStatus } from '@aztec/stdlib/tx'; import { jest } from '@jest/globals'; import { type Hex, decodeEventLog } from 'viem'; @@ -114,6 +114,73 @@ describe('e2e_cross_chain_messaging l2_to_l1', () => { await expectConsumeMessageToSucceed(messages[1], txReceipt.txHash); }); + // A message-bearing tx that gets reorged out of its checkpoint and remined into a fresh + // one must still prove correctly — the message has to follow the tx into its new home and + // end up in the epoch out-hash. A successful outbox consume after `advanceToEpochProven` + // proves the message survived the reorg+remine all the way through to a valid epoch proof. + it('proves an L2-to-L1 message whose tx is reorged out and remined', async () => { + const recipient = msgSender; + const content = Fr.random(); + const message = makeL2ToL1Message(recipient, content); + + // One tx per block so the message-bearing tx owns its checkpoint. + await aztecNodeAdmin.setConfig({ minTxsPerBlock: 1 }); + await waitForSequencerIdle(t.context.sequencer!.getSequencer()); + + // Send the message-bearing tx and note where it first landed. 
+ const { receipt: txReceipt } = await contract.methods + .create_l2_to_l1_message_arbitrary_recipient_private(content, recipient) + .send({ from: user1Address }); + const originalBlock = (await aztecNode.getBlock(txReceipt.blockNumber!))!; + const originalCheckpoint = originalBlock.checkpointNumber; + t.logger.info(`Message tx landed in checkpoint ${originalCheckpoint} (block ${txReceipt.blockNumber})`); + + // Reorg L1 deeply enough to drop the L1 block that published this checkpoint. + const [cp] = await aztecNode.getCheckpoints(originalCheckpoint, 1, { includeL1PublishInfo: true }); + if (!cp.l1.published) { + throw new Error(`Expected checkpoint ${originalCheckpoint} to have L1 publish info`); + } + const checkpointL1Block = Number(cp.l1.blockNumber); + const currentL1Block = await t.context.cheatCodes.eth.blockNumber(); + const reorgDepth = currentL1Block - checkpointL1Block + 1; + t.logger.info(`Reorging ${reorgDepth} L1 blocks to remove checkpoint ${originalCheckpoint}`); + await t.context.cheatCodes.eth.reorgWithReplacement(reorgDepth); + + // The node detects the prune and drops back below the reorged-out checkpoint. + await retryUntil( + () => aztecNode.getCheckpointNumber('checkpointed').then(cpNum => cpNum < originalCheckpoint), + 'node detects reorg', + 60, + 0.5, + ); + t.logger.info(`Node observed the reorg removing checkpoint ${originalCheckpoint}`); + + // The tx returns to the mempool and is remined. Poll for a successful receipt whose + // checkpoint is at or beyond the reorged-out one (i.e. the freshly-mined instance, + // not a stale read of the removed block). + const reminedReceipt = await retryUntil( + async () => { + const r = await aztecNode.getTxReceipt(txReceipt.txHash); + if (r.executionResult !== TxExecutionResult.SUCCESS || !r.blockNumber) { + return undefined; + } + const block = await aztecNode.getBlock(r.blockNumber); + return block && block.checkpointNumber >= originalCheckpoint ? 
r : undefined; + }, + 'tx remined after reorg', + 120, + 0.5, + ); + const reminedBlock = (await aztecNode.getBlock(reminedReceipt.blockNumber!))!; + t.logger.info( + `Message tx remined into checkpoint ${reminedBlock.checkpointNumber} (block ${reminedReceipt.blockNumber})`, + ); + + // Prove the epoch containing the remined tx, then consume its message from the outbox. + await t.advanceToEpochProven(reminedReceipt); + await expectConsumeMessageToSucceed(message, txReceipt.txHash); + }); + // When the block contains a tx with no messages, the zero txOutHash is skipped and won't be included in the top tree. // In this test, we test that the correct tree class is used, and the final out hash equals the only message leaf. it('2 txs in the same block, one with no messages, one with a message', async () => { diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_long_proving_time.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_long_proving_time.test.ts index 63d442847e40..0f8d966a1061 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_long_proving_time.test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_long_proving_time.test.ts @@ -6,7 +6,9 @@ import { jest } from '@jest/globals'; import { EpochsTestContext } from './epochs_test.js'; -jest.setTimeout(1000 * 60 * 10); +jest.setTimeout(1000 * 60 * 15); + +const MAX_JOB_COUNT = 20; describe('e2e_epochs/epochs_long_proving_time', () => { let logger: Logger; @@ -24,11 +26,14 @@ describe('e2e_epochs/epochs_long_proving_time', () => { const { aztecSlotDuration } = EpochsTestContext.getSlotDurations({ aztecEpochDuration }); const epochDurationInSeconds = aztecSlotDuration * aztecEpochDuration; const proverTestDelayMs = (epochDurationInSeconds * 1000 * 3) / 4; + // Each epoch takes ~3 epochs to prove, so the broker needs to keep results for + // at least that many epochs to avoid rejecting jobs as stale. 
test = await EpochsTestContext.setup({ aztecEpochDuration, aztecProofSubmissionEpochs: 1000, // Effectively don't re-org proverTestDelayMs, - proverNodeMaxPendingJobs: 1, // We test for only a single job at once + proverNodeMaxPendingJobs: MAX_JOB_COUNT, // Prove multiple epochs concurrently + proverBrokerMaxEpochsToKeepResultsFor: 10, }); ({ logger, monitor, L1_BLOCK_TIME_IN_S } = test); logger.warn(`Initialized with prover delay set to ${proverTestDelayMs}ms (epoch is ${epochDurationInSeconds}s)`); @@ -58,10 +63,7 @@ describe('e2e_epochs/epochs_long_proving_time', () => { // At least 3 epochs should have passed after the proven one (though we add a -1 just in case) expect(monitor.checkpointNumber).toBeGreaterThanOrEqual(targetProvenEpochs * test.epochDuration * 3 - 1); - // We expect maxJobCount to equal 1, since the prover node epoch monitor defines an epoch as ready to be proven - // only if the previous one has already been proven. We can relax this check if we want to support multiple epochs - // to be proven in parallel, in which case we should update the assertion below. 
- expect(maxJobCount).toEqual(1); - logger.info(`Test succeeded`); + expect(maxJobCount).toBeLessThanOrEqual(MAX_JOB_COUNT); + logger.info(`Test succeeded, max prover jobs ${maxJobCount}`); }); }); diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_multi_proof.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_multi_proof.test.ts index 377d8b2672a8..ff9802cc5a5d 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_multi_proof.test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_multi_proof.test.ts @@ -46,18 +46,18 @@ describe('e2e_epochs/epochs_multi_proof', () => { // This prevents the race condition where multiple provers submit to L1 at the same time test.proverNodes.forEach((proverAztecNode, index) => { const proverManager = proverAztecNode.getProverNode()!.getProver(); - const origCreateEpochProver = proverManager.createEpochProver.bind(proverManager); - proverManager.createEpochProver = () => { - const epochProver = origCreateEpochProver(); - const origFinalizeEpoch = epochProver.finalizeEpoch.bind(epochProver); - epochProver.finalizeEpoch = async () => { - const result = await origFinalizeEpoch(); + const origCreateTopTree = proverManager.createTopTreeOrchestrator.bind(proverManager); + proverManager.createTopTreeOrchestrator = () => { + const topTree = origCreateTopTree(); + const origProve = topTree.prove.bind(topTree); + topTree.prove = async (...args: Parameters) => { + const result = await origProve(...args); const sleepTime = index * 1000 * test.constants.ethereumSlotDuration; - logger.warn(`Delaying finalizeEpoch for prover node ${index} by ${sleepTime}ms`); + logger.warn(`Delaying top-tree prove for prover node ${index} by ${sleepTime}ms`); await sleep(sleepTime); return result; }; - return epochProver; + return topTree; }; }); diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_optimistic_proving.parallel.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_optimistic_proving.parallel.test.ts new file mode 100644 
index 000000000000..27c0e2dd2e97 --- /dev/null +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_optimistic_proving.parallel.test.ts @@ -0,0 +1,852 @@ +import type { Logger } from '@aztec/aztec.js/log'; +import { RollupContract } from '@aztec/ethereum/contracts'; +import { BlockNumber, CheckpointNumber, EpochNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { Fr } from '@aztec/foundation/curves/bn254'; +import { retryUntil } from '@aztec/foundation/retry'; +import type { TestProverNode } from '@aztec/prover-node/test'; +import { getEpochAtSlot, getSlotRangeForEpoch } from '@aztec/stdlib/epoch-helpers'; +import type { AztecNode } from '@aztec/stdlib/interfaces/server'; +import { TxExecutionResult } from '@aztec/stdlib/tx'; + +import { expect, jest } from '@jest/globals'; + +import type { EndToEndContext } from '../fixtures/utils.js'; +import { proveInteraction } from '../test-wallet/utils.js'; +import { EpochsTestContext } from './epochs_test.js'; + +jest.setTimeout(1000 * 60 * 20); + +/** + * E2E tests for optimistic (checkpoint-driven) proving with reorg scenarios. + */ +describe('e2e_epochs/epochs_optimistic_proving', () => { + let context: EndToEndContext; + let rollup: RollupContract; + let logger: Logger; + let node: AztecNode; + + let L2_SLOT_DURATION_IN_S: number; + + let test: EpochsTestContext; + + const getCheckpointNumber = (n: AztecNode) => n.getCheckpointNumber('checkpointed'); + + /** + * Looks up the epoch a given checkpoint sits in by reading its slot from the archiver. + */ + const epochOfCheckpoint = async (cpNumber: CheckpointNumber, timeoutSeconds = 30): Promise => { + const cp = await retryUntil( + async () => (await node.getCheckpoints(cpNumber, 1))[0], + `archiver indexes checkpoint ${cpNumber}`, + timeoutSeconds, + 0.1, + ); + return getEpochAtSlot(cp.header.slotNumber, test.constants); + }; + + /** Returns the last block number contained in the given checkpoint. 
*/ + const lastBlockOfCheckpoint = async (cpNumber: CheckpointNumber): Promise => { + const cp = await retryUntil( + async () => (await node.getCheckpoints(cpNumber, 1))[0], + `archiver indexes checkpoint ${cpNumber}`, + 30, + 0.1, + ); + return BlockNumber(cp.startBlock + cp.blockCount - 1); + }; + + /** + * Returns the canonical checkpoint numbers that fall within `epoch`, considering checkpoints + * `1..upTo`. Retries until the archiver has indexed the whole range so the count is stable. + */ + const checkpointsInEpoch = async (epoch: EpochNumber, upTo: CheckpointNumber): Promise => { + const cps = await retryUntil( + async () => { + const all = await node.getCheckpoints(CheckpointNumber(1), Number(upTo)); + return all.length >= Number(upTo) ? all : undefined; + }, + `archiver indexes checkpoints up to ${upTo}`, + 30, + 0.2, + ); + return cps.filter(cp => getEpochAtSlot(cp.header.slotNumber, test.constants) === epoch).map(cp => cp.number); + }; + + /** + * Background sampler proving the prover-node works an epoch *optimistically* — i.e. it + * spawns a checkpoint's sub-tree before the epoch is over on L1, not just after the + * last checkpoint lands. + * + * The check has to be more than "a session exists for the epoch": full sessions only + * open once the epoch is complete on L1, and even a non-optimistic prover would start + * the moment the epoch's last checkpoint is pushed (a few L1 slots before the epoch's + * final L2 slot). So instead we watch the long-lived `CheckpointStore`: at each tick we + * record, per epoch, the earliest wall-clock slot at which *some* `CheckpointProver` + * for that epoch has been registered. Sub-trees are spawned at registration, so this + * slot is strictly before the epoch's last slot when optimistic proving is active. + */ + const startMidEpochProvingSampler = (proverNode: TestProverNode) => { + /** epoch -> earliest wall-clock slot at which a CheckpointProver for that epoch was registered. 
*/ + const provingStartedAtSlot = new Map(); + let stopped = false; + const loop = (async () => { + while (!stopped) { + const { epoch, slot } = test.epochCache.getEpochAndSlotNow(); + const hasProverThisEpoch = proverNode + .getCheckpointStore() + .listAll() + .some(p => p.epochNumber === epoch); + if (hasProverThisEpoch && !provingStartedAtSlot.has(epoch)) { + provingStartedAtSlot.set(epoch, slot); + } + await new Promise(resolve => setTimeout(resolve, 100)); + } + })(); + return async () => { + stopped = true; + await loop; + return provingStartedAtSlot; + }; + }; + + /** Asserts a CheckpointProver for `epoch` was registered before the epoch's last L2 slot. */ + const expectOptimisticProving = (provingStartedAtSlot: Map, epoch: EpochNumber) => { + const observedSlot = provingStartedAtSlot.get(epoch); + const [, lastSlot] = getSlotRangeForEpoch(epoch, test.constants); + expect(observedSlot).toBeDefined(); + expect(observedSlot!).toBeLessThan(lastSlot); + }; + + afterEach(async () => { + await test.teardown(); + }); + + describe('happy path', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ numberOfAccounts: 1 }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('proves an epoch via checkpoint-driven flow', async () => { + const proverNode = test.proverNodes[0].getProverNode() as TestProverNode; + const stopSampler = startMidEpochProvingSampler(proverNode); + + // Land a real tx in the epoch so we prove actual tx effects, not just empty blocks. 
+ const contract = await test.registerTestContract(context.wallet); + const provenTx = await proveInteraction(context.wallet, contract.methods.emit_nullifier(new Fr(1)), { + from: context.accounts[0], + }); + const txReceipt = await provenTx.send(); + const txCheckpoint = (await node.getBlock(txReceipt.blockNumber!))!.checkpointNumber; + const txEpoch = await epochOfCheckpoint(txCheckpoint); + logger.info(`Tx ${txReceipt.txHash} landed in checkpoint ${txCheckpoint} (epoch ${txEpoch})`); + + logger.info(`Waiting for epoch ${txEpoch} to end`); + await test.waitUntilEpochStarts(txEpoch + 1); + const epochEndCheckpointNumber = (await test.monitor.run(true)).checkpointNumber; + logger.info(`Epoch ${txEpoch} ended with checkpoint number ${epochEndCheckpointNumber}`); + expect(epochEndCheckpointNumber).toBeGreaterThanOrEqual(txCheckpoint); + + await test.waitUntilProvenCheckpointNumber(epochEndCheckpointNumber, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(txCheckpoint); + + await test.waitForNodeToSync(await lastBlockOfCheckpoint(epochEndCheckpointNumber), 'proven'); + + // The tx is in a proven block. + expect((await node.getTxReceipt(txReceipt.txHash)).executionResult).toEqual(TxExecutionResult.SUCCESS); + + // A CheckpointProver for the epoch was registered before the epoch's last slot — i.e. + // the prover-node started proving optimistically rather than waiting for the epoch to end. 
+ const provingStartedAtSlot = await stopSampler(); + logger.info(`Optimistic proving start slots by epoch: ${JSON.stringify([...provingStartedAtSlot])}`); + expectOptimisticProving(provingStartedAtSlot, txEpoch); + }); + + it('proves multiple epochs via checkpoint-driven flow', async () => { + const proverNode = test.proverNodes[0].getProverNode() as TestProverNode; + const stopSampler = startMidEpochProvingSampler(proverNode); + const contract = await test.registerTestContract(context.wallet); + + const numEpochs = 4; + const provenEpochs: EpochNumber[] = []; + for (let i = 0; i < numEpochs; i++) { + // Land a real tx (distinct nullifier per iteration) in the current epoch. + const provenTx = await proveInteraction(context.wallet, contract.methods.emit_nullifier(new Fr(i + 1)), { + from: context.accounts[0], + }); + const txReceipt = await provenTx.send(); + const txCheckpoint = (await node.getBlock(txReceipt.blockNumber!))!.checkpointNumber; + const txEpoch = await epochOfCheckpoint(txCheckpoint); + provenEpochs.push(txEpoch); + logger.info(`Tx ${txReceipt.txHash} landed in checkpoint ${txCheckpoint} (epoch ${txEpoch})`); + + logger.info(`Waiting for epoch ${txEpoch} to end`); + await test.waitUntilEpochStarts(txEpoch + 1); + const cp = (await test.monitor.run(true)).checkpointNumber; + expect(cp).toBeGreaterThanOrEqual(txCheckpoint); + + await test.waitUntilProvenCheckpointNumber(cp, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(txCheckpoint); + + await test.waitForNodeToSync(await lastBlockOfCheckpoint(cp), 'proven'); + expect((await node.getTxReceipt(txReceipt.txHash)).executionResult).toEqual(TxExecutionResult.SUCCESS); + } + + // Every epoch the prover-node proved should have had a CheckpointProver registered + // before the epoch's last slot — i.e. proving started mid-epoch, not after. 
+ const provingStartedAtSlot = await stopSampler(); + logger.info(`Optimistic proving start slots by epoch: ${JSON.stringify([...provingStartedAtSlot])}`); + for (const epoch of provenEpochs) { + expectOptimisticProving(provingStartedAtSlot, epoch); + } + }); + }); + + describe('mid-epoch checkpoint reorg with replacement', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + // Use a longer epoch so the replacement checkpoint has room to land in the same + // epoch after a reorg. With epochDuration=4 the sequencer's "prepare one slot + // ahead" pattern, plus any L1-tx slip, pushes the replacement past the epoch + // boundary (see CI failure on `+2` reorg, replacement landed two slots into the + // next epoch). + aztecEpochDuration: 8, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + inboxLag: 2, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('replaces a reorged checkpoint and proves the epoch', async () => { + const proverNode = test.proverNodes[0].getProverNode() as TestProverNode; + + // Anchor on a freshly-started epoch so we have enough slots for the replacement + // to land in the same epoch after the reorg — `waitUntilEpochStarts(1)` would + // return immediately under CI load and leave us with no slack. + await test.waitUntilNextEpochStarts(); + + // Wait for the 2nd checkpoint within this epoch. 
+ const initialCheckpoint = (await test.monitor.run(true)).checkpointNumber; + const midCheckpoint = CheckpointNumber(initialCheckpoint + 2); + await test.waitUntilCheckpointNumber(midCheckpoint, L2_SLOT_DURATION_IN_S * 6); + const checkpointBeforeReorg = test.monitor.checkpointNumber; + logger.info(`Reached checkpoint ${checkpointBeforeReorg}`); + + // Capture the epoch of the checkpoint we're about to reorg out — we can't look it + // up after the reorg removes it from the archiver. The replacement must land in the + // same epoch for this test to be exercising what it claims. + const epochBeforeReorg = await epochOfCheckpoint(checkpointBeforeReorg); + + // The prover-node must have started a sub-tree for the checkpoint we're about to + // reorg out — otherwise this test could pass simply because the prover hadn't begun + // assembling the checkpoint yet. Capture the prover's slot so we can identify the + // original after the reorg even if the replacement reuses the same checkpoint number. + const originalProver = await retryUntil( + () => + Promise.resolve( + proverNode + .getCheckpointStore() + .listAll() + .find(p => p.checkpoint.number === checkpointBeforeReorg), + ), + `prover starts sub-tree for checkpoint ${checkpointBeforeReorg}`, + 30, + 0.2, + ); + const originalSlot = originalProver.slotNumber; + logger.info(`Prover started sub-tree for checkpoint ${checkpointBeforeReorg} at slot ${originalSlot}`); + + // Stop block production. + await context.aztecNodeAdmin!.setConfig({ skipPublishingCheckpointsPercent: 100 }); + + // Reorg L1 to remove the last checkpoint. 
+ logger.info(`Reorging L1 to remove checkpoint ${checkpointBeforeReorg}`); + await context.cheatCodes.eth.reorgWithReplacement(1); + + const afterReorgCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(afterReorgCheckpoint).toBeLessThan(checkpointBeforeReorg); + logger.info(`After reorg: checkpoint ${afterReorgCheckpoint} (was ${checkpointBeforeReorg})`); + + // Verify node detects the reorg. + await retryUntil( + () => getCheckpointNumber(node).then(cp => cp <= afterReorgCheckpoint), + 'reorg detected', + 30, + 0.5, + ); + + // Verify the prover-node observes the prune. `markPruned()` fires reactively when + // the L2BlockStream emits the prune; the SlotWatcher then reaps the (now pruned) + // prover on its next tick (default 1s), so checking strictly for `isPruned()` would + // race against the reap. Identify the original by `(checkpointNumber, slot)` — + // checkpoint numbers refill sequentially after a reorg, so the replacement reuses + // the same number but lives at a different slot. Accept either state for the + // original: still in the store and pruned, or already reaped. + await retryUntil( + () => { + const prover = proverNode + .getCheckpointStore() + .listAll() + .find(p => p.checkpoint.number === checkpointBeforeReorg && p.slotNumber === originalSlot); + return Promise.resolve(!prover || prover.isPruned()); + }, + `prover marks original checkpoint ${checkpointBeforeReorg} (slot ${originalSlot}) as pruned (or reaps it)`, + 30, + 0.2, + ); + + // Resume block production — sequencer proposes a replacement in the next slot. + logger.info('Resuming block production for replacement checkpoint'); + await context.aztecNodeAdmin!.setConfig({ skipPublishingCheckpointsPercent: 0 }); + + // After the L1 reorg, anvil's L1 clock can drift relative to the prior schedule, which + // sometimes makes the first replacement publish tx timeout and retry in a later slot. + // Give the wait enough headroom (~half an epoch) for that retry path. 
+ const replacementCheckpoint = CheckpointNumber(afterReorgCheckpoint + 1); + await test.waitUntilCheckpointNumber(replacementCheckpoint, L2_SLOT_DURATION_IN_S * 6); + logger.info(`Replacement checkpoint ${replacementCheckpoint} published`); + + // The replacement must land in the same epoch as the reorged-out checkpoint — + // otherwise we'd be testing a fresh epoch, not a re-created one (A-1046). + const currentEpoch = await epochOfCheckpoint(replacementCheckpoint); + expect(currentEpoch).toEqual(epochBeforeReorg); + + // The prover-node must have a sub-tree for the replacement checkpoint — i.e. it + // re-created its work for epoch X after the prune (A-1046: checkpoint arrives → + // removed → new checkpoint for the same epoch → proves with the new one). + await retryUntil( + () => + Promise.resolve( + proverNode + .getCheckpointStore() + .listAll() + .some(p => p.checkpoint.number === replacementCheckpoint && !p.isPruned()), + ), + `prover re-creates sub-tree for replacement checkpoint ${replacementCheckpoint}`, + 30, + 0.2, + ); + + // Wait for the epoch to end and the replacement to be proven on L1. Block + // production has been resumed and may produce additional checkpoints before the + // next epoch starts; we only assert that the chain advanced past the replacement + // and that the replacement itself ends up proven. 
+ await test.waitUntilEpochStarts(currentEpoch + 1); + const epochEndCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(epochEndCheckpoint).toBeGreaterThanOrEqual(replacementCheckpoint); + + await test.waitUntilProvenCheckpointNumber(replacementCheckpoint, 240); + logger.info(`Epoch proven after mid-epoch checkpoint replacement`); + }); + }); + + describe('mid-epoch checkpoint reorg moving a tx', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + numberOfAccounts: 1, + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + aztecEpochDuration: 4, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + // A real tx (not just an empty checkpoint) whose checkpoint is reorged out must be + // remined into a fresh checkpoint and proven there — the tx moves checkpoints across + // the reorg. (PR #23002 review: "a tx that was mined in a checkpoint is now mined in a + // different one".) + it('reorgs a tx out of its checkpoint and proves it after it is remined', async () => { + const contract = await test.registerTestContract(context.wallet); + const from = context.accounts[0]; + + // Anchor on a freshly-started epoch so the reorg + remine has room to complete + // before the proof-submission window closes. + await test.waitUntilNextEpochStarts(); + + // Send a tx and wait for it to be mined into a checkpoint. 
+ const provenTx = await proveInteraction(context.wallet, contract.methods.emit_nullifier(new Fr(1)), { from }); + const txReceipt = await provenTx.send(); + const originalBlock = (await node.getBlock(txReceipt.blockNumber!))!; + const originalCheckpoint = originalBlock.checkpointNumber; + logger.info(`Tx ${txReceipt.txHash} landed in checkpoint ${originalCheckpoint} (block ${txReceipt.blockNumber})`); + + // Reorg L1 deeply enough to drop the L1 block that published the tx's checkpoint. + const [cp] = await node.getCheckpoints(originalCheckpoint, 1, { includeL1PublishInfo: true }); + if (!cp.l1.published) { + throw new Error(`Expected checkpoint ${originalCheckpoint} to have L1 publish info`); + } + const originalSlot = cp.header.slotNumber; + const checkpointL1Block = Number(cp.l1.blockNumber); + const currentL1Block = await context.cheatCodes.eth.blockNumber(); + const reorgDepth = currentL1Block - checkpointL1Block + 1; + logger.info(`Reorging ${reorgDepth} L1 blocks to remove checkpoint ${originalCheckpoint}`); + await context.cheatCodes.eth.reorgWithReplacement(reorgDepth); + + // The node detects the prune and drops back below the reorged-out checkpoint. + await retryUntil( + () => getCheckpointNumber(node).then(cpNum => cpNum < originalCheckpoint), + 'node detects reorg', + 60, + 0.5, + ); + logger.info(`Node observed the reorg removing checkpoint ${originalCheckpoint}`); + + // The tx returns to the mempool and is remined into a fresh checkpoint. Poll for a + // successful receipt whose checkpoint is at or beyond the reorged-out one (not a + // stale read of the removed block). + const reminedCheckpoint = await retryUntil( + async () => { + const r = await node.getTxReceipt(txReceipt.txHash); + if (r.executionResult !== TxExecutionResult.SUCCESS || !r.blockNumber) { + return undefined; + } + const block = await node.getBlock(r.blockNumber); + return block && block.checkpointNumber >= originalCheckpoint ? 
block.checkpointNumber : undefined; + }, + 'tx remined after reorg', + 120, + 0.5, + ); + logger.info(`Tx remined into checkpoint ${reminedCheckpoint}`); + + // The remined checkpoint must live at a different slot than the original — otherwise + // we'd be testing same-slot replacement, not the "tx moves checkpoints across the + // reorg" path. Checkpoint numbers refill after a reorg, so the number alone could + // match either case. The remine signal fires on local world-state mining ahead of + // L1 inclusion, so poll the archiver for the new checkpoint before reading its slot. + const remined = await retryUntil( + async () => (await node.getCheckpoints(reminedCheckpoint, 1))[0], + `archiver indexes remined checkpoint ${reminedCheckpoint}`, + 120, + 0.5, + ); + expect(remined.header.slotNumber).not.toEqual(originalSlot); + logger.info( + `Remined checkpoint ${reminedCheckpoint} is at slot ${remined.header.slotNumber} (original was ${originalSlot})`, + ); + + // Wait for the epoch to end and the remined tx's checkpoint to be proven on L1. The + // archiver indexes the replacement checkpoint only after the sequencer's slot completes + // (~slot duration) and the L1 propose tx confirms — far longer than the default 30s. 
+ const currentEpoch = await epochOfCheckpoint(reminedCheckpoint, 120); + await test.waitUntilEpochStarts(currentEpoch + 1); + const epochEndCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(epochEndCheckpoint).toBeGreaterThanOrEqual(reminedCheckpoint); + + await test.waitUntilProvenCheckpointNumber(epochEndCheckpoint, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(reminedCheckpoint); + logger.info(`Remined tx proven in checkpoint ${reminedCheckpoint}`); + }); + }); + + describe('mid-epoch checkpoint reorg without replacement', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + aztecEpochDuration: 4, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('removes a checkpoint mid-epoch via reorg and proves with survivors', async () => { + // Anchor on a freshly-started epoch so the checkpoints we reorg over (and the survivor) + // are guaranteed to live in the same epoch. Without this, setup landing near an epoch + // boundary could leave the survivor in the previous epoch, passing the test without + // actually exercising in-epoch checkpoint removal (see #22990). + await test.waitUntilNextEpochStarts(); + + // Wait for 2 checkpoints mid-epoch. + const initialCheckpoint = (await test.monitor.run(true)).checkpointNumber; + const midCheckpoint = CheckpointNumber(initialCheckpoint + 2); + await test.waitUntilCheckpointNumber(midCheckpoint, L2_SLOT_DURATION_IN_S * 6); + const checkpointBeforeReorg = test.monitor.checkpointNumber; + logger.info(`Reached checkpoint ${checkpointBeforeReorg}`); + + // Capture the epoch we're reorging within so we can assert the survivor stays in it. 
+ const epochBeforeReorg = await epochOfCheckpoint(checkpointBeforeReorg); + + // (1) The epoch must hold multiple checkpoints, with checkpointBeforeReorg as its latest — + // otherwise removing the last one wouldn't leave any in-epoch survivors to prove with. + const epochCheckpointsBeforeReorg = await checkpointsInEpoch(epochBeforeReorg, checkpointBeforeReorg); + expect(epochCheckpointsBeforeReorg.length).toBeGreaterThanOrEqual(2); + expect(epochCheckpointsBeforeReorg.at(-1)).toEqual(checkpointBeforeReorg); + + // Stop block production so no replacement is proposed. + await context.aztecNodeAdmin!.setConfig({ skipPublishingCheckpointsPercent: 100 }); + + // Reorg L1 to remove the last checkpoint — before the epoch completes. + logger.info(`Reorging L1 to remove checkpoint ${checkpointBeforeReorg}`); + await context.cheatCodes.eth.reorgWithReplacement(1); + + const afterReorgCheckpoint = (await test.monitor.run(true)).checkpointNumber; + // (2) The reorg removed exactly the last checkpoint, leaving N-1. + expect(afterReorgCheckpoint).toEqual(CheckpointNumber(checkpointBeforeReorg - 1)); + logger.info(`After reorg: checkpoint ${afterReorgCheckpoint} (was ${checkpointBeforeReorg})`); + + // Verify node detects the reorg. + await retryUntil( + () => getCheckpointNumber(node).then(cp => cp <= afterReorgCheckpoint), + 'reorg detected', + 30, + 0.5, + ); + + // The survivor must still be in the epoch we reorged within — otherwise the reorg removed + // the only in-epoch checkpoint and the test isn't exercising mid-epoch removal. + const currentEpoch = await epochOfCheckpoint(afterReorgCheckpoint); + expect(currentEpoch).toEqual(epochBeforeReorg); + + // The epoch now holds exactly N-1 checkpoints — the survivors of the removal. 
+ const survivingCheckpoints = await checkpointsInEpoch(epochBeforeReorg, afterReorgCheckpoint); + expect(survivingCheckpoints.length).toEqual(epochCheckpointsBeforeReorg.length - 1); + expect(survivingCheckpoints.at(-1)).toEqual(afterReorgCheckpoint); + + // Wait for the epoch to end and proof to land with the surviving checkpoints. + await test.waitUntilEpochStarts(currentEpoch + 1); + const epochEndCheckpoint = (await test.monitor.run(true)).checkpointNumber; + + // (3) The epoch proved up to and including the last surviving checkpoint (the (N-1)th). + expect(epochEndCheckpoint).toEqual(afterReorgCheckpoint); + + await test.waitUntilProvenCheckpointNumber(epochEndCheckpoint, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(epochEndCheckpoint); + logger.info(`Epoch proven with surviving checkpoints after mid-epoch reorg`); + }); + }); + + describe('last-slot checkpoint reorg without replacement', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + aztecEpochDuration: 4, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + inboxLag: 2, + // Apply a delay between "epoch complete on L1" and the prover-node hand-off so + // the reorg below has time to be processed before finalization starts. + proverNodeConfig: { proverNodeEpochProvingDelayMs: 10_000 }, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('removes the last-slot checkpoint of an epoch via reorg and proves with survivors', async () => { + // Anchor on a freshly-started epoch so the full slot range is ahead of us. 
+ const epoch = await test.waitUntilNextEpochStarts(); + const [, epochEndSlot] = getSlotRangeForEpoch(epoch, test.constants); + + // Wait until the wall clock crosses into the last slot of the epoch. + await retryUntil( + () => Promise.resolve(test.epochCache.getEpochAndSlotNow().slot >= epochEndSlot), + `enter slot ${epochEndSlot}`, + L2_SLOT_DURATION_IN_S * test.epochDuration * 2, + 1, + ); + logger.info(`Reached last slot ${epochEndSlot} of epoch ${epoch}`); + + // Wait for a checkpoint published in the last slot to actually appear. + const lastSlotCheckpointNumber = await retryUntil( + async () => { + const cpNum = (await test.monitor.run(true)).checkpointNumber; + if (cpNum === CheckpointNumber.ZERO) { + return undefined; + } + const [cp] = await node.getCheckpoints(cpNum, 1); + return cp && cp.header.slotNumber === epochEndSlot ? cpNum : undefined; + }, + 'last-slot checkpoint published', + L2_SLOT_DURATION_IN_S, + 0.5, + ); + logger.info(`Last-slot checkpoint ${lastSlotCheckpointNumber} published in slot ${epochEndSlot}`); + + // Suppress further publishing so no replacement is proposed. + await context.aztecNodeAdmin!.setConfig({ skipPublishingCheckpointsPercent: 100 }); + + // Reorg L1 to remove the last-slot checkpoint. + logger.info(`Reorging L1 to remove last-slot checkpoint ${lastSlotCheckpointNumber}`); + await context.cheatCodes.eth.reorgWithReplacement(1); + + const afterReorgCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(afterReorgCheckpoint).toBeLessThan(lastSlotCheckpointNumber); + logger.info(`After reorg: checkpoint ${afterReorgCheckpoint} (was ${lastSlotCheckpointNumber})`); + + // The surviving last checkpoint sits in an earlier slot than the epoch's last slot — + // i.e. the epoch's last block is no longer in the epoch's last slot. + const [survivor] = await node.getCheckpoints(afterReorgCheckpoint, 1); + expect(survivor.header.slotNumber).toBeLessThan(epochEndSlot); + + // Verify node detects the reorg. 
+ await retryUntil( + () => getCheckpointNumber(node).then(cp => cp <= afterReorgCheckpoint), + 'reorg detected', + 30, + 0.5, + ); + + // Wait for the next epoch to start, then for proof to land with the surviving checkpoints. + await test.waitUntilEpochStarts(epoch + 1); + const epochEndCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(epochEndCheckpoint).toEqual(afterReorgCheckpoint); + + await test.waitUntilProvenCheckpointNumber(epochEndCheckpoint, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(epochEndCheckpoint); + logger.info(`Epoch ${epoch} proven with last-slot checkpoint reorged out`); + }); + }); + + describe('checkpoint reorg during proving', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + aztecEpochDuration: 4, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('handles a reorg arriving while the top of the epoch is proving', async () => { + // Gate top-tree proving so it deterministically blocks until we release it. This + // gives us a window where the session is mid-proof, and we can fire the reorg + // precisely during that window. We use the session's `beforeTopTreeProve` hook + // rather than monkey-patching the orchestrator factory. 
+ const proverNode = test.proverNodes[0].getProverNode() as TestProverNode; + let releaseProvingGate: () => void = () => {}; + const provingGate = new Promise(resolve => { + releaseProvingGate = resolve; + }); + // Only gate sessions with at least 2 checkpoints — reorging the last checkpoint + // of a single-checkpoint epoch leaves nothing to prove, the session is cancelled + // without replacement, and the test's "wait for fewer checkpoints" check never + // converges. Sessions with one checkpoint just pass through. + const sessionHasMultipleCheckpoints = async () => { + const job = (await proverNode.getJobs()).find(j => j.status === 'awaiting-checkpoints'); + const session = job && proverNode.sessionManager.getFullSession(job.epochNumber); + return !!session && session.getCheckpoints().length >= 2; + }; + proverNode.setSessionHooks({ + beforeTopTreeProve: async () => { + if (!(await sessionHasMultipleCheckpoints())) { + return; + } + logger.warn('Top-tree proving gated — waiting for test to release'); + await provingGate; + logger.warn('Proving gate released'); + }, + }); + + // Wait for a session with at least 2 checkpoints to hit the gate. The session + // manager opens one full session at a time, starting with the lowest unproven + // epoch; small epochs pass through (see hook above) and we keep polling until a + // gateable epoch lands. `getJobs()` tells us which epoch is actually blocked. + await retryUntil( + sessionHasMultipleCheckpoints, + 'gateable session blocks at proving gate', + L2_SLOT_DURATION_IN_S * 12, + 0.5, + ); + const gatedJob = (await proverNode.getJobs()).find(j => j.status === 'awaiting-checkpoints')!; + const gatedEpoch = gatedJob.epochNumber; + logger.info(`Job for epoch ${gatedEpoch} is blocked inside proving — firing reorg now`); + + // Capture the in-flight session and the last checkpoint of the gated epoch — + // we'll reorg that checkpoint out and verify the prover recovers with the + // surviving prefix. 
We take the session's own checkpoint list rather than + // `monitor.checkpointNumber` because the global high may sit in a later epoch. + const inFlightSession = proverNode.sessionManager.getFullSession(gatedEpoch); + if (!inFlightSession) { + throw new Error(`No in-flight session for epoch ${gatedEpoch}`); + } + const trackedBeforeReorg = inFlightSession.getCheckpoints().length; + const epochEndCheckpoint = inFlightSession.getCheckpoints()[trackedBeforeReorg - 1].checkpoint.number; + logger.info(`Reorging last checkpoint ${epochEndCheckpoint} of gated epoch ${gatedEpoch}`); + + // Stop block production so no replacement comes in. + await context.aztecNodeAdmin!.setConfig({ skipPublishingCheckpointsPercent: 100 }); + + // Reorg L1 deeply enough to actually remove the L1 block in which the last + // checkpoint of the proving-in-progress epoch was published. L1 may have + // mined several blocks between the checkpoint publish and now (votes, + // attestations, slot ticks), so depth=1 is not always sufficient. 
+ const [cp] = await node.getCheckpoints(epochEndCheckpoint, 1, { includeL1PublishInfo: true }); + if (!cp.l1.published) { + throw new Error(`Expected checkpoint ${epochEndCheckpoint} to have L1 publish info`); + } + const checkpointL1Block = Number(cp.l1.blockNumber); + const currentL1Block = await context.cheatCodes.eth.blockNumber(); + const reorgDepth = currentL1Block - checkpointL1Block + 1; + logger.info( + `Reorging ${reorgDepth} L1 blocks (checkpoint ${epochEndCheckpoint} was published in L1 block ${checkpointL1Block}, current L1 block is ${currentL1Block})`, + ); + await context.cheatCodes.eth.reorgWithReplacement(reorgDepth); + const afterReorgCheckpoint = (await test.monitor.run(true)).checkpointNumber; + expect(afterReorgCheckpoint).toBeLessThan(epochEndCheckpoint); + logger.info(`Reorg fired: checkpoint ${afterReorgCheckpoint} (was ${epochEndCheckpoint})`); + + // Wait until the prover-node observes the prune and removes the reorged-out + // checkpoint(s) from the in-flight job. This is the prerequisite for the + // restart-with-survivors path: when we release the gate below, the cancelled + // top tree throws `TopTreeCancelledError` and the finalize loop rebuilds with + // the surviving checkpoints. Without this wait we'd race the L2BlockStream + // poll and risk top tree #1 starting its real prove before cancellation lands. + await retryUntil( + () => { + // After a prune the original session is cancelled and replaced; the new full + // session for the same epoch sits in `fullSessions` over the surviving prefix. 
+ const current = proverNode.sessionManager.getFullSession(gatedEpoch); + return Promise.resolve(!!current && current.getCheckpoints().length < trackedBeforeReorg); + }, + 'prover-node sees the prune and recreates session with fewer provers', + 30, + 0.2, + ); + const trimmedSession = proverNode.sessionManager.getFullSession(gatedEpoch)!; + logger.info( + `Prover-node trimmed in-flight session: ${trackedBeforeReorg} → ${trimmedSession.getCheckpoints().length} tracked checkpoints`, + ); + expect(trackedBeforeReorg).toBeGreaterThan(trimmedSession.getCheckpoints().length); + + // Release the gate. The cancelled top tree #1 short-circuits with + // TopTreeCancelledError, the finalize loop restarts with the surviving sub-trees, + // and a fresh top tree submits a valid proof for checkpoints 1..afterReorgCheckpoint. + releaseProvingGate(); + + // The in-flight epoch should now be proven on L1 + await test.waitUntilProvenCheckpointNumber(afterReorgCheckpoint, 240); + expect(await rollup.getProvenCheckpointNumber()).toBeGreaterThanOrEqual(afterReorgCheckpoint); + logger.info(`In-flight epoch proven up to surviving checkpoint ${afterReorgCheckpoint}`); + }); + }); + + describe('prover-node starts mid-epoch', () => { + beforeEach(async () => { + test = await EpochsTestContext.setup({ + // Don't start the prover-node automatically — we spin it up mid-epoch in the test. + startProverNode: false, + maxSpeedUpAttempts: 0, + cancelTxOnTimeout: false, + aztecEpochDuration: 4, + ethereumSlotDuration: 4, + aztecSlotDuration: 36, + blockDurationMs: 8000, + minTxsPerBlock: 0, + enforceTimeTable: true, + aztecProofSubmissionEpochs: 1000, + anvilSlotsInAnEpoch: 32, + }); + ({ rollup, logger, context } = test); + ({ L2_SLOT_DURATION_IN_S } = test); + node = context.aztecNode; + }); + + it('proves the whole epoch when started mid-epoch including pre-spawn checkpoints', async () => { + // Sanity: no prover-node yet — the test is responsible for starting one. 
+ expect(test.proverNodes).toHaveLength(0); + + // Anchor on a freshly-started epoch, then wait until at least TWO checkpoints + // INSIDE that epoch have landed (epochDuration=4 ⇒ epoch covers 4 slots, so two + // checkpoints puts us mid-epoch rather than at the boundary). These are the + // pre-spawn checkpoints that exist before the prover-node is constructed — the + // new-prover-mid-epoch invariant is that the L2BlockStream replay from + // `computeStartupState`'s starting block surfaces them as `chain-checkpointed` + // events and the prover-node registers and proves them. + const epoch = await test.waitUntilNextEpochStarts(); + const preSpawnCheckpoints = await retryUntil( + async () => { + const checkpoints = await node.getCheckpointsData({ epoch }); + return checkpoints.length >= 2 ? checkpoints : undefined; + }, + `at least 2 checkpoints inside epoch ${epoch}`, + L2_SLOT_DURATION_IN_S * 8, + 0.5, + ); + const preSpawnCheckpointNumbers = preSpawnCheckpoints.map(c => c.checkpointNumber); + logger.info( + `Pre-spawn checkpoints in epoch ${epoch}: ${preSpawnCheckpointNumbers.join(', ')}; starting prover-node now`, + ); + + // Spawn and start the prover-node. computeStartupState resolves a starting block of + // 1 (nothing proven yet), so the L2BlockStream replays from the genesis tip and the + // prover-node sees every checkpoint of the anchored epoch — including both + // pre-spawn ones. + const proverAztecNode = await test.createProverNode(); + const proverNode = proverAztecNode.getProverNode() as TestProverNode; + logger.info(`Prover-node started with id ${proverNode.getProverId().toString()}`); + + // Wait for the anchored epoch to end and its proof to land on L1. 
+ await test.waitUntilEpochStarts(epoch + 1); + const epochEndCheckpoint = (await test.monitor.run(true)).checkpointNumber; + const lastPreSpawn = preSpawnCheckpointNumbers[preSpawnCheckpointNumbers.length - 1]; + expect(epochEndCheckpoint).toBeGreaterThanOrEqual(lastPreSpawn); + logger.info(`Epoch ${epoch} ended at checkpoint ${epochEndCheckpoint}; waiting for proof`); + + await test.waitUntilProvenCheckpointNumber(epochEndCheckpoint, 240); + + // The L1 proof must cover the entire epoch — including every pre-spawn checkpoint + // that landed before the prover-node existed. L1 rejects partial / out-of-order + // epoch proofs, so this is the strict "whole epoch proven" assertion. + const provenCheckpointNumber = await rollup.getProvenCheckpointNumber(); + expect(provenCheckpointNumber).toBeGreaterThanOrEqual(epochEndCheckpoint); + expect(provenCheckpointNumber).toBeGreaterThanOrEqual(lastPreSpawn); + logger.info(`Epoch ${epoch} fully proven up to checkpoint ${provenCheckpointNumber}`); + + // Every pre-spawn checkpoint should be in the prover-node's checkpoint store — + // each one was registered via the L2BlockStream's replay (chain-checkpointed events). + // The session manager constructs a full session over the canonical content for the + // anchored epoch when it completes, then proves it; the store retains the provers + // until expiry. 
+ const epochCheckpointsInStore = await proverNode.getCheckpointStore().listCanonicalForEpoch(epoch); + const storedNumbers = new Set(epochCheckpointsInStore.map(p => p.checkpoint.number)); + for (const n of preSpawnCheckpointNumbers) { + expect(storedNumbers.has(n)).toBe(true); + } + }); + }); +}); diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_proof_fails.parallel.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_proof_fails.parallel.test.ts index 18c5a0d895da..379091db6b21 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_proof_fails.parallel.test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_proof_fails.parallel.test.ts @@ -123,22 +123,24 @@ describe('e2e_epochs/epochs_proof_fails', () => { const testProverNode = proverNode.getProverNode() as TestProverNode; proverDelayer = testProverNode.getDelayer()!; - // Inject a delay in prover node proving equal to the length of an epoch, to make sure deadline will be hit + // Inject a delay in prover node proving equal to the length of an epoch, to make sure deadline will be hit. 
+ // Patches `createTopTreeOrchestrator` so each top tree's `prove()` is replaced with a delayed + // synthetic proof const epochProverManager = testProverNode.prover; - const originalCreate = epochProverManager.createEpochProver.bind(epochProverManager); + const originalCreateTopTree = epochProverManager.createTopTreeOrchestrator.bind(epochProverManager); const finalizeEpochPromise = promiseWithResolvers(); let hasFinalizeEpochWaited = false; - jest.spyOn(epochProverManager, 'createEpochProver').mockImplementation(() => { - const prover = originalCreate(); - jest.spyOn(prover, 'finalizeEpoch').mockImplementation(async () => { + jest.spyOn(epochProverManager, 'createTopTreeOrchestrator').mockImplementation(() => { + const topTree = originalCreateTopTree(); + jest.spyOn(topTree, 'prove').mockImplementation(async () => { if (!hasFinalizeEpochWaited) { // Note the following is very fragile, as it relies on timing. const seconds = L2_SLOT_DURATION_IN_S * (test.epochDuration + 1); // Forgive me for I have sinned. 
- logger.warn(`Finalize epoch: sleeping ${seconds}s.`); + logger.warn(`Top-tree prove: sleeping ${seconds}s.`); await sleep(seconds * 1000); } hasFinalizeEpochWaited = true; - logger.warn(`Finalize epoch: returning.`); + logger.warn(`Top-tree prove: returning.`); finalizeEpochPromise.resolve(); const ourPublicInputs = RootRollupPublicInputs.random(); const ourBatchedBlob = new BatchedBlob( @@ -150,7 +152,7 @@ describe('e2e_epochs/epochs_proof_fails', () => { ); return { publicInputs: ourPublicInputs, proof: Proof.empty(), batchedBlobInputs: ourBatchedBlob }; }); - return prover; + return topTree; }); context.proverNode = proverNode; diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_test.ts index 858c196c3669..b172be919708 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_test.ts @@ -345,15 +345,43 @@ export class EpochsTestContext { public async waitUntilEpochStarts(epoch: number) { const [start] = getTimestampRangeForEpoch(EpochNumber(epoch), this.constants); this.logger.info(`Waiting until L1 timestamp ${start} is reached as the start of epoch ${epoch}`); + // Cover at least two full epochs of wall time so callers issuing the wait mid-epoch + // still have headroom — the prior `30 * epochDuration` mixed units (slots vs seconds) + // and timed out at 120s for configs whose epoch wall time is 144s+. await waitUntilL1Timestamp( this.l1Client, start - BigInt(this.L1_BLOCK_TIME_IN_S), undefined, - 30 * this.epochDuration, + 2 * this.epochDuration * this.L2_SLOT_DURATION_IN_S, ); return start; } + /** + * Waits until the next epoch boundary and returns that epoch's number. 
Anchors tests + * on a guaranteed-fresh epoch regardless of how much wall time `beforeEach` consumed — + * `waitUntilEpochStarts(1)` returns immediately when the chain has already advanced past + * slot 4, which under CI load can leave only seconds of the target epoch remaining. + * + * If the chain has more than two slots of headroom before the target boundary, warps + * the L1 clock to within two slots of the boundary instead of waiting in wall-clock. + * The two-slot tail is intentional — it lets the sequencer/builder settle so the first + * checkpoint of the target epoch lands correctly. + */ + public async waitUntilNextEpochStarts(): Promise { + const { epoch } = this.epochCache.getEpochAndSlotNow(); + const target = EpochNumber(Number(epoch) + 1); + const [targetTs] = getTimestampRangeForEpoch(target, this.constants); + const safeTs = targetTs - BigInt(2 * this.L2_SLOT_DURATION_IN_S); + const currentTs = BigInt(await this.context.cheatCodes.eth.lastBlockTimestamp()); + if (currentTs < safeTs) { + this.logger.info(`Warping L1 from ${currentTs} to ${safeTs} (2 slots before epoch ${target})`); + await this.context.cheatCodes.eth.warp(Number(safeTs), { resetBlockInterval: true }); + } + await this.waitUntilEpochStarts(Number(target)); + return target; + } + /** Waits until the given checkpoint number is mined. 
*/ public async waitUntilCheckpointNumber(target: CheckpointNumber, timeout = 120) { await retryUntil( diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_upload_failed_proof.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_upload_failed_proof.test.ts index 1e23882ba981..b5ee69c32160 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_upload_failed_proof.test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_upload_failed_proof.test.ts @@ -53,25 +53,21 @@ describe('e2e_epochs/epochs_upload_failed_proof', () => { }); it('uploads failed proving job state and re-runs it on a fresh instance', async () => { - // Make initial prover node fail to prove + // Make initial prover node fail to prove, via the session's top-tree-prove hook. const proverNode = test.proverNodes[0].getProverNode() as TestProverNode; - const proverManager = proverNode.getProver(); - const origCreateEpochProver = proverManager.createEpochProver.bind(proverManager); - proverManager.createEpochProver = () => { - const epochProver = origCreateEpochProver(); - epochProver.finalizeEpoch = async () => { + proverNode.setSessionHooks({ + topTreeProveOverride: async () => { await sleep(1000); - logger.warn(`Triggering error on finalizeEpoch`); + logger.warn(`Triggering error on top-tree prove`); throw new Error(`Fake error while proving epoch`); - }; - return epochProver; - }; + }, + }); // And track when the epoch failure upload is complete let epochUploadUrl: string | undefined = undefined; - const origTryUploadEpochFailure = proverNode.tryUploadEpochFailure.bind(proverNode); - proverNode.tryUploadEpochFailure = async (job: any) => { - epochUploadUrl = await origTryUploadEpochFailure(job); + const origTryUploadEpochFailure = proverNode.tryUploadSessionFailure.bind(proverNode); + proverNode.tryUploadSessionFailure = async (session: any) => { + epochUploadUrl = await origTryUploadEpochFailure(session); return epochUploadUrl; }; diff --git 
a/yarn-project/end-to-end/src/e2e_p2p/inactivity_slash_test.ts b/yarn-project/end-to-end/src/e2e_p2p/inactivity_slash_test.ts index b21c884a6d2a..85886d4dba79 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/inactivity_slash_test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/inactivity_slash_test.ts @@ -61,6 +61,7 @@ export class P2PInactivityTest { inboxLag: 2, anvilSlotsInAnEpoch: 4, proverNodeConfig: { proverNodeEpochProvingDelayMs: AZTEC_SLOT_DURATION * 1000 }, + proverBrokerMaxEpochsToKeepResultsFor: 20, aztecTargetCommitteeSize: COMMITTEE_SIZE, aztecSlotDuration: AZTEC_SLOT_DURATION, ethereumSlotDuration: ETHEREUM_SLOT_DURATION, diff --git a/yarn-project/end-to-end/src/e2e_p2p/slash_veto_demo.test.ts b/yarn-project/end-to-end/src/e2e_p2p/slash_veto_demo.test.ts index 281d44d2f97b..9713acdf8964 100644 --- a/yarn-project/end-to-end/src/e2e_p2p/slash_veto_demo.test.ts +++ b/yarn-project/end-to-end/src/e2e_p2p/slash_veto_demo.test.ts @@ -94,6 +94,7 @@ describe('veto slash', () => { slashingDisableDuration: SLASHING_DISABLE_DURATION_SECONDS, slashingVetoer: VETOER_ADDRESS, slashInactivityTargetPercentage: SLASH_INACTIVITY_TARGET_PERCENTAGE, + proverBrokerMaxEpochsToKeepResultsFor: 20, }, }); diff --git a/yarn-project/end-to-end/src/fixtures/e2e_prover_test.ts b/yarn-project/end-to-end/src/fixtures/e2e_prover_test.ts index 3e345cd6d98b..906aa5b9ec20 100644 --- a/yarn-project/end-to-end/src/fixtures/e2e_prover_test.ts +++ b/yarn-project/end-to-end/src/fixtures/e2e_prover_test.ts @@ -248,7 +248,10 @@ export class FullProverTest { txGatheringIntervalMs: 1000, txGatheringBatchSize: 10, txGatheringMaxParallelRequestsPerNode: 100, - txGatheringTimeoutMs: 24_000, + // The test warps the L1 clock forward a full epoch via cheatcodes; the prover-node + // tracks L1 time, so an in-flight tx-gather would see its deadline jump into the + // past. Use a generous window so the deadline survives the warp. 
+ txGatheringTimeoutMs: 10 * 60 * 1000, proverNodeFailedEpochStore: undefined, proverNodeEpochProvingDelayMs: undefined, validatorPrivateKeys: new SecretValue([]), diff --git a/yarn-project/ethereum/src/contracts/gse.ts b/yarn-project/ethereum/src/contracts/gse.ts index 7f6a6df8962a..4487aeee6d42 100644 --- a/yarn-project/ethereum/src/contracts/gse.ts +++ b/yarn-project/ethereum/src/contracts/gse.ts @@ -47,11 +47,16 @@ export class GSEContract { return EthAddress.fromString(await this.gse.read.getGovernance()); } - getAttestersFromIndicesAtTime(instance: Hex | EthAddress, ts: bigint, indices: bigint[]) { + getAttestersFromIndicesAtTime( + instance: Hex | EthAddress, + ts: bigint, + indices: bigint[], + options?: { blockNumber?: bigint }, + ) { if (instance instanceof EthAddress) { instance = instance.toString(); } - return this.gse.read.getAttestersFromIndicesAtTime([instance, ts, indices]); + return this.gse.read.getAttestersFromIndicesAtTime([instance, ts, indices], options); } public async getRegistrationDigest(publicKey: ProjPointType): Promise> { diff --git a/yarn-project/ethereum/src/contracts/rollup.ts b/yarn-project/ethereum/src/contracts/rollup.ts index 60f3fcc1a1b7..92a634dcfdcd 100644 --- a/yarn-project/ethereum/src/contracts/rollup.ts +++ b/yarn-project/ethereum/src/contracts/rollup.ts @@ -554,8 +554,9 @@ export class RollupContract { return EthAddress.fromString(await this.rollup.read.owner()); } - async getActiveAttesterCount(): Promise { - return Number(await this.rollup.read.getActiveAttesterCount()); + async getActiveAttesterCount(options?: { blockNumber?: bigint }): Promise { + await checkBlockTag(options?.blockNumber, this.client); + return Number(await this.rollup.read.getActiveAttesterCount(options)); } public async getSlashingProposerAddress() { @@ -1190,13 +1191,20 @@ export class RollupContract { } async getAttesters(timestamp?: bigint): Promise { - const attesterSize = await this.getActiveAttesterCount(); + // Pin every read to a single L1 
block so the attester count and the chunked index reads + // observe a consistent set. Without this, the count and each chunk default to `latest` and + // can straddle a block boundary (or reorg), yielding an inconsistent or truncated set. + const block = await this.client.getBlock(); + const blockNumber = block.number ?? undefined; + const ts = timestamp ?? block.timestamp; + const attesterSize = await this.getActiveAttesterCount({ blockNumber }); const gse = new GSEContract(this.client, await this.getGSE()); - const ts = timestamp ?? (await this.client.getBlock()).timestamp; const indices = Array.from({ length: attesterSize }, (_, i) => BigInt(i)); const chunks = chunk(indices, 1000); - const results = await Promise.all(chunks.map(chunk => gse.getAttestersFromIndicesAtTime(this.address, ts, chunk))); + const results = await Promise.all( + chunks.map(chunk => gse.getAttestersFromIndicesAtTime(this.address, ts, chunk, { blockNumber })), + ); return results.flat().map(addr => EthAddress.fromString(addr)); } diff --git a/yarn-project/prover-client/src/mocks/test_context.ts b/yarn-project/prover-client/src/mocks/test_context.ts index c3a38c8cf7e8..065e19142844 100644 --- a/yarn-project/prover-client/src/mocks/test_context.ts +++ b/yarn-project/prover-client/src/mocks/test_context.ts @@ -5,13 +5,13 @@ import { BlockNumber, CheckpointNumber } from '@aztec/foundation/branded-types'; import { padArrayEnd, times, timesAsync } from '@aztec/foundation/collection'; import { Fr } from '@aztec/foundation/curves/bn254'; import type { Logger } from '@aztec/foundation/log'; +import { SerialQueue } from '@aztec/foundation/queue'; import type { FieldsOf } from '@aztec/foundation/types'; import { getVKTreeRoot } from '@aztec/noir-protocol-circuits-types/vk-tree'; import { ProtocolContractsList } from '@aztec/protocol-contracts'; import { computeFeePayerBalanceLeafSlot } from '@aztec/protocol-contracts/fee-juice'; import { PublicDataWrite } from '@aztec/stdlib/avm'; import { 
AztecAddress } from '@aztec/stdlib/aztec-address'; -import { EthAddress } from '@aztec/stdlib/block'; import type { Checkpoint } from '@aztec/stdlib/checkpoint'; import type { MerkleTreeWriteOperations, ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; import type { CheckpointConstantData } from '@aztec/stdlib/rollup'; @@ -36,12 +36,21 @@ import { getTreeSnapshot, insertSideEffects, } from '../orchestrator/block-building-helpers.js'; -import type { BlockProvingState } from '../orchestrator/block-proving-state.js'; -import { ProvingOrchestrator } from '../orchestrator/index.js'; import { BrokerCircuitProverFacade } from '../proving_broker/broker_prover_facade.js'; import { TestBroker } from '../test/mock_prover.js'; import { getEnvironmentConfig, getSimulator, makeCheckpointConstants, makeGlobals } from './fixtures.js'; +/** + * Builds a started `SerialQueue` for use as an orchestrator's deferred-job queue in + * tests. Production wires a single shared queue via `ProverClient`; tests that construct + * orchestrators directly use one of these per orchestrator (or share one). 
+ */ +export function makeTestDeferredJobQueue(concurrency = 10): SerialQueue { + const queue = new SerialQueue(); + queue.start(concurrency); + return queue; +} + export class TestContext { private headers: Map = new Map(); private checkpoints: Checkpoint[] = []; @@ -57,7 +66,6 @@ export class TestContext { public prover: ServerCircuitProver, public broker: TestBroker, public brokerProverFacade: BrokerCircuitProverFacade, - public orchestrator: TestProvingOrchestrator, private feePayer: AztecAddress, initialFeePayerBalance: Fr, private directoriesToCleanup: string[], @@ -66,10 +74,6 @@ export class TestContext { this.feePayerBalance = initialFeePayerBalance; } - public get epochProver() { - return this.orchestrator; - } - static async new( logger: Logger, { @@ -118,22 +122,11 @@ export class TestContext { const broker = new TestBroker(proverCount, localProver); const facade = new BrokerCircuitProverFacade(broker); - const orchestrator = new TestProvingOrchestrator(ws, facade, EthAddress.ZERO, false, 10); await broker.start(); facade.start(); - return new this( - ws, - localProver, - broker, - facade, - orchestrator, - feePayer, - initialFeePayerBalance, - directoriesToCleanup, - logger, - ); + return new this(ws, localProver, broker, facade, feePayer, initialFeePayerBalance, directoriesToCleanup, logger); } public getFork() { @@ -352,16 +345,3 @@ export class TestContext { return endStateReference; } } - -class TestProvingOrchestrator extends ProvingOrchestrator { - public isVerifyBuiltBlockAgainstSyncedStateEnabled = false; - - // Disable this check by default, since it requires seeding world state with the block being built - // This is only enabled in some tests with multiple blocks that populate the pending chain via makePendingBlock - protected override verifyBuiltBlockAgainstSyncedState(provingState: BlockProvingState): Promise { - if (this.isVerifyBuiltBlockAgainstSyncedStateEnabled) { - return super.verifyBuiltBlockAgainstSyncedState(provingState); - } - 
return Promise.resolve(); - } -} diff --git a/yarn-project/prover-client/src/orchestrator/checkpoint-proving-state.ts b/yarn-project/prover-client/src/orchestrator/checkpoint-proving-state.ts index d8dcecf95f97..af04c8ebd55d 100644 --- a/yarn-project/prover-client/src/orchestrator/checkpoint-proving-state.ts +++ b/yarn-project/prover-client/src/orchestrator/checkpoint-proving-state.ts @@ -1,66 +1,30 @@ -import { - BatchedBlobAccumulator, - type FinalBlobBatchingChallenges, - SpongeBlob, - encodeCheckpointBlobDataFromBlocks, -} from '@aztec/blob-lib'; +import { SpongeBlob } from '@aztec/blob-lib'; import { type ARCHIVE_HEIGHT, - BLOBS_PER_CHECKPOINT, - FIELDS_PER_BLOB, type L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, type NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, NUM_MSGS_PER_BASE_PARITY, - OUT_HASH_TREE_HEIGHT, } from '@aztec/constants'; import { BlockNumber } from '@aztec/foundation/branded-types'; import { padArrayEnd } from '@aztec/foundation/collection'; -import { BLS12Point } from '@aztec/foundation/curves/bls12'; import { Fr } from '@aztec/foundation/curves/bn254'; import type { Tuple } from '@aztec/foundation/serialize'; import { type TreeNodeLocation, UnbalancedTreeStore } from '@aztec/foundation/trees'; import type { PublicInputsAndRecursiveProof } from '@aztec/stdlib/interfaces/server'; -import { computeCheckpointOutHash } from '@aztec/stdlib/messaging'; import { ParityBasePrivateInputs } from '@aztec/stdlib/parity'; -import { - BlockMergeRollupPrivateInputs, - BlockRollupPublicInputs, - CheckpointConstantData, - CheckpointRollupPublicInputs, - CheckpointRootRollupHints, - CheckpointRootRollupPrivateInputs, - CheckpointRootSingleBlockRollupPrivateInputs, -} from '@aztec/stdlib/rollup'; -import type { CircuitName } from '@aztec/stdlib/stats'; +import { BlockMergeRollupPrivateInputs, BlockRollupPublicInputs, CheckpointConstantData } from '@aztec/stdlib/rollup'; import type { AppendOnlyTreeSnapshot } from '@aztec/stdlib/trees'; import type { BlockHeader 
} from '@aztec/stdlib/tx'; import type { UInt64 } from '@aztec/stdlib/types'; -import { accumulateBlobs, buildBlobHints, toProofData } from './block-building-helpers.js'; +import { toProofData } from './block-building-helpers.js'; import { BlockProvingState, type ProofState } from './block-proving-state.js'; -import type { EpochProvingState } from './epoch-proving-state.js'; - -type OutHashHint = { - treeSnapshot: AppendOnlyTreeSnapshot; - siblingPath: Tuple; -}; export class CheckpointProvingState { private blockProofs: UnbalancedTreeStore< ProofState >; - private checkpointRootProof: - | ProofState - | undefined; private blocks: (BlockProvingState | undefined)[] = []; - private previousOutHashHint: OutHashHint | undefined; - private outHash: Fr | undefined; - // The snapshot and sibling path after the checkpoint's out hash is inserted. - // Stored here to be retrieved for the next checkpoint when it's added. - private newOutHashHint: OutHashHint | undefined; - private startBlobAccumulator: BatchedBlobAccumulator | undefined; - private endBlobAccumulator: BatchedBlobAccumulator | undefined; - private blobFields: Fr[] | undefined; private error: string | undefined; public readonly firstBlockNumber: BlockNumber; @@ -68,7 +32,6 @@ export class CheckpointProvingState { public readonly index: number, public readonly constants: CheckpointConstantData, public readonly totalNumBlocks: number, - private readonly finalBlobBatchingChallenges: FinalBlobBatchingChallenges, private readonly headerOfLastBlockInPreviousCheckpoint: BlockHeader, private readonly lastArchiveSiblingPath: Tuple, private readonly l1ToL2Messages: Fr[], @@ -84,17 +47,16 @@ export class CheckpointProvingState { Fr, typeof L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH >, - public parentEpoch: EpochProvingState, - private onBlobAccumulatorSet: (checkpoint: CheckpointProvingState) => Promise, + public readonly epochNumber: number, + /** Owner's liveness check. 
`verifyState()` returns false once this returns false. */ + private readonly isAlive: () => boolean, + /** Owner's failure callback. Invoked from `reject` to surface the error upward. */ + private readonly onReject: (reason: string) => void, ) { this.blockProofs = new UnbalancedTreeStore(totalNumBlocks); this.firstBlockNumber = BlockNumber(headerOfLastBlockInPreviousCheckpoint.globalVariables.blockNumber + 1); } - public get epochNumber(): number { - return this.parentEpoch.epochNumber; - } - public startNewBlock( blockNumber: BlockNumber, timestamp: UInt64, @@ -176,25 +138,6 @@ export class CheckpointProvingState { this.blockProofs.setNode(location, { provingOutput }); } - public tryStartProvingCheckpointRoot() { - if (this.checkpointRootProof?.isProving) { - return false; - } else { - this.checkpointRootProof = { isProving: true }; - return true; - } - } - - public setCheckpointRootRollupProof( - provingOutput: PublicInputsAndRecursiveProof< - CheckpointRollupPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ): TreeNodeLocation { - this.checkpointRootProof = { provingOutput }; - return this.parentEpoch.setCheckpointRootRollupProof(this.index, provingOutput); - } - public getBaseParityInputs(baseParityIndex: number) { const messages = padArrayEnd( this.l1ToL2Messages.slice( @@ -207,53 +150,6 @@ export class CheckpointProvingState { return new ParityBasePrivateInputs(messages, this.constants.vkTreeRoot, this.constants.proverId); } - public setOutHashHint(hint: OutHashHint) { - this.previousOutHashHint = hint; - } - - public getOutHashHint() { - return this.previousOutHashHint; - } - - public accumulateBlockOutHashes() { - if (this.isAcceptingBlocks() || this.blocks.some(b => !b?.hasEndState())) { - return; - } - - if (!this.outHash) { - const messagesPerBlock = this.blocks.map(b => b!.getTxEffects().map(tx => tx.l2ToL1Msgs)); - this.outHash = computeCheckpointOutHash(messagesPerBlock); - } - - return this.outHash; - } - - public 
setOutHashHintForNextCheckpoint(hint: OutHashHint) { - this.newOutHashHint = hint; - } - - public getOutHashHintForNextCheckpoint() { - return this.newOutHashHint; - } - - public async accumulateBlobs(startBlobAccumulator: BatchedBlobAccumulator) { - if (this.isAcceptingBlocks() || this.blocks.some(b => !b?.hasEndState())) { - return; - } - - this.blobFields = encodeCheckpointBlobDataFromBlocks(this.blocks.map(b => b!.getBlockBlobData())); - this.endBlobAccumulator = await accumulateBlobs(this.blobFields!, startBlobAccumulator); - this.startBlobAccumulator = startBlobAccumulator; - - await this.onBlobAccumulatorSet(this); - - return this.endBlobAccumulator; - } - - public getEndBlobAccumulator() { - return this.endBlobAccumulator; - } - public getParentLocation(location: TreeNodeLocation) { return this.blockProofs.getParentLocation(location); } @@ -267,47 +163,6 @@ export class CheckpointProvingState { return new BlockMergeRollupPrivateInputs([toProofData(left), toProofData(right)]); } - public getCheckpointRootRollupType(): CircuitName { - return this.totalNumBlocks === 1 ? 'rollup-checkpoint-root-single-block' : 'rollup-checkpoint-root'; - } - - public async getCheckpointRootRollupInputs() { - const proofs = this.#getChildProofsForRoot(); - const nonEmptyProofs = proofs.filter(p => !!p); - if (proofs.length !== nonEmptyProofs.length) { - throw new Error('At least one child is not ready for the checkpoint root rollup.'); - } - if (!this.previousOutHashHint) { - throw new Error('Out hash hint is not set.'); - } - if (!this.startBlobAccumulator) { - throw new Error('Start blob accumulator is not set.'); - } - - // `blobFields` must've been set if `startBlobAccumulator` is set (in `accumulateBlobs`). 
- const blobFields = this.blobFields!; - - const { blobCommitments, blobsHash } = await buildBlobHints(blobFields); - - const hints = CheckpointRootRollupHints.from({ - previousBlockHeader: this.headerOfLastBlockInPreviousCheckpoint, - previousArchiveSiblingPath: this.lastArchiveSiblingPath, - previousOutHash: this.previousOutHashHint.treeSnapshot, - newOutHashSiblingPath: this.previousOutHashHint.siblingPath, - startBlobAccumulator: this.startBlobAccumulator.toBlobAccumulator(), - finalBlobChallenges: this.finalBlobBatchingChallenges, - blobFields: padArrayEnd(blobFields, Fr.ZERO, FIELDS_PER_BLOB * BLOBS_PER_CHECKPOINT), - blobCommitments: padArrayEnd(blobCommitments, BLS12Point.ZERO, BLOBS_PER_CHECKPOINT), - blobsHash, - }); - - const [left, right] = nonEmptyProofs.map(p => toProofData(p)); - - return !right - ? new CheckpointRootSingleBlockRollupPrivateInputs(left, hints) - : new CheckpointRootRollupPrivateInputs([left, right], hints); - } - public getBlockProvingStateByBlockNumber(blockNumber: BlockNumber) { const index = Number(blockNumber) - Number(this.firstBlockNumber); return this.blocks[index]; @@ -317,13 +172,8 @@ export class CheckpointProvingState { return !!this.blockProofs.getSibling(location)?.provingOutput; } - public isReadyForCheckpointRoot() { - const allChildProofsReady = this.#getChildProofsForRoot().every(p => !!p); - return allChildProofsReady && !!this.previousOutHashHint && !!this.startBlobAccumulator; - } - public verifyState() { - return this.parentEpoch.verifyState(); + return this.isAlive(); } public getError() { @@ -337,14 +187,7 @@ export class CheckpointProvingState { public reject(reason: string) { this.error = reason; - this.parentEpoch.reject(reason); - } - - #getChildProofsForRoot() { - const rootLocation = { level: 0, index: 0 }; - return this.totalNumBlocks === 1 - ? [this.blockProofs.getNode(rootLocation)?.provingOutput] // If there's only 1 block, its proof will be stored at the root. 
- : this.blockProofs.getChildren(rootLocation).map(c => c?.provingOutput); + this.onReject(reason); } /** @@ -352,7 +195,10 @@ export class CheckpointProvingState { * Used by `CheckpointSubTreeOrchestrator` to surface its sub-tree result. */ public getSubTreeOutputProofs() { - return this.#getChildProofsForRoot(); + const rootLocation = { level: 0, index: 0 }; + return this.totalNumBlocks === 1 + ? [this.blockProofs.getNode(rootLocation)?.provingOutput] // If there's only 1 block, its proof will be stored at the root. + : this.blockProofs.getChildren(rootLocation).map(c => c?.provingOutput); } /** Sibling path of the archive tree captured before any block in this checkpoint landed. */ diff --git a/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.test.ts b/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.test.ts index 667764d22dc4..e0b48d2559b6 100644 --- a/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.test.ts +++ b/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.test.ts @@ -1,25 +1,35 @@ -import { FinalBlobBatchingChallenges } from '@aztec/blob-lib'; +import { MAX_L2_TO_L1_MSGS_PER_TX } from '@aztec/constants'; import { EpochNumber } from '@aztec/foundation/branded-types'; +import { padArrayEnd } from '@aztec/foundation/collection'; import { EthAddress } from '@aztec/foundation/eth-address'; import { createLogger } from '@aztec/foundation/log'; +import { ScopedL2ToL1Message, computeBlockOutHash } from '@aztec/stdlib/messaging'; +import { makeScopedL2ToL1Message } from '@aztec/stdlib/testing'; -import { TestContext } from '../mocks/test_context.js'; +import { TestContext, makeTestDeferredJobQueue } from '../mocks/test_context.js'; import { CheckpointSubTreeOrchestrator } from './checkpoint-sub-tree-orchestrator.js'; -import { EpochProvingContext } from './epoch-proving-context.js'; +import { ChonkCache } from './chonk-cache.js'; const logger = 
createLogger('prover-client:test:checkpoint-sub-tree-orchestrator'); +/** A full tx-worth of L2-to-L1 messages, padded to the per-tx maximum. */ +const makeL2ToL1Messages = (count: number) => + padArrayEnd( + Array.from({ length: count }, (_, i) => makeScopedL2ToL1Message((i + 1) * 789)), + ScopedL2ToL1Message.empty(), + MAX_L2_TO_L1_MSGS_PER_TX, + ); + describe('prover/orchestrator/checkpoint-sub-tree', () => { let context: TestContext; - let epochContext: EpochProvingContext; + let chonkCache: ChonkCache; beforeEach(async () => { context = await TestContext.new(logger); - epochContext = new EpochProvingContext(context.prover, EpochNumber(1)); + chonkCache = new ChonkCache(); }); afterEach(async () => { - epochContext.stop(); await context.cleanup(); }); @@ -34,9 +44,10 @@ describe('prover/orchestrator/checkpoint-sub-tree', () => { context.worldState, context.prover, EthAddress.ZERO, - epochContext, + chonkCache, + EpochNumber(1), false, - 10, + makeTestDeferredJobQueue(), constants, l1ToL2Messages, numBlocks, @@ -74,9 +85,10 @@ describe('prover/orchestrator/checkpoint-sub-tree', () => { context.worldState, context.prover, EthAddress.ZERO, - epochContext, + chonkCache, + EpochNumber(1), false, - 10, + makeTestDeferredJobQueue(), constants, l1ToL2Messages, numBlocks, @@ -101,47 +113,97 @@ describe('prover/orchestrator/checkpoint-sub-tree', () => { } }); - it('throws when startNewEpoch is called explicitly', async () => { - const { constants, l1ToL2Messages, previousBlockHeader } = await context.makeCheckpoint(1, { numTxsPerBlock: 0 }); + it('proves a checkpoint carrying L1-to-L2 messages', async () => { + // Cross-chain messages flow into the checkpoint's first block via the L1-to-L2 + // message tree; the sub-tree must prove them through without error (A-1039). 
+ const numBlocks = 1; + const { constants, blocks, l1ToL2Messages, previousBlockHeader } = await context.makeCheckpoint(numBlocks, { + numTxsPerBlock: 1, + numL1ToL2Messages: 3, + }); + expect(l1ToL2Messages.length).toBe(3); + const subTree = await CheckpointSubTreeOrchestrator.start( context.worldState, context.prover, EthAddress.ZERO, - epochContext, + chonkCache, + EpochNumber(1), false, - 10, + makeTestDeferredJobQueue(), constants, l1ToL2Messages, - 1, + numBlocks, previousBlockHeader, ); try { - expect(() => subTree.startNewEpoch(EpochNumber(2), 1, FinalBlobBatchingChallenges.empty())).toThrow( - /starts its epoch in the constructor/, - ); + const resultPromise = subTree.getSubTreeResult(); + + for (const block of blocks) { + const { blockNumber, timestamp } = block.header.globalVariables; + await subTree.startNewBlock(blockNumber, timestamp, block.txs.length); + if (block.txs.length > 0) { + await subTree.addTxs(block.txs); + } + await subTree.setBlockCompleted(blockNumber, block.header); + } + + const result = await resultPromise; + expect(result.blockProofOutputs).toHaveLength(1); + expect(result.blockProofOutputs[0].proof).toBeDefined(); } finally { await subTree.stop(); } }); - it('throws when startNewCheckpoint is called explicitly', async () => { - const { constants, l1ToL2Messages, previousBlockHeader } = await context.makeCheckpoint(1, { numTxsPerBlock: 0 }); + it('proves a checkpoint whose txs emit L2-to-L1 messages', async () => { + // L2-to-L1 (cross-chain) messages are carried on the public tx effects; the sub-tree + // must prove them through the base/block rollups without error (A-1039). + const numBlocks = 1; + const { constants, blocks, l1ToL2Messages, previousBlockHeader } = await context.makeCheckpoint(numBlocks, { + numTxsPerBlock: 1, + makeProcessedTxOpts: () => ({ + privateOnly: false, + avmAccumulatedData: { l2ToL1Msgs: makeL2ToL1Messages(2) }, + }), + }); + // Confirm the fixture actually attached the messages. 
+ expect(blocks[0].txs[0].txEffect.l2ToL1Msgs.length).toBe(2); + const subTree = await CheckpointSubTreeOrchestrator.start( context.worldState, context.prover, EthAddress.ZERO, - epochContext, + chonkCache, + EpochNumber(1), false, - 10, + makeTestDeferredJobQueue(), constants, l1ToL2Messages, - 1, + numBlocks, previousBlockHeader, ); try { - await expect(subTree.startNewCheckpoint(0, constants, l1ToL2Messages, 1, previousBlockHeader)).rejects.toThrow( - /drives its single checkpoint in `start`/, - ); + const resultPromise = subTree.getSubTreeResult(); + + for (const block of blocks) { + const { blockNumber, timestamp } = block.header.globalVariables; + await subTree.startNewBlock(blockNumber, timestamp, block.txs.length); + if (block.txs.length > 0) { + await subTree.addTxs(block.txs); + } + await subTree.setBlockCompleted(blockNumber, block.header); + } + + const result = await resultPromise; + expect(result.blockProofOutputs).toHaveLength(1); + expect(result.blockProofOutputs[0].proof).toBeDefined(); + + // The messages flow through the base/block rollups and end up in the block's outHash. 
+ const messagesPerTx = blocks[0].txs.map(tx => tx.txEffect.l2ToL1Msgs); + const expectedOutHash = computeBlockOutHash(messagesPerTx); + expect(expectedOutHash.isZero()).toBe(false); // sanity: the fixture really did carry messages + expect(result.blockProofOutputs[0].inputs.outHash).toEqual(expectedOutHash); } finally { await subTree.stop(); } diff --git a/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.ts b/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.ts index fed630aff151..55e22e1247cc 100644 --- a/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.ts +++ b/yarn-project/prover-client/src/orchestrator/checkpoint-sub-tree-orchestrator.ts @@ -1,30 +1,71 @@ -import { FinalBlobBatchingChallenges } from '@aztec/blob-lib'; -import type { ARCHIVE_HEIGHT, NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH } from '@aztec/constants'; -import type { EpochNumber } from '@aztec/foundation/branded-types'; +import type { SpongeBlob } from '@aztec/blob-lib/types'; +import { + type ARCHIVE_HEIGHT, + L1_TO_L2_MSG_SUBTREE_HEIGHT, + L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, + NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, + NUM_BASE_PARITY_PER_ROOT_PARITY, +} from '@aztec/constants'; +import { BlockNumber, type EpochNumber } from '@aztec/foundation/branded-types'; import { Fr } from '@aztec/foundation/curves/bn254'; +import { AbortError } from '@aztec/foundation/error'; import type { LoggerBindings } from '@aztec/foundation/log'; import { type PromiseWithResolvers, promiseWithResolvers } from '@aztec/foundation/promise'; -import type { Tuple } from '@aztec/foundation/serialize'; -import type { EthAddress } from '@aztec/stdlib/block'; +import type { SerialQueue } from '@aztec/foundation/queue'; +import { type Tuple, assertLength } from '@aztec/foundation/serialize'; +import type { TreeNodeLocation } from '@aztec/foundation/trees'; +import { EthAddress } from '@aztec/stdlib/block'; import type { 
ForkMerkleTreeOperations, + MerkleTreeWriteOperations, PublicInputsAndRecursiveProof, ReadonlyWorldStateAccess, ServerCircuitProver, } from '@aztec/stdlib/interfaces/server'; -import type { - BlockRollupPublicInputs, +import { appendL1ToL2MessagesToTree } from '@aztec/stdlib/messaging'; +import { + type BaseRollupHints, + type BlockRollupPublicInputs, + BlockRootEmptyTxFirstRollupPrivateInputs, + BlockRootFirstRollupPrivateInputs, + BlockRootSingleTxFirstRollupPrivateInputs, + BlockRootSingleTxRollupPrivateInputs, CheckpointConstantData, - PublicChonkVerifierPublicInputs, + PrivateTxBaseRollupPrivateInputs, + type PublicChonkVerifierPublicInputs, } from '@aztec/stdlib/rollup'; -import type { BlockHeader, Tx } from '@aztec/stdlib/tx'; -import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-client'; +import type { CircuitName } from '@aztec/stdlib/stats'; +import { type AppendOnlyTreeSnapshot, MerkleTreeId } from '@aztec/stdlib/trees'; +import type { BlockHeader, ProcessedTx, Tx } from '@aztec/stdlib/tx'; +import type { UInt64 } from '@aztec/stdlib/types'; +import { + Attributes, + type TelemetryClient, + type Tracer, + getTelemetryClient, + trackSpan, + wrapCallbackInSpan, +} from '@aztec/telemetry-client'; + +import { inspect } from 'util'; -import { getPublicChonkVerifierPrivateInputsFromTx } from './block-building-helpers.js'; +import { + buildHeaderFromCircuitOutputs, + getLastSiblingPath, + getPublicChonkVerifierPrivateInputsFromTx, + getRootTreeSiblingPath, + getSubtreeSiblingPath, + getTreeSnapshot, + insertSideEffectsAndBuildBaseRollupHints, + validatePartialState, + validateTx, +} from './block-building-helpers.js'; import type { BlockProvingState } from './block-proving-state.js'; -import type { CheckpointProvingState } from './checkpoint-proving-state.js'; -import type { EpochProvingContext } from './epoch-proving-context.js'; -import { ProvingOrchestrator } from './orchestrator.js'; +import { CheckpointProvingState } from 
'./checkpoint-proving-state.js'; +import type { ChonkCache } from './chonk-cache.js'; +import { ProvingOrchestratorMetrics } from './orchestrator_metrics.js'; +import { ProvingScheduler } from './proving-scheduler.js'; +import { TxProvingState } from './tx-proving-state.js'; /** * Result of proving a single checkpoint's block-level sub-tree. @@ -41,78 +82,114 @@ export type SubTreeResult = { previousArchiveSiblingPath: Tuple; }; +type TreeSnapshots = Map; + +/** + * Base rollup hints as produced before proving: `PrivateBaseRollupHints` / `PublicBaseRollupHints` + * deliberately carry no recursive proof or verification key. The proof + VK are supplied later, when + * `TxProvingState.getBaseRollupTypeAndInputs` wraps these hints into the "with proof + VK" types — + * `PrivateTxBaseRollupPrivateInputs` (a `ChonkProofData`) or `PublicTxBaseRollupPrivateInputs` (a + * chonk-verifier proof + AVM proof). Those proofs are *required constructor arguments* of the wrapper + * types, so the only way to obtain a provable input is to populate them — they cannot be silently + * omitted. Naming the proof-less hints type here makes that boundary explicit at `prepareBaseRollupInputs`. + */ +type BaseRollupHintsWithoutProofAndVK = BaseRollupHints; + /** * Orchestrates block-level proving for a single checkpoint, stopping at the boundary - * where checkpoint root rollup would otherwise begin. + * where checkpoint root rollup would otherwise begin. Used by the per-checkpoint + * `CheckpointProver` in production; the top-tree orchestrator then composes the + * sub-tree's block proofs into the epoch proof. * - * Reuses every circuit driver in `ProvingOrchestrator` (chonk verifier, base, merge, - * block-root, parity, block-merge) but overrides the gating method that escalates to - * checkpoint root rollup. Instead of escalating, the orchestrator resolves - * `getSubTreeResult()` once every block-level proof in the checkpoint's tree is ready. 
- * - * Wiring: a single-checkpoint epoch is created in the constructor (epoch number sourced - * from the supplied `EpochProvingContext`). The canonical way to obtain a fully-started - * sub-tree is the `start` static factory, which also drives the single internal - * `startNewCheckpoint(0, ...)` call. The sub-tree never calls `finalizeEpochStructure`; - * the override of `checkAndEnqueueCheckpointRootRollup` resolves `getSubTreeResult` once - * block-level proving completes. + * Wiring: a single-checkpoint mini-proving session is owned by the constructor. The + * canonical way to obtain a fully-started sub-tree is the `start` static factory, + * which also drives the single internal `startCheckpoint` call. The sub-tree never + * escalates past the checkpoint root boundary; `getSubTreeResult()` resolves once + * every block-level proof in the checkpoint's tree is ready. */ -export class CheckpointSubTreeOrchestrator extends ProvingOrchestrator { +export class CheckpointSubTreeOrchestrator extends ProvingScheduler { + /** The single checkpoint proving state this sub-tree owns. Allocated in the `start` factory. */ + protected provingState: CheckpointProvingState | undefined = undefined; private readonly subTreeResult: PromiseWithResolvers; + private readonly metrics: ProvingOrchestratorMetrics; + private dbs: Map = new Map(); constructor( - dbProvider: ReadonlyWorldStateAccess & ForkMerkleTreeOperations, - prover: ServerCircuitProver, - proverId: EthAddress, + private readonly dbProvider: ReadonlyWorldStateAccess & ForkMerkleTreeOperations, + protected readonly prover: ServerCircuitProver, + private readonly proverId: EthAddress, /** - * Per-epoch shared chonk-verifier proof cache. Every chonk-verifier proof started - * by this sub-tree lives on the context and survives the sub-tree's cancellation, - * so a tx whose original checkpoint is reorged out and re-appears in a replacement - * checkpoint reuses the cached proof. 
The context's `epochNumber` is the epoch - * this sub-tree proves into. + * Shared chonk-verifier proof cache. Every chonk-verifier proof started by this + * sub-tree lives on the cache and survives the sub-tree's cancellation, so a tx + * whose original checkpoint is reorged out and re-appears in a replacement + * checkpoint reuses the cached proof. */ - private readonly epochContext: EpochProvingContext, - cancelJobsOnStop: boolean = false, - enqueueConcurrency: number, + private readonly chonkCache: ChonkCache, + /** The epoch this sub-tree proves into. */ + private readonly epochNumber: EpochNumber, + private readonly cancelJobsOnStop: boolean = false, + deferredJobQueue: SerialQueue, telemetryClient: TelemetryClient = getTelemetryClient(), bindings?: LoggerBindings, ) { - super(dbProvider, prover, proverId, cancelJobsOnStop, enqueueConcurrency, telemetryClient, bindings); - - // Single-checkpoint mini-epoch by construction. The total/challenges supplied to - // `super.startNewEpoch` are never read, because the sub-tree overrides - // `checkAndEnqueueCheckpointRootRollup` to short-circuit before the parent's - // checkpoint-root / finalize machinery would consume them. - super.startNewEpoch(epochContext.epochNumber, 1, FinalBlobBatchingChallenges.empty()); + super(deferredJobQueue, 'prover-client:checkpoint-sub-tree-orchestrator', bindings); + this.metrics = new ProvingOrchestratorMetrics(telemetryClient, 'CheckpointSubTreeOrchestrator'); this.subTreeResult = promiseWithResolvers(); // Mark the rejection branch as observed so a `cancel()` or proving failure does not // surface an unhandled rejection when no consumer awaits getSubTreeResult(). this.subTreeResult.promise.catch(() => {}); + } - // If the parent's proving state ever rejects, surface the failure on the sub-tree promise. 
- void this.provingPromise!.then(result => { - if (result.status === 'failure') { - this.subTreeResult.reject(new Error(result.reason)); - } - }); + /** Tracks whether `cancel()` has been called; flows into the checkpoint state's isAlive hook. */ + private cancelled = false; + + public get tracer(): Tracer { + return this.metrics.tracer; + } + + public getProverId(): EthAddress { + return this.proverId; + } + + public getNumActiveForks(): number { + return this.dbs.size; + } + + /** Returns a promise that resolves when block-level proving completes for the checkpoint. */ + public getSubTreeResult(): Promise { + return this.subTreeResult.promise; + } + + /** + * Returns the archive sibling path captured at the internal checkpoint start. + * Available synchronously once `start` has resolved, before block-level proving + * completes. The top-tree consumer uses this to assemble checkpoint root rollup hints + * up-front so checkpoint root proofs can pipeline against in-flight sub-tree proving. + */ + public getPreviousArchiveSiblingPath(): Tuple { + if (!this.provingState) { + throw new Error('Checkpoint not started; call CheckpointSubTreeOrchestrator.start first.'); + } + return this.provingState.getLastArchiveSiblingPath(); } /** * Constructs and starts a sub-tree for a single checkpoint. The returned sub-tree - * has had its single internal `startNewCheckpoint(0, ...)` driven; callers proceed - * directly to per-block `startNewBlock` / `addTxs` / `setBlockCompleted`. + * has had its single internal checkpoint state allocated; callers proceed directly + * to per-block `startNewBlock` / `addTxs` / `setBlockCompleted`. * - * If the internal `startNewCheckpoint` rejects, the partially-constructed sub-tree - * is stopped before the error propagates, so no broker resources leak. + * If the internal start rejects, the partially-constructed sub-tree is stopped + * before the error propagates, so no broker resources leak. 
*/ public static async start( dbProvider: ReadonlyWorldStateAccess & ForkMerkleTreeOperations, prover: ServerCircuitProver, proverId: EthAddress, - epochContext: EpochProvingContext, + chonkCache: ChonkCache, + epochNumber: EpochNumber, cancelJobsOnStop: boolean, - enqueueConcurrency: number, + deferredJobQueue: SerialQueue, checkpointConstants: CheckpointConstantData, l1ToL2Messages: Fr[], totalNumBlocks: number, @@ -124,16 +201,15 @@ export class CheckpointSubTreeOrchestrator extends ProvingOrchestrator { dbProvider, prover, proverId, - epochContext, + chonkCache, + epochNumber, cancelJobsOnStop, - enqueueConcurrency, + deferredJobQueue, telemetryClient, bindings, ); try { - await ProvingOrchestrator.prototype.startNewCheckpoint.call( - subTree, - 0, + await subTree.startCheckpoint( checkpointConstants, l1ToL2Messages, totalNumBlocks, @@ -146,91 +222,426 @@ export class CheckpointSubTreeOrchestrator extends ProvingOrchestrator { } } - /** Returns a promise that resolves when block-level proving completes for the checkpoint. */ - public getSubTreeResult(): Promise { - return this.subTreeResult.promise; + /** + * Kickstart chonk-verifier circuits via the shared `ChonkCache`. The cache owns the + * broker job lifecycle, so the proof survives this sub-tree's `cancel()` — a tx that + * ends up in a replacement checkpoint after a reorg can pick the cached promise up + * and skip re-proving. + */ + public startChonkVerifierCircuits(txs: Tx[]): Promise { + if (!this.provingState?.verifyState()) { + return Promise.reject(new Error('Sub-tree proving state is not active.')); + } + const publicTxs = txs.filter(tx => tx.data.forPublic); + for (const tx of publicTxs) { + const txHash = tx.getTxHash().toString(); + const inputs = getPublicChonkVerifierPrivateInputsFromTx(tx, this.getProverId().toField()); + // Fire and forget — getOrEnqueueChonkVerifier later picks up the cached promise + // when the tx is processed inside its block. 
+ void this.chonkCache.getOrCompute(txHash, signal => + this.prover.getPublicChonkVerifierProof(inputs, signal, this.epochNumber), + ); + } + return Promise.resolve(); } + // ---------------- per-block driving (called by the per-checkpoint CheckpointProver) ---------------- + /** - * The epoch is started in the constructor. + * Starts off a new block. + * @param blockNumber - The block number + * @param timestamp - The timestamp of the block. Required for empty blocks to construct private inputs. + * @param totalNumTxs - The total number of txs in the block. */ - public override startNewEpoch( - _epochNumber: EpochNumber, - _totalNumCheckpoints: number, - _finalBlobBatchingChallenges: FinalBlobBatchingChallenges, - ): void { - throw new Error('CheckpointSubTreeOrchestrator starts its epoch in the constructor; do not call startNewEpoch.'); + @trackSpan('CheckpointSubTreeOrchestrator.startNewBlock', blockNumber => ({ + [Attributes.BLOCK_NUMBER]: blockNumber, + })) + public async startNewBlock(blockNumber: BlockNumber, timestamp: UInt64, totalNumTxs: number) { + if (!this.provingState) { + throw new Error('Empty proving state. The checkpoint sub-tree has not been started.'); + } + + if (!this.provingState.isAcceptingBlocks()) { + throw new Error(`Checkpoint not accepting further blocks.`); + } + + const constants = this.provingState.constants; + this.logger.info(`Starting block ${blockNumber} for slot ${constants.slotNumber}.`); + + // Fork the db only when it's not already set. The db for the first block is set in startCheckpoint. + if (!this.dbs.has(blockNumber)) { + // Fork world state at the end of the immediately previous block. + const db = await this.dbProvider.fork(BlockNumber(blockNumber - 1)); + this.dbs.set(blockNumber, db); + } + const db = this.getDbForBlock(blockNumber); + + // Get archive snapshot and sibling path before any txs in this block lands. 
+ const lastArchiveTreeSnapshot = await getTreeSnapshot(MerkleTreeId.ARCHIVE, db); + const lastArchiveSiblingPath = await getRootTreeSiblingPath(MerkleTreeId.ARCHIVE, db); + + const blockProvingState = this.provingState.startNewBlock( + blockNumber, + timestamp, + totalNumTxs, + lastArchiveTreeSnapshot, + lastArchiveSiblingPath, + ); + + // Enqueue base parity circuits for the first block in the checkpoint. + if (blockProvingState.index === 0) { + for (let i = 0; i < NUM_BASE_PARITY_PER_ROOT_PARITY; i++) { + this.enqueueBaseParityCircuit(this.provingState, blockProvingState, i); + } + } + + // Because `addTxs` won't be called for a block without txs, and that's where the sponge blob state is computed, + // set its end sponge blob here. This becomes the start sponge blob for the next block. + if (totalNumTxs === 0) { + const endState = await db.getStateReference(); + blockProvingState.setEndState(endState); + + const endSpongeBlob = blockProvingState.getStartSpongeBlob().clone(); + const blockEndBlobFields = blockProvingState.getBlockEndBlobFields(); + await endSpongeBlob.absorb(blockEndBlobFields); + blockProvingState.setEndSpongeBlob(endSpongeBlob); + } } /** - * The single internal checkpoint is started by the `start` factory + * The interface to add simulated transactions to the scheduler. Called at most once per block. 
+ * @param txs - The transactions to be proven */ - public override startNewCheckpoint( - _checkpointIndex: number, - _constants: CheckpointConstantData, - _l1ToL2Messages: Fr[], - _totalNumBlocks: number, - _headerOfLastBlockInPreviousCheckpoint: BlockHeader, - ): Promise { - return Promise.reject( - new Error( - 'CheckpointSubTreeOrchestrator drives its single checkpoint in `start`; do not call startNewCheckpoint.', - ), - ); + @trackSpan('CheckpointSubTreeOrchestrator.addTxs', txs => ({ + [Attributes.BLOCK_TXS_COUNT]: txs.length, + })) + public async addTxs(txs: ProcessedTx[]): Promise { + if (!this.provingState) { + throw new Error(`Empty proving state. The checkpoint sub-tree has not been started.`); + } + + if (!txs.length) { + // Empty block: setBlockCompleted handles this without addTxs being called. Bail to + // avoid the throw below (we cannot find the blockNumber without any txs). + this.logger.warn(`Provided no txs to addTxs.`); + return; + } + + const blockNumber = BlockNumber(txs[0].globalVariables.blockNumber); + const provingState = this.provingState.getBlockProvingStateByBlockNumber(blockNumber!); + if (!provingState) { + throw new Error(`Proving state for block ${blockNumber} not found. Call startNewBlock first.`); + } + + if (provingState.totalNumTxs !== txs.length) { + throw new Error( + `Block ${blockNumber} should be filled with ${provingState.totalNumTxs} txs. 
Received ${txs.length} txs.`, + ); + } + + if (!provingState.isAcceptingTxs()) { + throw new Error(`Block ${blockNumber} has been initialized with transactions.`); + } + + this.logger.info(`Adding ${txs.length} transactions to block ${blockNumber}`); + + const db = this.getDbForBlock(blockNumber); + const lastArchive = provingState.lastArchiveTreeSnapshot; + const newL1ToL2MessageTreeSnapshot = provingState.newL1ToL2MessageTreeSnapshot; + const spongeBlobState = provingState.getStartSpongeBlob().clone(); + + for (const tx of txs) { + try { + if (!provingState.verifyState()) { + throw new Error(`Invalid proving state when adding a tx`); + } + + validateTx(tx); + + this.logger.debug(`Received transaction: ${tx.hash}`); + + const startSpongeBlob = spongeBlobState.clone(); + const [hints, treeSnapshots] = await this.prepareBaseRollupInputs( + tx, + lastArchive, + newL1ToL2MessageTreeSnapshot, + startSpongeBlob, + db, + ); + + if (!provingState.verifyState()) { + throw new Error(`Unable to add transaction, preparing base inputs failed`); + } + + await spongeBlobState.absorb(tx.txEffect.toBlobFields()); + + const txProvingState = new TxProvingState(tx, hints, treeSnapshots, this.proverId.toField()); + const txIndex = provingState.addNewTx(txProvingState); + if (txProvingState.requireAvmProof) { + this.getOrEnqueueChonkVerifier(provingState, txIndex); + this.logger.debug(`Enqueueing public VM for tx ${txIndex}`); + this.enqueueVM(provingState, txIndex); + } else { + this.logger.debug(`Enqueueing base rollup for private-only tx ${txIndex}`); + this.enqueueBaseRollup(provingState, txIndex); + } + } catch (err: any) { + throw new Error(`Error adding transaction ${tx.hash.toString()} to block ${blockNumber}: ${err.message}`, { + cause: err, + }); + } + } + + const endState = await db.getStateReference(); + provingState.setEndState(endState); + + const blockEndBlobFields = provingState.getBlockEndBlobFields(); + await spongeBlobState.absorb(blockEndBlobFields); + + 
provingState.setEndSpongeBlob(spongeBlobState); } /** - * Returns the archive sibling path captured at the internal `startNewCheckpoint`. - * Available synchronously once `start` has resolved, before block-level proving - * completes. The top-tree consumer uses this to assemble checkpoint root rollup hints - * up-front so checkpoint root proofs can pipeline against in-flight sub-tree proving. + * Marks the block as completed. + * Computes the block header and updates the archive tree. */ - public getPreviousArchiveSiblingPath(): Tuple { - const checkpoint = this.provingState!.getCheckpointProvingState(0); - if (!checkpoint) { - throw new Error('Checkpoint not started; call CheckpointSubTreeOrchestrator.start first.'); + @trackSpan('CheckpointSubTreeOrchestrator.setBlockCompleted', (blockNumber: BlockNumber) => ({ + [Attributes.BLOCK_NUMBER]: blockNumber, + })) + public async setBlockCompleted(blockNumber: BlockNumber, expectedHeader?: BlockHeader): Promise { + const provingState = this.provingState?.getBlockProvingStateByBlockNumber(blockNumber); + if (!provingState) { + throw new Error(`Block proving state for ${blockNumber} not found`); } - return checkpoint.getLastArchiveSiblingPath(); + + // Abort with specific error for the block if there's one. + const error = provingState.getError(); + if (error) { + throw new Error(`Block proving failed: ${error}`); + } + + // Abort if the proving state is not valid due to errors occurred elsewhere. + if (!provingState.verifyState()) { + throw new Error(`Invalid proving state when completing block ${blockNumber}.`); + } + + if (provingState.isAcceptingTxs()) { + throw new Error( + `Block ${blockNumber} is still accepting txs. Call setBlockCompleted after all txs have been added.`, + ); + } + + // Given we've applied every change from this block, now assemble the block header: + this.logger.verbose(`Block ${blockNumber} completed. 
Assembling header.`); + const header = await provingState.buildBlockHeader(); + + if (expectedHeader && !header.equals(expectedHeader)) { + this.logger.error(`Block header mismatch: header=${header} expectedHeader=${expectedHeader}`); + throw new Error('Block header mismatch'); + } + + // Get db for this block and remove from map — no other code should use it after this point. + const db = this.getDbForBlock(provingState.blockNumber); + this.dbs.delete(provingState.blockNumber); + + // Update the archive tree, capture the snapshot, and close the fork deterministically. + try { + this.logger.verbose( + `Updating archive tree with block ${provingState.blockNumber} header ${(await header.hash()).toString()}`, + ); + await db.updateArchive(header); + provingState.setBuiltArchive(await getTreeSnapshot(MerkleTreeId.ARCHIVE, db)); + } finally { + await db.close(); + } + + await this.verifyBuiltBlockAgainstSyncedState(provingState); + + return header; } + // ---------------- lifecycle ---------------- + /** - * Override the checkpoint-root boundary: instead of escalating to checkpoint root, - * resolve the sub-tree promise with the block-level proof outputs once they're all ready. + * Cancels any further proving. If `cancelJobsOnStop` was set, aborts all pending broker jobs + * (used on reorg). Otherwise jobs remain in the broker queue and can be reused on restart. */ - // eslint-disable-next-line require-await - protected override async checkAndEnqueueCheckpointRootRollup(provingState: CheckpointProvingState): Promise { - const proofs = provingState.getSubTreeOutputProofs(); - const nonEmpty = proofs.filter((p): p is NonNullable => !!p); - if (proofs.length !== nonEmpty.length) { - // Block merge tree not fully resolved yet — will be retried as more block proofs land. 
- return; + public cancel() { + this.cancelled = true; + this.resetSchedulerState(this.cancelJobsOnStop); + // Reject the proving state (and hence subTreeResult) so anyone awaiting the sub-tree result + // is released rather than hanging — matching TopTreeOrchestrator.cancel(). + this.provingState?.cancel(); + + for (const [blockNumber, db] of this.dbs.entries()) { + void db.close().catch(err => this.logger.error(`Error closing db for block ${blockNumber}`, err)); } + this.dbs.clear(); + } - this.subTreeResult.resolve({ - blockProofOutputs: nonEmpty, - previousArchiveSiblingPath: provingState.getLastArchiveSiblingPath(), - }); + protected override cancelInternal(): void { + this.cancel(); } + // ---------------- private: per-checkpoint init ---------------- + /** - * Kickstart chonk-verifier circuits via the shared `EpochProvingContext`. The context - * owns the broker job lifecycle, so the proof survives this sub-tree's `cancel()` — - * a tx that ends up in a replacement checkpoint after a reorg can pick the cached - * promise up and skip re-proving. + * Internal driver for the single-checkpoint init. Allocates the world-state fork, + * inserts L1-to-L2 messages, and creates the per-checkpoint proving state with this + * sub-tree as its parent. Only called once, from the `start` factory. 
*/ - public override startChonkVerifierCircuits(txs: Tx[]): Promise { - if (!this.provingState?.verifyState()) { - return Promise.reject(new Error('Sub-tree proving state is not active.')); + private async startCheckpoint( + constants: CheckpointConstantData, + l1ToL2Messages: Fr[], + totalNumBlocks: number, + headerOfLastBlockInPreviousCheckpoint: BlockHeader, + ): Promise { + if (this.provingState) { + throw new Error('Checkpoint sub-tree already started.'); } - const publicTxs = txs.filter(tx => tx.data.forPublic); - for (const tx of publicTxs) { - const txHash = tx.getTxHash().toString(); - const inputs = getPublicChonkVerifierPrivateInputsFromTx(tx, this.getProverId().toField()); - // Fire and forget — getOrEnqueueChonkVerifier later picks up the cached promise - // when the tx is processed inside its block. - void this.epochContext.enqueue(txHash, inputs); + + // Fork world state at the end of the immediately previous block. + const lastBlockNumber = headerOfLastBlockInPreviousCheckpoint.globalVariables.blockNumber; + const db = await this.dbProvider.fork(lastBlockNumber); + + const firstBlockNumber = BlockNumber(lastBlockNumber + 1); + this.dbs.set(firstBlockNumber, db); + + // Get archive sibling path before any block in this checkpoint lands. + const lastArchiveSiblingPath = await getLastSiblingPath(MerkleTreeId.ARCHIVE, db); + + // Insert all the l1 to l2 messages into the db. Get the states before and after the insertion. 
+ const { + lastL1ToL2MessageTreeSnapshot, + lastL1ToL2MessageSubtreeRootSiblingPath, + newL1ToL2MessageTreeSnapshot, + newL1ToL2MessageSubtreeRootSiblingPath, + } = await this.updateL1ToL2MessageTree(l1ToL2Messages, db); + + this.provingState = new CheckpointProvingState( + /* index */ 0, + constants, + totalNumBlocks, + headerOfLastBlockInPreviousCheckpoint, + lastArchiveSiblingPath, + l1ToL2Messages, + lastL1ToL2MessageTreeSnapshot, + lastL1ToL2MessageSubtreeRootSiblingPath, + newL1ToL2MessageTreeSnapshot, + newL1ToL2MessageSubtreeRootSiblingPath, + Number(this.epochNumber), + /* isAlive */ () => !this.cancelled, + /* onReject */ reason => this.subTreeResult.reject(new Error(reason)), + ); + } + + // ---------------- private: per-block proof orchestration ---------------- + + private async updateL1ToL2MessageTree(l1ToL2Messages: Fr[], db: MerkleTreeWriteOperations) { + const lastL1ToL2MessageTreeSnapshot = await getTreeSnapshot(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, db); + const lastL1ToL2MessageSubtreeRootSiblingPath = assertLength( + await getSubtreeSiblingPath(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, L1_TO_L2_MSG_SUBTREE_HEIGHT, db), + L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, + ); + + // Update the local trees to include the new l1 to l2 messages. + await appendL1ToL2MessagesToTree(db, l1ToL2Messages); + + const newL1ToL2MessageTreeSnapshot = await getTreeSnapshot(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, db); + const newL1ToL2MessageSubtreeRootSiblingPath = assertLength( + await getSubtreeSiblingPath(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, L1_TO_L2_MSG_SUBTREE_HEIGHT, db), + L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, + ); + + return { + lastL1ToL2MessageTreeSnapshot, + lastL1ToL2MessageSubtreeRootSiblingPath, + newL1ToL2MessageTreeSnapshot, + newL1ToL2MessageSubtreeRootSiblingPath, + }; + } + + // Updates the merkle trees for a transaction. The first enqueued job for a transaction. 
+ @trackSpan('CheckpointSubTreeOrchestrator.prepareBaseRollupInputs', tx => ({ + [Attributes.TX_HASH]: tx.hash.toString(), + })) + private async prepareBaseRollupInputs( + tx: ProcessedTx, + lastArchive: AppendOnlyTreeSnapshot, + newL1ToL2MessageTreeSnapshot: AppendOnlyTreeSnapshot, + startSpongeBlob: SpongeBlob, + db: MerkleTreeWriteOperations, + ): Promise<[BaseRollupHintsWithoutProofAndVK, TreeSnapshots]> { + // These hints deliberately carry no recursive proof or verification key — see + // BaseRollupHintsWithoutProofAndVK. The tx's proof + VK are attached later in + // TxProvingState.getBaseRollupTypeAndInputs from the proven chonk-verifier / kernel / AVM + // proofs, which are required there and so cannot be silently omitted. + const start = performance.now(); + const hints = await insertSideEffectsAndBuildBaseRollupHints( + tx, + lastArchive, + newL1ToL2MessageTreeSnapshot, + startSpongeBlob, + this.proverId.toField(), + db, + ); + this.metrics.recordBaseRollupInputs(performance.now() - start); + + const promises = [MerkleTreeId.NOTE_HASH_TREE, MerkleTreeId.NULLIFIER_TREE, MerkleTreeId.PUBLIC_DATA_TREE].map( + async (id: MerkleTreeId) => { + return { key: id, value: await getTreeSnapshot(id, db) }; + }, + ); + const treeSnapshots: TreeSnapshots = new Map((await Promise.all(promises)).map(obj => [obj.key, obj.value])); + + return [hints, treeSnapshots]; + } + + // Executes the base rollup circuit and stores the output as intermediate state for the parent merge/root circuit. + // Executes the next level of merge if all inputs are available. 
+ private enqueueBaseRollup(provingState: BlockProvingState, txIndex: number) { + if (!provingState.verifyState()) { + this.logger.debug('Not running base rollup, state invalid'); + return; } - return Promise.resolve(); + + if (!provingState.tryStartProvingBase(txIndex)) { + this.logger.debug(`Base rollup for tx ${txIndex} already started.`); + return; + } + + const txProvingState = provingState.getTxProvingState(txIndex); + const { processedTx } = txProvingState; + const { rollupType, inputs } = txProvingState.getBaseRollupTypeAndInputs(); + + this.logger.debug(`Enqueuing deferred proving base rollup for ${processedTx.hash.toString()}`); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + inputs instanceof PrivateTxBaseRollupPrivateInputs + ? 'getPrivateTxBaseRollupProof' + : 'getPublicTxBaseRollupProof', + signal => { + if (inputs instanceof PrivateTxBaseRollupPrivateInputs) { + return this.prover.getPrivateTxBaseRollupProof(inputs, signal, provingState.epochNumber); + } else { + return this.prover.getPublicTxBaseRollupProof(inputs, signal, provingState.epochNumber); + } + }, + { [Attributes.TX_HASH]: processedTx.hash.toString(), [Attributes.PROTOCOL_CIRCUIT_NAME]: rollupType }, + ), + result => { + this.logger.debug(`Completed proof for ${rollupType} for tx ${processedTx.hash.toString()}`); + validatePartialState(result.inputs.endTreeSnapshots, txProvingState.treeSnapshots); + const leafLocation = provingState.setBaseRollupProof(txIndex, result); + if (provingState.totalNumTxs === 1) { + this.checkAndEnqueueBlockRootRollup(provingState); + } else { + this.checkAndEnqueueNextMergeRollup(provingState, leafLocation); + } + }, + ); } /** @@ -238,7 +649,7 @@ export class CheckpointSubTreeOrchestrator extends ProvingOrchestrator { * cached promise (or enqueue if missing), then `.then(handleResult)` to progress to * the base rollup once the proof lands. 
*/ - protected override getOrEnqueueChonkVerifier(provingState: BlockProvingState, txIndex: number) { + private getOrEnqueueChonkVerifier(provingState: BlockProvingState, txIndex: number) { if (!provingState.verifyState()) { return; } @@ -259,13 +670,375 @@ export class CheckpointSubTreeOrchestrator extends ProvingOrchestrator { this.checkAndEnqueueBaseRollup(provingState, txIndex); }; - let promise = this.epochContext.getCached(txHash); - if (!promise) { - promise = this.epochContext.enqueue(txHash, txProvingState.getPublicChonkVerifierPrivateInputs()); + const promise = this.chonkCache.getOrCompute(txHash, signal => + this.prover.getPublicChonkVerifierProof( + txProvingState.getPublicChonkVerifierPrivateInputs(), + signal, + this.epochNumber, + ), + ); + void promise.then(handleResult).catch(err => { + // The cache self-cleans on rejection, so a replacement sub-tree for this tx will see the + // miss and re-enqueue. But if this proving state is still active, the failure must abort + // it: otherwise the base rollup for this tx is never enqueued and the checkpoint (and + // epoch) orchestrators hang forever waiting for a proof that will never arrive. + if (err instanceof AbortError || !provingState.verifyState()) { + return; + } + this.logger.error(`Chonk verifier proof failed for tx ${txHash}`, err); + provingState.reject(`Chonk verifier proof failed for tx ${txHash}: ${err}`); + }); + } + + // Executes the merge rollup circuit. Enqueues the next level of merge if all inputs are available. + private enqueueMergeRollup(provingState: BlockProvingState, location: TreeNodeLocation) { + if (!provingState.verifyState()) { + this.logger.debug('Not running merge rollup. 
State no longer valid.'); + return; + } + + if (!provingState.tryStartProvingMerge(location)) { + this.logger.debug('Merge rollup already started.'); + return; + } + + const inputs = provingState.getMergeRollupInputs(location); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getTxMergeRollupProof', + signal => this.prover.getTxMergeRollupProof(inputs, signal, provingState.epochNumber), + { [Attributes.PROTOCOL_CIRCUIT_NAME]: 'rollup-tx-merge' satisfies CircuitName }, + ), + result => { + provingState.setMergeRollupProof(location, result); + this.checkAndEnqueueNextMergeRollup(provingState, location); + }, + ); + } + + // Executes the block root rollup circuit. + private enqueueBlockRootRollup(provingState: BlockProvingState) { + if (!provingState.verifyState()) { + this.logger.debug('Not running block root rollup, state no longer valid'); + return; } - void promise.then(handleResult).catch(() => { - // The context self-cleans on rejection; a future call (replacement sub-tree - // for this tx) will see the miss and re-enqueue. No action needed here. 
+ + if (!provingState.tryStartProvingBlockRoot()) { + this.logger.debug('Block root rollup already started.'); + return; + } + + const { rollupType, inputs } = provingState.getBlockRootRollupTypeAndInputs(); + + this.logger.debug(`Enqueuing ${rollupType} for block ${provingState.blockNumber}.`); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getBlockRootRollupProof', + signal => { + if (inputs instanceof BlockRootFirstRollupPrivateInputs) { + return this.prover.getBlockRootFirstRollupProof(inputs, signal, provingState.epochNumber); + } else if (inputs instanceof BlockRootSingleTxFirstRollupPrivateInputs) { + return this.prover.getBlockRootSingleTxFirstRollupProof(inputs, signal, provingState.epochNumber); + } else if (inputs instanceof BlockRootEmptyTxFirstRollupPrivateInputs) { + return this.prover.getBlockRootEmptyTxFirstRollupProof(inputs, signal, provingState.epochNumber); + } else if (inputs instanceof BlockRootSingleTxRollupPrivateInputs) { + return this.prover.getBlockRootSingleTxRollupProof(inputs, signal, provingState.epochNumber); + } else { + return this.prover.getBlockRootRollupProof(inputs, signal, provingState.epochNumber); + } + }, + { [Attributes.PROTOCOL_CIRCUIT_NAME]: rollupType }, + ), + async result => { + this.logger.debug(`Completed ${rollupType} proof for block ${provingState.blockNumber}`, { + blockNumber: provingState.blockNumber, + checkpointIndex: provingState.parentCheckpoint.index, + ...result.inputs.toInspect(), + }); + + const leafLocation = provingState.setBlockRootRollupProof(result); + const checkpointProvingState = provingState.parentCheckpoint; + + // Verification is called from both here and setBlockCompleted. Whichever runs last + // will be the first to see all three pieces (header, proof output, archive) and run the checks. 
+ await this.verifyBuiltBlockAgainstSyncedState(provingState); + + if (checkpointProvingState.totalNumBlocks === 1) { + this.checkAndEnqueueSubTreeResolution(checkpointProvingState); + } else { + this.checkAndEnqueueNextBlockMergeRollup(checkpointProvingState, leafLocation); + } + }, + ); + } + + // Executes the base parity circuit. Enqueues the root parity circuit if all inputs are available. + private enqueueBaseParityCircuit( + checkpointProvingState: CheckpointProvingState, + provingState: BlockProvingState, + baseParityIndex: number, + ) { + if (!provingState.verifyState()) { + this.logger.debug('Not running base parity. State no longer valid.'); + return; + } + + if (!provingState.tryStartProvingBaseParity(baseParityIndex)) { + this.logger.warn(`Base parity ${baseParityIndex} already started.`); + return; + } + + const inputs = checkpointProvingState.getBaseParityInputs(baseParityIndex); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getBaseParityProof', + signal => this.prover.getBaseParityProof(inputs, signal, provingState.epochNumber), + { [Attributes.PROTOCOL_CIRCUIT_NAME]: 'parity-base' satisfies CircuitName }, + ), + provingOutput => { + provingState.setBaseParityProof(baseParityIndex, provingOutput); + this.checkAndEnqueueRootParityCircuit(provingState); + }, + ); + } + + private checkAndEnqueueRootParityCircuit(provingState: BlockProvingState) { + if (!provingState.isReadyForRootParity()) { + return; + } + this.enqueueRootParityCircuit(provingState); + } + + // Runs the root parity circuit and stores the outputs. + // Enqueues the block root rollup if all inputs are available. + private enqueueRootParityCircuit(provingState: BlockProvingState) { + if (!provingState.verifyState()) { + this.logger.debug('Not running root parity. 
State no longer valid.'); + return; + } + + if (!provingState.tryStartProvingRootParity()) { + this.logger.debug('Root parity already started.'); + return; + } + + const inputs = provingState.getParityRootInputs(); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getRootParityProof', + signal => this.prover.getRootParityProof(inputs, signal, provingState.epochNumber), + { [Attributes.PROTOCOL_CIRCUIT_NAME]: 'parity-root' satisfies CircuitName }, + ), + result => { + provingState.setRootParityProof(result); + this.checkAndEnqueueBlockRootRollup(provingState); + }, + ); + } + + // Executes the block merge rollup circuit. + private enqueueBlockMergeRollup(provingState: CheckpointProvingState, location: TreeNodeLocation) { + if (!provingState.verifyState()) { + this.logger.debug('Not running block merge rollup. State no longer valid.'); + return; + } + + if (!provingState.tryStartProvingBlockMerge(location)) { + this.logger.debug('Block merge rollup already started.'); + return; + } + + const inputs = provingState.getBlockMergeRollupInputs(location); + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getBlockMergeRollupProof', + signal => this.prover.getBlockMergeRollupProof(inputs, signal, provingState.epochNumber), + { [Attributes.PROTOCOL_CIRCUIT_NAME]: 'rollup-block-merge' satisfies CircuitName }, + ), + result => { + this.logger.debug(`Completed block merge rollup proof for checkpoint ${provingState.index}`, { + checkpointIndex: provingState.index, + mergeLocation: location, + ...result.inputs.toInspect(), + }); + provingState.setBlockMergeRollupProof(location, result); + this.checkAndEnqueueNextBlockMergeRollup(provingState, location); + }, + ); + } + + private checkAndEnqueueNextMergeRollup(provingState: BlockProvingState, currentLocation: TreeNodeLocation) { + if (!provingState.isReadyForMergeRollup(currentLocation)) { + return; + } + const parentLocation = provingState.getParentLocation(currentLocation); + if 
(parentLocation.level === 0) { + this.checkAndEnqueueBlockRootRollup(provingState); + } else { + this.enqueueMergeRollup(provingState, parentLocation); + } + } + + private checkAndEnqueueBlockRootRollup(provingState: BlockProvingState) { + if (!provingState.isReadyForBlockRootRollup()) { + this.logger.debug('Not ready for block root rollup'); + return; + } + this.enqueueBlockRootRollup(provingState); + } + + private checkAndEnqueueNextBlockMergeRollup( + provingState: CheckpointProvingState, + currentLocation: TreeNodeLocation, + ): void { + if (!provingState.isReadyForBlockMerge(currentLocation)) { + return; + } + const parentLocation = provingState.getParentLocation(currentLocation); + if (parentLocation.level === 0) { + this.checkAndEnqueueSubTreeResolution(provingState); + } else { + this.enqueueBlockMergeRollup(provingState, parentLocation); + } + } + + /** + * Sub-tree analogue of the orchestrator's `checkAndEnqueueCheckpointRootRollup`: + * resolves the sub-tree promise with the block-level proof outputs once they're all ready, + * instead of escalating to the checkpoint root rollup. + */ + private checkAndEnqueueSubTreeResolution(provingState: CheckpointProvingState): void { + const proofs = provingState.getSubTreeOutputProofs(); + const nonEmpty = proofs.filter((p): p is NonNullable => !!p); + if (proofs.length !== nonEmpty.length) { + // Block merge tree not fully resolved yet — retried as more block proofs land. + return; + } + this.subTreeResult.resolve({ + blockProofOutputs: nonEmpty, + previousArchiveSiblingPath: provingState.getLastArchiveSiblingPath(), }); } + + /** + * Executes the VM circuit for a public function. Enqueues the base rollup once the + * tx's chonk-verifier + VM proofs are both ready. 
+ */ + private enqueueVM(provingState: BlockProvingState, txIndex: number) { + if (!provingState.verifyState()) { + this.logger.debug(`Not running VM circuit as state is no longer valid`); + return; + } + + const txProvingState = provingState.getTxProvingState(txIndex); + + this.deferredProving( + provingState, + this.wrapCircuitCall( + 'getAvmProof', + async (signal: AbortSignal) => { + const inputs = txProvingState.getAvmInputs(); + return await this.prover.getAvmProof(inputs, signal, provingState.epochNumber); + }, + { [Attributes.TX_HASH]: txProvingState.processedTx.hash.toString() }, + ), + proof => { + this.logger.debug(`Proven VM for tx index: ${txIndex}`); + txProvingState.setAvmProof(proof); + this.checkAndEnqueueBaseRollup(provingState, txIndex); + }, + ); + } + + private checkAndEnqueueBaseRollup(provingState: BlockProvingState, txIndex: number) { + const txProvingState = provingState.getTxProvingState(txIndex); + if (!txProvingState.ready()) { + return; + } + // All upstream proofs (chonk verifier and, if required, vm) are ready — proceed to the base rollup. + this.logger.debug(`Public functions completed for tx ${txIndex} enqueueing base rollup`); + this.enqueueBaseRollup(provingState, txIndex); + } + + // Flagged as protected so unit tests can override. 
+ protected async verifyBuiltBlockAgainstSyncedState(provingState: BlockProvingState) { + const builtBlockHeader = provingState.getBuiltBlockHeader(); + if (!builtBlockHeader) { + this.logger.debug('Block header not built yet, skipping header check.'); + return; + } + + const output = provingState.getBlockRootRollupOutput(); + if (!output) { + this.logger.debug('Block root rollup proof not built yet, skipping header check.'); + return; + } + + const newArchive = provingState.getBuiltArchive(); + if (!newArchive) { + this.logger.debug('Archive snapshot not yet captured, skipping header check.'); + return; + } + + const header = await buildHeaderFromCircuitOutputs(output); + + if (!(await header.hash()).equals(await builtBlockHeader.hash())) { + this.logger.error(`Block header mismatch.\nCircuit: ${inspect(header)}\nComputed: ${inspect(builtBlockHeader)}`); + provingState.reject(`Block header hash mismatch.`); + return; + } + + const blockNumber = provingState.blockNumber; + const syncedArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, this.dbProvider.getSnapshot(blockNumber)); + if (!syncedArchive.equals(newArchive)) { + this.logger.error( + `Archive tree mismatch for block ${blockNumber}: world state synced to ${inspect( + syncedArchive, + )} but built ${inspect(newArchive)}`, + ); + provingState.reject(`Archive tree mismatch.`); + return; + } + + const circuitArchive = output.newArchive; + if (!newArchive.equals(circuitArchive)) { + this.logger.error(`New archive mismatch.\nCircuit: ${output.newArchive}\nComputed: ${newArchive}`); + provingState.reject(`New archive mismatch.`); + return; + } + } + + private getDbForBlock(blockNumber: BlockNumber): MerkleTreeWriteOperations { + const db = this.dbs.get(blockNumber); + if (!db) { + throw new Error(`World state fork for block ${blockNumber} not found.`); + } + return db; + } + + /** + * Wraps a circuit call with a tracer span and circuit attributes. 
Replaces the + * `ProvingScheduler.wrapCircuitCall` indirection that used to live on the abstract base. + */ + private wrapCircuitCall( + circuitName: string, + fn: (signal: AbortSignal) => Promise, + attributes: Record = {}, + ): (signal: AbortSignal) => Promise { + return wrapCallbackInSpan( + this.tracer, + `CheckpointSubTreeOrchestrator.prover.${circuitName}`, + { [Attributes.PROTOCOL_CIRCUIT_NAME]: circuitName as CircuitName, ...attributes }, + fn, + ); + } } diff --git a/yarn-project/prover-client/src/orchestrator/chonk-cache.test.ts b/yarn-project/prover-client/src/orchestrator/chonk-cache.test.ts new file mode 100644 index 000000000000..6d3f3ff7078d --- /dev/null +++ b/yarn-project/prover-client/src/orchestrator/chonk-cache.test.ts @@ -0,0 +1,88 @@ +import { BlockNumber } from '@aztec/foundation/branded-types'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { L2Block } from '@aztec/stdlib/block'; + +import { ChonkCache, type ChonkVerifierProofResult } from './chonk-cache.js'; + +describe('ChonkCache', () => { + let cache: ChonkCache; + + const fakeProof = {} as ChonkVerifierProofResult; + + beforeEach(() => { + cache = new ChonkCache(); + }); + + afterEach(() => { + cache.stop(); + }); + + it('returns undefined from get when txHash is not registered', () => { + expect(cache.get('missing')).toBeUndefined(); + }); + + it('dedupes concurrent getOrCompute calls for the same hash', async () => { + let calls = 0; + const factory = () => { + calls++; + return Promise.resolve(fakeProof); + }; + const a = cache.getOrCompute('tx1', factory); + const b = cache.getOrCompute('tx1', factory); + expect(a).toBe(b); + expect(calls).toBe(1); + await expect(a).resolves.toBe(fakeProof); + }); + + it('exposes the cached promise via get', () => { + const promise = cache.getOrCompute('tx1', () => Promise.resolve(fakeProof)); + expect(cache.get('tx1')).toBe(promise); + }); + + it('evicts on rejection so a later caller can re-issue', async () => { + 
const failing = promiseWithResolvers(); + failing.promise.catch(() => {}); + const first = cache.getOrCompute('tx1', () => failing.promise); + failing.reject(new Error('boom')); + await expect(first).rejects.toThrow(/boom/); + expect(cache.get('tx1')).toBeUndefined(); + + const second = cache.getOrCompute('tx1', () => Promise.resolve(fakeProof)); + await expect(second).resolves.toBe(fakeProof); + }); + + it('releases entries for supplied blocks', async () => { + await cache.getOrCompute('tx-a', () => Promise.resolve(fakeProof)); + await cache.getOrCompute('tx-b', () => Promise.resolve(fakeProof)); + + const block = await L2Block.random(BlockNumber(1), { txsPerBlock: 1 }); + const txHash = block.body.txEffects[0].txHash.toString(); + await cache.getOrCompute(txHash, () => Promise.resolve(fakeProof)); + + expect(cache.get(txHash)).toBeDefined(); + cache.releaseForBlocks([block]); + expect(cache.get(txHash)).toBeUndefined(); + // Unrelated entries untouched. + expect(cache.get('tx-a')).toBeDefined(); + expect(cache.get('tx-b')).toBeDefined(); + }); + + it('aborts in-flight factories on stop', () => { + let captured: AbortSignal | undefined; + const handle = cache.getOrCompute('tx1', signal => { + captured = signal; + return new Promise(() => {}); + }); + handle.catch(() => {}); + expect(captured?.aborted).toBe(false); + cache.stop(); + expect(captured?.aborted).toBe(true); + }); + + it('rejects getOrCompute after stop', async () => { + cache.stop(); + const handle = cache.getOrCompute('tx1', () => Promise.resolve(fakeProof)); + handle.catch(() => {}); + await expect(handle).rejects.toThrow(/stopped/); + }); +}); diff --git a/yarn-project/prover-client/src/orchestrator/chonk-cache.ts b/yarn-project/prover-client/src/orchestrator/chonk-cache.ts new file mode 100644 index 000000000000..0d4fbde8b41e --- /dev/null +++ b/yarn-project/prover-client/src/orchestrator/chonk-cache.ts @@ -0,0 +1,99 @@ +import type { NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH } from 
'@aztec/constants'; +import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; +import type { L2Block } from '@aztec/stdlib/block'; +import type { PublicInputsAndRecursiveProof } from '@aztec/stdlib/interfaces/server'; +import type { PublicChonkVerifierPublicInputs } from '@aztec/stdlib/rollup'; + +/** Result of a chonk-verifier proof, cached by tx hash. */ +export type ChonkVerifierProofResult = PublicInputsAndRecursiveProof< + PublicChonkVerifierPublicInputs, + typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH +>; + +/** + * Shared cache of `getPublicChonkVerifierProof` results, keyed by tx hash. + * + * Owned by the prover-node and shared across every epoch / session: a tx remined into + * a different block (e.g. after a brief L1 reorg) reuses the already-computed chonk + * proof rather than redoing it. Entries are released via `releaseForBlocks` once the + * containing block is no longer interesting to the caller (e.g. its epoch's proof + * submission window has expired). + */ +export class ChonkCache { + private readonly cache = new Map>(); + private readonly pending = new Map(); + private readonly log: Logger; + private stopped = false; + + constructor(bindings?: LoggerBindings) { + this.log = createLogger('prover-client:chonk-cache', bindings); + } + + /** Returns the cached promise for `txHash`, or `undefined` if none is registered. */ + public get(txHash: string): Promise | undefined { + return this.cache.get(txHash); + } + + /** + * Atomic get-or-compute: returns the cached promise for `txHash`, or runs `factory` + * (passing an AbortSignal the cache controls) and caches its result. Concurrent + * callers for the same `txHash` share the same promise. + * + * Rejected promises are evicted so a future caller can retry. The factory's + * AbortSignal fires when the cache is stopped. 
+ */ + public getOrCompute( + txHash: string, + factory: (signal: AbortSignal) => Promise, + ): Promise { + if (this.stopped) { + return Promise.reject(new Error('ChonkCache is stopped')); + } + const existing = this.cache.get(txHash); + if (existing) { + return existing; + } + const controller = new AbortController(); + this.pending.set(txHash, controller); + this.log.debug(`Enqueueing chonk-verifier circuit`, { txHash }); + const promise = factory(controller.signal).finally(() => this.pending.delete(txHash)); + // Silently observe the rejection branch and evict so a retry is possible. + promise.catch(err => { + this.cache.delete(txHash); + this.log.debug(`Chonk-verifier proof failed; evicted from cache`, { txHash, error: `${err}` }); + }); + this.cache.set(txHash, promise); + return promise; + } + + /** Drops cache entries for every tx in the supplied blocks. */ + public releaseForBlocks(blocks: L2Block[]): void { + let released = 0; + for (const block of blocks) { + for (const txEffect of block.body.txEffects) { + if (this.cache.delete(txEffect.txHash.toString())) { + released++; + } + } + } + if (released > 0) { + this.log.debug(`Released ${released} chonk-verifier cache entries`, { + blockCount: blocks.length, + releasedCount: released, + }); + } + } + + /** Aborts every in-flight chonk-verifier job and clears the cache. 
*/ + public stop(): void { + if (this.stopped) { + return; + } + this.stopped = true; + for (const controller of this.pending.values()) { + controller.abort(); + } + this.pending.clear(); + this.cache.clear(); + } +} diff --git a/yarn-project/prover-client/src/orchestrator/epoch-proving-context.test.ts b/yarn-project/prover-client/src/orchestrator/epoch-proving-context.test.ts deleted file mode 100644 index a424d2ea5968..000000000000 --- a/yarn-project/prover-client/src/orchestrator/epoch-proving-context.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { promiseWithResolvers } from '@aztec/foundation/promise'; -import type { ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; -import { PublicChonkVerifierPrivateInputs } from '@aztec/stdlib/rollup'; - -import { type MockProxy, mock } from 'jest-mock-extended'; - -import { type ChonkVerifierProofResult, EpochProvingContext } from './epoch-proving-context.js'; - -describe('EpochProvingContext', () => { - let prover: MockProxy; - let context: EpochProvingContext; - - // We don't need a real proof object — assertions only check identity via - // `toHaveBeenCalledTimes` and the resolved promise. 
- const fakeProof = {} as ChonkVerifierProofResult; - const fakeInputs = {} as PublicChonkVerifierPrivateInputs; - - beforeEach(() => { - prover = mock(); - context = new EpochProvingContext(prover, EpochNumber(1)); - }); - - it('caches and dedupes concurrent enqueue calls for the same tx', async () => { - prover.getPublicChonkVerifierProof.mockResolvedValue(fakeProof); - - const a = context.enqueue('tx1', fakeInputs); - const b = context.enqueue('tx1', fakeInputs); - - expect(a).toBe(b); - expect(prover.getPublicChonkVerifierProof).toHaveBeenCalledTimes(1); - - await expect(a).resolves.toBe(fakeProof); - }); - - it('returns the cached promise from getCached after enqueue', () => { - prover.getPublicChonkVerifierProof.mockResolvedValue(fakeProof); - - const promise = context.enqueue('tx1', fakeInputs); - expect(context.getCached('tx1')).toBe(promise); - expect(context.getCached('tx-other')).toBeUndefined(); - }); - - it('self-cleans the cache on rejection so a subsequent enqueue can re-issue the proof', async () => { - // First call rejects; second call should re-enqueue and succeed. - const failResolvers = promiseWithResolvers(); - failResolvers.promise.catch(() => {}); - prover.getPublicChonkVerifierProof.mockReturnValueOnce(failResolvers.promise); - prover.getPublicChonkVerifierProof.mockResolvedValueOnce(fakeProof); - - const first = context.enqueue('tx1', fakeInputs); - failResolvers.reject(new Error('boom')); - await expect(first).rejects.toThrow(/boom/); - - // Cache should now be empty for tx1. 
- expect(context.getCached('tx1')).toBeUndefined(); - - const second = context.enqueue('tx1', fakeInputs); - expect(prover.getPublicChonkVerifierProof).toHaveBeenCalledTimes(2); - await expect(second).resolves.toBe(fakeProof); - }); - - it('aborts in-flight chonk-verifier jobs on stop', () => { - let capturedSignal: AbortSignal | undefined; - prover.getPublicChonkVerifierProof.mockImplementation((_inputs, signal) => { - capturedSignal = signal; - return new Promise(() => {}); - }); - - const promise = context.enqueue('tx1', fakeInputs); - promise.catch(() => {}); - - expect(capturedSignal?.aborted).toBe(false); - context.stop(); - expect(capturedSignal?.aborted).toBe(true); - }); - - it('rejects new enqueues after stop', async () => { - context.stop(); - const promise = context.enqueue('tx1', fakeInputs); - promise.catch(() => {}); - await expect(promise).rejects.toThrow(/stopped/); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/epoch-proving-context.ts b/yarn-project/prover-client/src/orchestrator/epoch-proving-context.ts deleted file mode 100644 index d2801afbd79d..000000000000 --- a/yarn-project/prover-client/src/orchestrator/epoch-proving-context.ts +++ /dev/null @@ -1,101 +0,0 @@ -import type { NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH } from '@aztec/constants'; -import type { EpochNumber } from '@aztec/foundation/branded-types'; -import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; -import type { PublicInputsAndRecursiveProof, ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; -import type { PublicChonkVerifierPrivateInputs, PublicChonkVerifierPublicInputs } from '@aztec/stdlib/rollup'; - -/** - * Result of a chonk-verifier proof, cached per tx hash on `EpochProvingContext`. 
- */ -export type ChonkVerifierProofResult = PublicInputsAndRecursiveProof< - PublicChonkVerifierPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH ->; - -/** - * Per-epoch state shared across every `CheckpointSubTreeOrchestrator` constructed for - * the same epoch. Owns the chonk-verifier proof cache so a tx whose checkpoint is - * reorged out and re-appears in a replacement checkpoint does not have to re-prove - * its chonk circuit. - * - * The context's chonk-verifier broker jobs are deliberately submitted **outside** the - * sub-tree's deferred-proving queue. The sub-tree's `cancel()` therefore does not abort - * them — by design, because their result is tx-scoped, not sub-tree-scoped, and a - * replacement sub-tree should be able to consume the cached proof. - * - * Callers (`EpochProvingJob`, or unit tests) construct one context per epoch and pass - * it into every sub-tree they create. `stop()` aborts every in-flight chonk job. - */ -export class EpochProvingContext { - private readonly cache = new Map>(); - /** Abort controllers for in-flight chonk jobs, keyed by tx hash. */ - private readonly pending = new Map(); - private readonly log: Logger; - private stopped = false; - - constructor( - private readonly prover: ServerCircuitProver, - public readonly epochNumber: EpochNumber, - bindings?: LoggerBindings, - ) { - this.log = createLogger('prover-client:epoch-proving-context', bindings); - } - - /** - * Returns the cached chonk-verifier proof promise for the given tx hash, or - * `undefined` if none has been enqueued yet. Non-mutating. - */ - public getCached(txHash: string): Promise | undefined { - return this.cache.get(txHash); - } - - /** - * Enqueues a chonk-verifier proof for the given tx hash, returning the promise (or - * the already-cached one if already enqueued). The promise resolves when the broker - * delivers the result; on rejection (including `stop()`), the cache entry is removed - * so a subsequent caller can re-enqueue. 
- */ - public enqueue(txHash: string, inputs: PublicChonkVerifierPrivateInputs): Promise { - if (this.stopped) { - return Promise.reject(new Error('EpochProvingContext is stopped')); - } - - const cached = this.cache.get(txHash); - if (cached) { - return cached; - } - - const controller = new AbortController(); - this.pending.set(txHash, controller); - this.log.debug(`Enqueueing chonk-verifier circuit`, { txHash, epochNumber: this.epochNumber }); - - const promise = this.prover - .getPublicChonkVerifierProof(inputs, controller.signal, this.epochNumber) - .finally(() => this.pending.delete(txHash)); - - // Self-clean on rejection so a future caller can re-enqueue. Mark the rejection - // path as observed to silence unhandled-rejection warnings when no consumer - // awaits the promise (e.g. when the only `.then` chain belonged to a cancelled - // sub-tree's tx-proving state). - promise.catch(err => { - this.cache.delete(txHash); - this.log.debug(`Chonk-verifier proof failed; evicted from cache`, { txHash, error: `${err}` }); - }); - - this.cache.set(txHash, promise); - return promise; - } - - /** - * Aborts every in-flight chonk-verifier broker job and clears the cache. Called by - * the owning `EpochProvingJob` when the job stops. 
- */ - public stop() { - this.stopped = true; - for (const controller of this.pending.values()) { - controller.abort(); - } - this.pending.clear(); - this.cache.clear(); - } -} diff --git a/yarn-project/prover-client/src/orchestrator/epoch-proving-state.ts b/yarn-project/prover-client/src/orchestrator/epoch-proving-state.ts deleted file mode 100644 index a551082873c6..000000000000 --- a/yarn-project/prover-client/src/orchestrator/epoch-proving-state.ts +++ /dev/null @@ -1,380 +0,0 @@ -import { BatchedBlob, BatchedBlobAccumulator, type FinalBlobBatchingChallenges } from '@aztec/blob-lib'; -import { - type ARCHIVE_HEIGHT, - type L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, - type NESTED_RECURSIVE_PROOF_LENGTH, - type NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - OUT_HASH_TREE_HEIGHT, -} from '@aztec/constants'; -import { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import type { Tuple } from '@aztec/foundation/serialize'; -import { - MerkleTreeCalculator, - type TreeNodeLocation, - UnbalancedTreeStore, - shaMerkleHash, -} from '@aztec/foundation/trees'; -import type { PublicInputsAndRecursiveProof } from '@aztec/stdlib/interfaces/server'; -import type { Proof } from '@aztec/stdlib/proofs'; -import { - CheckpointConstantData, - CheckpointMergeRollupPrivateInputs, - CheckpointPaddingRollupPrivateInputs, - CheckpointRollupPublicInputs, - PublicChonkVerifierPublicInputs, - RootRollupPrivateInputs, - type RootRollupPublicInputs, -} from '@aztec/stdlib/rollup'; -import { AppendOnlyTreeSnapshot, type MerkleTreeId } from '@aztec/stdlib/trees'; -import type { BlockHeader } from '@aztec/stdlib/tx'; - -import { toProofData } from './block-building-helpers.js'; -import type { ProofState } from './block-proving-state.js'; -import { CheckpointProvingState } from './checkpoint-proving-state.js'; - -export type TreeSnapshots = Map; - -enum PROVING_STATE_LIFECYCLE { - PROVING_STATE_CREATED, - 
PROVING_STATE_FULL, - PROVING_STATE_RESOLVED, - PROVING_STATE_REJECTED, -} - -export type ProvingResult = { status: 'success' } | { status: 'failure'; reason: string }; - -/** - * The current state of the proving schedule for an epoch. - * Contains the raw inputs and intermediate state to generate every constituent proof in the tree. - * Carries an identifier so we can identify if the proving state is discarded and a new one started. - * Captures resolve and reject callbacks to provide a promise base interface to the consumer of our proving. - */ -export class EpochProvingState { - private checkpointProofs: UnbalancedTreeStore< - ProofState - >; - private checkpointPaddingProof: - | ProofState - | undefined; - private rootRollupProof: ProofState | undefined; - private checkpoints: (CheckpointProvingState | undefined)[] = []; - private startBlobAccumulator: BatchedBlobAccumulator; - private endBlobAccumulator: BatchedBlobAccumulator | undefined; - private finalBatchedBlob: BatchedBlob | undefined; - private provingStateLifecycle = PROVING_STATE_LIFECYCLE.PROVING_STATE_CREATED; - - // Map from tx hash to chonk verifier proof promise. Used when kickstarting chonk verifier proofs before tx processing. 
- public readonly cachedChonkVerifierProofs = new Map< - string, - Promise< - PublicInputsAndRecursiveProof - > - >(); - - constructor( - public readonly epochNumber: EpochNumber, - public readonly totalNumCheckpoints: number, - private readonly finalBlobBatchingChallenges: FinalBlobBatchingChallenges, - private onCheckpointBlobAccumulatorSet: (checkpoint: CheckpointProvingState) => Promise, - private completionCallback: (result: ProvingResult) => void, - private rejectionCallback: (reason: string) => void, - ) { - this.checkpointProofs = new UnbalancedTreeStore(totalNumCheckpoints); - this.startBlobAccumulator = BatchedBlobAccumulator.newWithChallenges(finalBlobBatchingChallenges); - } - - // Adds a block to the proving state, returns its index - // Will update the proving life cycle if this is the last block - public startNewCheckpoint( - checkpointIndex: number, - constants: CheckpointConstantData, - totalNumBlocks: number, - previousBlockHeader: BlockHeader, - lastArchiveSiblingPath: Tuple, - l1ToL2Messages: Fr[], - lastL1ToL2MessageTreeSnapshot: AppendOnlyTreeSnapshot, - lastL1ToL2MessageSubtreeRootSiblingPath: Tuple, - newL1ToL2MessageTreeSnapshot: AppendOnlyTreeSnapshot, - newL1ToL2MessageSubtreeRootSiblingPath: Tuple, - ): CheckpointProvingState { - if (checkpointIndex >= this.totalNumCheckpoints) { - throw new Error( - `Unable to start a new checkpoint at index ${checkpointIndex}. 
Expected at most ${this.totalNumCheckpoints} checkpoints.`, - ); - } - - const checkpoint = new CheckpointProvingState( - checkpointIndex, - constants, - totalNumBlocks, - this.finalBlobBatchingChallenges, - previousBlockHeader, - lastArchiveSiblingPath, - l1ToL2Messages, - lastL1ToL2MessageTreeSnapshot, - lastL1ToL2MessageSubtreeRootSiblingPath, - newL1ToL2MessageTreeSnapshot, - newL1ToL2MessageSubtreeRootSiblingPath, - this, - this.onCheckpointBlobAccumulatorSet, - ); - this.checkpoints[checkpointIndex] = checkpoint; - - if (this.checkpoints.filter(c => !!c).length === this.totalNumCheckpoints) { - this.provingStateLifecycle = PROVING_STATE_LIFECYCLE.PROVING_STATE_FULL; - } - - return checkpoint; - } - - public getCheckpointProvingState(index: number) { - return this.checkpoints[index]; - } - - public getCheckpointProvingStateByBlockNumber(blockNumber: BlockNumber) { - return this.checkpoints.find( - c => - c && - Number(blockNumber) >= Number(c.firstBlockNumber) && - Number(blockNumber) < Number(c.firstBlockNumber) + c.totalNumBlocks, - ); - } - - public getBlockProvingStateByBlockNumber(blockNumber: BlockNumber) { - return this.getCheckpointProvingStateByBlockNumber(blockNumber)?.getBlockProvingStateByBlockNumber(blockNumber); - } - - // Returns true if this proving state is still valid, false otherwise - public verifyState() { - return ( - this.provingStateLifecycle === PROVING_STATE_LIFECYCLE.PROVING_STATE_CREATED || - this.provingStateLifecycle === PROVING_STATE_LIFECYCLE.PROVING_STATE_FULL - ); - } - - // Returns true if we are still able to accept checkpoints, false otherwise. 
- public isAcceptingCheckpoints() { - return this.checkpoints.filter(c => !!c).length < this.totalNumCheckpoints; - } - - public setCheckpointRootRollupProof( - checkpointIndex: number, - provingOutput: PublicInputsAndRecursiveProof< - CheckpointRollupPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ): TreeNodeLocation { - return this.checkpointProofs.setLeaf(checkpointIndex, { provingOutput }); - } - - public tryStartProvingCheckpointMerge(location: TreeNodeLocation) { - if (this.checkpointProofs.getNode(location)?.isProving) { - return false; - } else { - this.checkpointProofs.setNode(location, { isProving: true }); - return true; - } - } - - public setCheckpointMergeRollupProof( - location: TreeNodeLocation, - provingOutput: PublicInputsAndRecursiveProof< - CheckpointRollupPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ) { - this.checkpointProofs.setNode(location, { provingOutput }); - } - - public tryStartProvingRootRollup() { - if (this.rootRollupProof?.isProving) { - return false; - } else { - this.rootRollupProof = { isProving: true }; - return true; - } - } - - public setRootRollupProof(provingOutput: PublicInputsAndRecursiveProof) { - this.rootRollupProof = { provingOutput }; - } - - public tryStartProvingPaddingCheckpoint() { - if (this.checkpointPaddingProof?.isProving) { - return false; - } else { - this.checkpointPaddingProof = { isProving: true }; - return true; - } - } - - public setCheckpointPaddingProof( - provingOutput: PublicInputsAndRecursiveProof< - CheckpointRollupPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ) { - this.checkpointPaddingProof = { provingOutput }; - } - - public async accumulateCheckpointOutHashes() { - const treeCalculator = await MerkleTreeCalculator.create(OUT_HASH_TREE_HEIGHT, undefined, (left, right) => - Promise.resolve(shaMerkleHash(left, right)), - ); - - const computeOutHashHint = async (leaves: Fr[]) => { - const tree = await 
treeCalculator.computeTree(leaves.map(l => l.toBuffer())); - const nextAvailableLeafIndex = leaves.length; - return { - treeSnapshot: new AppendOnlyTreeSnapshot(Fr.fromBuffer(tree.root), nextAvailableLeafIndex), - siblingPath: tree.getSiblingPath(nextAvailableLeafIndex).map(Fr.fromBuffer) as Tuple< - Fr, - typeof OUT_HASH_TREE_HEIGHT - >, - }; - }; - - let hint = this.checkpoints[0]?.getOutHashHint(); - const outHashes = []; - for (let i = 0; i < this.totalNumCheckpoints; i++) { - const checkpoint = this.checkpoints[i]; - if (!checkpoint) { - break; - } - - // If hints are not set yet, it must be the first checkpoint. Compute the hints with an empty tree. - hint ??= await computeOutHashHint([]); - checkpoint.setOutHashHint(hint); - - // Get the out hash for this checkpoint. - const outHash = checkpoint.accumulateBlockOutHashes(); - if (!outHash) { - break; - } - outHashes.push(outHash); - - // If this is NOT the last checkpoint, get or create the hint for the next checkpoint. - if (i !== this.totalNumCheckpoints - 1) { - hint = checkpoint.getOutHashHintForNextCheckpoint() ?? (await computeOutHashHint(outHashes)); - checkpoint.setOutHashHintForNextCheckpoint(hint); - } - } - } - - public async setBlobAccumulators() { - let previousAccumulator = this.startBlobAccumulator; - // Accumulate blobs as far as we can for this epoch. - for (let i = 0; i < this.totalNumCheckpoints; i++) { - const checkpoint = this.checkpoints[i]; - if (!checkpoint) { - break; - } - - const endAccumulator = - checkpoint.getEndBlobAccumulator() || (await checkpoint.accumulateBlobs(previousAccumulator)); - if (!endAccumulator) { - break; - } - - previousAccumulator = endAccumulator; - - // If this is the last checkpoint, set the end blob accumulator. 
- if (i === this.totalNumCheckpoints - 1) { - this.endBlobAccumulator = endAccumulator; - } - } - } - - public async finalizeBatchedBlob() { - if (!this.endBlobAccumulator) { - throw new Error('End blob accumulator not ready.'); - } - this.finalBatchedBlob = await this.endBlobAccumulator.finalize(true /* verifyProof */); - } - - public getParentLocation(location: TreeNodeLocation) { - return this.checkpointProofs.getParentLocation(location); - } - - public getCheckpointMergeRollupInputs(mergeLocation: TreeNodeLocation) { - const [left, right] = this.checkpointProofs.getChildren(mergeLocation).map(c => c?.provingOutput); - if (!left || !right) { - throw new Error('At least one child is not ready for the checkpoint merge rollup.'); - } - - return new CheckpointMergeRollupPrivateInputs([toProofData(left), toProofData(right)]); - } - - public getRootRollupInputs() { - const [left, right] = this.#getChildProofsForRoot(); - if (!left || !right) { - throw new Error('At least one child is not ready for the root rollup.'); - } - - return RootRollupPrivateInputs.from({ - previousRollups: [toProofData(left), toProofData(right)], - }); - } - - public getPaddingCheckpointInputs() { - return new CheckpointPaddingRollupPrivateInputs(); - } - - public getEpochProofResult(): { proof: Proof; publicInputs: RootRollupPublicInputs; batchedBlobInputs: BatchedBlob } { - const provingOutput = this.rootRollupProof?.provingOutput; - - if (!provingOutput || !this.finalBatchedBlob) { - throw new Error('Unable to get epoch proof result. 
Root rollup is not ready.'); - } - - return { - proof: provingOutput.proof.binaryProof, - publicInputs: provingOutput.inputs, - batchedBlobInputs: this.finalBatchedBlob, - }; - } - - public isReadyForCheckpointMerge(location: TreeNodeLocation) { - return !!this.checkpointProofs.getSibling(location)?.provingOutput; - } - - // Returns true if we have sufficient inputs to execute the block root rollup - public isReadyForRootRollup() { - const childProofs = this.#getChildProofsForRoot(); - return childProofs.every(p => !!p); - } - - // Attempts to reject the proving state promise with a reason of 'cancelled' - public cancel() { - this.reject('Proving cancelled'); - } - - // Attempts to reject the proving state promise with the given reason - // Does nothing if not in a valid state - public reject(reason: string) { - if (!this.verifyState()) { - return; - } - this.provingStateLifecycle = PROVING_STATE_LIFECYCLE.PROVING_STATE_REJECTED; - this.rejectionCallback(reason); - } - - // Attempts to resolve the proving state promise with the given result - // Does nothing if not in a valid state - public resolve(result: ProvingResult) { - if (!this.verifyState()) { - return; - } - this.provingStateLifecycle = PROVING_STATE_LIFECYCLE.PROVING_STATE_RESOLVED; - this.completionCallback(result); - } - - #getChildProofsForRoot() { - const rootLocation = { level: 0, index: 0 }; - // If there's only 1 block, its block root proof will be stored at the root. - return this.totalNumCheckpoints === 1 - ? 
[this.checkpointProofs.getNode(rootLocation)?.provingOutput, this.checkpointPaddingProof?.provingOutput] - : this.checkpointProofs.getChildren(rootLocation).map(c => c?.provingOutput); - } -} diff --git a/yarn-project/prover-client/src/orchestrator/index.ts b/yarn-project/prover-client/src/orchestrator/index.ts index 34ddb02de8d2..9fe9f502a7be 100644 --- a/yarn-project/prover-client/src/orchestrator/index.ts +++ b/yarn-project/prover-client/src/orchestrator/index.ts @@ -1,6 +1,5 @@ -export { ProvingOrchestrator } from './orchestrator.js'; export { CheckpointSubTreeOrchestrator, type SubTreeResult } from './checkpoint-sub-tree-orchestrator.js'; -export { EpochProvingContext, type ChonkVerifierProofResult } from './epoch-proving-context.js'; +export { ChonkCache, type ChonkVerifierProofResult } from './chonk-cache.js'; export { TopTreeOrchestrator, TopTreeCancelledError, diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator.ts b/yarn-project/prover-client/src/orchestrator/orchestrator.ts deleted file mode 100644 index d413d3bba3c1..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator.ts +++ /dev/null @@ -1,1111 +0,0 @@ -import { BatchedBlob, FinalBlobBatchingChallenges, SpongeBlob } from '@aztec/blob-lib/types'; -import { - L1_TO_L2_MSG_SUBTREE_HEIGHT, - L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, - NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH, - NUM_BASE_PARITY_PER_ROOT_PARITY, -} from '@aztec/constants'; -import { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import type { LoggerBindings } from '@aztec/foundation/log'; -import { promiseWithResolvers } from '@aztec/foundation/promise'; -import { assertLength } from '@aztec/foundation/serialize'; -import { pushTestData } from '@aztec/foundation/testing'; -import { elapsed } from '@aztec/foundation/timer'; -import type { TreeNodeLocation } from '@aztec/foundation/trees'; -import { EthAddress } from 
'@aztec/stdlib/block'; -import type { - EpochProver, - ForkMerkleTreeOperations, - MerkleTreeWriteOperations, - PublicInputsAndRecursiveProof, - ReadonlyWorldStateAccess, - ServerCircuitProver, -} from '@aztec/stdlib/interfaces/server'; -import { appendL1ToL2MessagesToTree } from '@aztec/stdlib/messaging'; -import type { Proof } from '@aztec/stdlib/proofs'; -import { - type BaseRollupHints, - BlockRootEmptyTxFirstRollupPrivateInputs, - BlockRootFirstRollupPrivateInputs, - BlockRootSingleTxFirstRollupPrivateInputs, - BlockRootSingleTxRollupPrivateInputs, - CheckpointConstantData, - CheckpointRootSingleBlockRollupPrivateInputs, - PrivateTxBaseRollupPrivateInputs, - PublicChonkVerifierPrivateInputs, - PublicChonkVerifierPublicInputs, - RootRollupPublicInputs, -} from '@aztec/stdlib/rollup'; -import type { CircuitName } from '@aztec/stdlib/stats'; -import { type AppendOnlyTreeSnapshot, MerkleTreeId } from '@aztec/stdlib/trees'; -import type { BlockHeader, ProcessedTx, Tx } from '@aztec/stdlib/tx'; -import type { UInt64 } from '@aztec/stdlib/types'; -import { - Attributes, - type TelemetryClient, - type Tracer, - getTelemetryClient, - trackSpan, - wrapCallbackInSpan, -} from '@aztec/telemetry-client'; - -import { inspect } from 'util'; - -import { - buildHeaderFromCircuitOutputs, - getLastSiblingPath, - getPublicChonkVerifierPrivateInputsFromTx, - getRootTreeSiblingPath, - getSubtreeSiblingPath, - getTreeSnapshot, - insertSideEffectsAndBuildBaseRollupHints, - validatePartialState, - validateTx, -} from './block-building-helpers.js'; -import type { BlockProvingState } from './block-proving-state.js'; -import type { CheckpointProvingState } from './checkpoint-proving-state.js'; -import { EpochProvingState, type ProvingResult, type TreeSnapshots } from './epoch-proving-state.js'; -import { ProvingOrchestratorMetrics } from './orchestrator_metrics.js'; -import { TopTreeProvingScheduler } from './top-tree-proving-scheduler.js'; -import { TxProvingState } from 
'./tx-proving-state.js'; - -/** - * Implements an event driven proving scheduler to build the recursive proof tree. The idea being: - * 1. Transactions are provided to the scheduler post simulation. - * 2. Tree insertions are performed as required to generate transaction specific proofs - * 3. Those transaction specific proofs are generated in the necessary order accounting for dependencies - * 4. Once a transaction is proven, it will be incorporated into a merge proof - * 5. Merge proofs are produced at each level of the tree until the root proof is produced - * - * The proving implementation is determined by the provided prover. This could be for example a local prover or a remote prover pool. - */ - -/** - * The orchestrator, managing the flow of recursive proving operations required to build the rollup proof tree. - */ -export class ProvingOrchestrator extends TopTreeProvingScheduler implements EpochProver { - protected provingState: EpochProvingState | undefined = undefined; - - protected provingPromise: Promise | undefined = undefined; - private metrics: ProvingOrchestratorMetrics; - - private dbs: Map = new Map(); - - constructor( - private dbProvider: ReadonlyWorldStateAccess & ForkMerkleTreeOperations, - prover: ServerCircuitProver, - private readonly proverId: EthAddress, - private readonly cancelJobsOnStop: boolean = false, - enqueueConcurrency: number, - telemetryClient: TelemetryClient = getTelemetryClient(), - bindings?: LoggerBindings, - ) { - super(prover, enqueueConcurrency, 'prover-client:orchestrator', bindings); - this.metrics = new ProvingOrchestratorMetrics(telemetryClient, 'ProvingOrchestrator'); - } - - get tracer(): Tracer { - return this.metrics.tracer; - } - - public getProverId(): EthAddress { - return this.proverId; - } - - public getNumActiveForks() { - return this.dbs.size; - } - - protected override cancelInternal(): void { - this.cancel(); - } - - protected override wrapCircuitCall( - circuitName: string, - fn: (signal: AbortSignal) 
=> Promise, - ): (signal: AbortSignal) => Promise { - return wrapCallbackInSpan( - this.tracer, - `ProvingOrchestrator.prover.${circuitName}`, - { [Attributes.PROTOCOL_CIRCUIT_NAME]: circuitName as CircuitName }, - fn, - ); - } - - protected override onRootRollupComplete(state: EpochProvingState) { - state.resolve({ status: 'success' }); - } - - public startNewEpoch( - epochNumber: EpochNumber, - totalNumCheckpoints: number, - finalBlobBatchingChallenges: FinalBlobBatchingChallenges, - ) { - if (this.provingState?.verifyState()) { - throw new Error( - `Cannot start epoch ${epochNumber} when epoch ${this.provingState.epochNumber} is still being processed.`, - ); - } - - const { promise: _promise, resolve, reject } = promiseWithResolvers(); - const promise = _promise.catch((reason): ProvingResult => ({ status: 'failure', reason })); - this.logger.info(`Starting epoch ${epochNumber} with ${totalNumCheckpoints} checkpoints.`); - this.provingState = new EpochProvingState( - epochNumber, - totalNumCheckpoints, - finalBlobBatchingChallenges, - provingState => this.checkAndEnqueueCheckpointRootRollup(provingState), - resolve, - reject, - ); - this.provingPromise = promise; - } - - /** - * Starts a new checkpoint. - * @param checkpointIndex - The index of the checkpoint in the epoch. - * @param constants - The constants for this checkpoint. - * @param l1ToL2Messages - The set of L1 to L2 messages to be inserted at the beginning of this checkpoint. - * @param totalNumBlocks - The total number of blocks expected in the checkpoint (must be at least one). - * @param headerOfLastBlockInPreviousCheckpoint - The header of the last block in the previous checkpoint. - */ - public async startNewCheckpoint( - checkpointIndex: number, - constants: CheckpointConstantData, - l1ToL2Messages: Fr[], - totalNumBlocks: number, - headerOfLastBlockInPreviousCheckpoint: BlockHeader, - ) { - if (!this.provingState) { - throw new Error('Empty epoch proving state. 
Call startNewEpoch before starting a checkpoint.'); - } - - if (!this.provingState.isAcceptingCheckpoints()) { - throw new Error(`Epoch not accepting further checkpoints.`); - } - - // Fork world state at the end of the immediately previous block. - const lastBlockNumber = headerOfLastBlockInPreviousCheckpoint.globalVariables.blockNumber; - const db = await this.dbProvider.fork(lastBlockNumber); - - const firstBlockNumber = BlockNumber(lastBlockNumber + 1); - this.dbs.set(firstBlockNumber, db); - - // Get archive sibling path before any block in this checkpoint lands. - const lastArchiveSiblingPath = await getLastSiblingPath(MerkleTreeId.ARCHIVE, db); - - // Insert all the l1 to l2 messages into the db. And get the states before and after the insertion. - const { - lastL1ToL2MessageTreeSnapshot, - lastL1ToL2MessageSubtreeRootSiblingPath, - newL1ToL2MessageTreeSnapshot, - newL1ToL2MessageSubtreeRootSiblingPath, - } = await this.updateL1ToL2MessageTree(l1ToL2Messages, db); - - this.provingState.startNewCheckpoint( - checkpointIndex, - constants, - totalNumBlocks, - headerOfLastBlockInPreviousCheckpoint, - lastArchiveSiblingPath, - l1ToL2Messages, - lastL1ToL2MessageTreeSnapshot, - lastL1ToL2MessageSubtreeRootSiblingPath, - newL1ToL2MessageTreeSnapshot, - newL1ToL2MessageSubtreeRootSiblingPath, - ); - } - - /** - * Starts off a new block - * @param blockNumber - The block number - * @param timestamp - The timestamp of the block. This is only required for constructing the private inputs for the - * block that doesn't have any txs. - * @param totalNumTxs - The total number of txs in the block - */ - @trackSpan('ProvingOrchestrator.startNewBlock', blockNumber => ({ - [Attributes.BLOCK_NUMBER]: blockNumber, - })) - public async startNewBlock(blockNumber: BlockNumber, timestamp: UInt64, totalNumTxs: number) { - if (!this.provingState) { - throw new Error('Empty epoch proving state. 
Call startNewEpoch before starting a block.'); - } - - const checkpointProvingState = this.provingState.getCheckpointProvingStateByBlockNumber(blockNumber); - if (!checkpointProvingState) { - throw new Error(`Checkpoint not started. Call startNewCheckpoint first.`); - } - - if (!checkpointProvingState.isAcceptingBlocks()) { - throw new Error(`Checkpoint not accepting further blocks.`); - } - - const constants = checkpointProvingState.constants; - this.logger.info(`Starting block ${blockNumber} for slot ${constants.slotNumber}.`); - - // Fork the db only when it's not already set. The db for the first block is set in `startNewCheckpoint`. - if (!this.dbs.has(blockNumber)) { - // Fork world state at the end of the immediately previous block - const db = await this.dbProvider.fork(BlockNumber(blockNumber - 1)); - this.dbs.set(blockNumber, db); - } - const db = this.getDbForBlock(blockNumber); - - // Get archive snapshot and sibling path before any txs in this block lands. - const lastArchiveTreeSnapshot = await getTreeSnapshot(MerkleTreeId.ARCHIVE, db); - const lastArchiveSiblingPath = await getRootTreeSiblingPath(MerkleTreeId.ARCHIVE, db); - - const blockProvingState = checkpointProvingState.startNewBlock( - blockNumber, - timestamp, - totalNumTxs, - lastArchiveTreeSnapshot, - lastArchiveSiblingPath, - ); - - // Enqueue base parity circuits for the first block in the checkpoint. - if (blockProvingState.index === 0) { - for (let i = 0; i < NUM_BASE_PARITY_PER_ROOT_PARITY; i++) { - this.enqueueBaseParityCircuit(checkpointProvingState, blockProvingState, i); - } - } - - // Because `addTxs` won't be called for a block without txs, and that's where the sponge blob state is computed. - // We need to set its end sponge blob here, which will become the start sponge blob for the next block. 
- if (totalNumTxs === 0) { - const endState = await db.getStateReference(); - blockProvingState.setEndState(endState); - - const endSpongeBlob = blockProvingState.getStartSpongeBlob().clone(); - const blockEndBlobFields = blockProvingState.getBlockEndBlobFields(); - await endSpongeBlob.absorb(blockEndBlobFields); - blockProvingState.setEndSpongeBlob(endSpongeBlob); - - // Try to accumulate the out hashes and blobs as far as we can: - await this.provingState.accumulateCheckpointOutHashes(); - await this.provingState.setBlobAccumulators(); - } - } - - /** - * The interface to add simulated transactions to the scheduler. This can only be called once per block. - * @param txs - The transactions to be proven - */ - @trackSpan('ProvingOrchestrator.addTxs', txs => ({ - [Attributes.BLOCK_TXS_COUNT]: txs.length, - })) - public async addTxs(txs: ProcessedTx[]): Promise { - if (!this.provingState) { - throw new Error(`Empty epoch proving state. Call startNewEpoch before adding txs.`); - } - - if (!txs.length) { - // To avoid an ugly throw below. If we require an empty block, we can just call setBlockCompleted - // on a block with no txs. We cannot do that here because we cannot find the blockNumber without any txs. - this.logger.warn(`Provided no txs to orchestrator addTxs.`); - return; - } - - const blockNumber = BlockNumber(txs[0].globalVariables.blockNumber); - const provingState = this.provingState.getBlockProvingStateByBlockNumber(blockNumber!); - if (!provingState) { - throw new Error(`Proving state for block ${blockNumber} not found. Call startNewBlock first.`); - } - - if (provingState.totalNumTxs !== txs.length) { - throw new Error( - `Block ${blockNumber} should be filled with ${provingState.totalNumTxs} txs. 
Received ${txs.length} txs.`, - ); - } - - if (!provingState.isAcceptingTxs()) { - throw new Error(`Block ${blockNumber} has been initialized with transactions.`); - } - - this.logger.info(`Adding ${txs.length} transactions to block ${blockNumber}`); - - const db = this.getDbForBlock(blockNumber); - const lastArchive = provingState.lastArchiveTreeSnapshot; - const newL1ToL2MessageTreeSnapshot = provingState.newL1ToL2MessageTreeSnapshot; - const spongeBlobState = provingState.getStartSpongeBlob().clone(); - - for (const tx of txs) { - try { - if (!provingState.verifyState()) { - throw new Error(`Invalid proving state when adding a tx`); - } - - validateTx(tx); - - this.logger.debug(`Received transaction: ${tx.hash}`); - - const startSpongeBlob = spongeBlobState.clone(); - const [hints, treeSnapshots] = await this.prepareBaseRollupInputs( - tx, - lastArchive, - newL1ToL2MessageTreeSnapshot, - startSpongeBlob, - db, - ); - - if (!provingState.verifyState()) { - throw new Error(`Unable to add transaction, preparing base inputs failed`); - } - - await spongeBlobState.absorb(tx.txEffect.toBlobFields()); - - const txProvingState = new TxProvingState(tx, hints, treeSnapshots, this.proverId.toField()); - const txIndex = provingState.addNewTx(txProvingState); - if (txProvingState.requireAvmProof) { - this.getOrEnqueueChonkVerifier(provingState, txIndex); - this.logger.debug(`Enqueueing public VM for tx ${txIndex}`); - this.enqueueVM(provingState, txIndex); - } else { - this.logger.debug(`Enqueueing base rollup for private-only tx ${txIndex}`); - this.enqueueBaseRollup(provingState, txIndex); - } - } catch (err: any) { - throw new Error(`Error adding transaction ${tx.hash.toString()} to block ${blockNumber}: ${err.message}`, { - cause: err, - }); - } - } - - const endState = await db.getStateReference(); - provingState.setEndState(endState); - - const blockEndBlobFields = provingState.getBlockEndBlobFields(); - await spongeBlobState.absorb(blockEndBlobFields); - - 
provingState.setEndSpongeBlob(spongeBlobState); - - // Txs have been added to the block. Now try to accumulate the out hashes and blobs as far as we can: - await this.provingState.accumulateCheckpointOutHashes(); - await this.provingState.setBlobAccumulators(); - } - - /** - * Kickstarts chonk verifier circuits for the specified txs. These will be used during epoch proving. - * Note that if the chonk verifier circuits are not started this way, they will be started nontheless after processing. - */ - @trackSpan('ProvingOrchestrator.startChonkVerifierCircuits') - public startChonkVerifierCircuits(txs: Tx[]) { - if (!this.provingState?.verifyState()) { - throw new Error(`Empty epoch proving state. call startNewEpoch before starting chonk verifier circuits.`); - } - const publicTxs = txs.filter(tx => tx.data.forPublic); - for (const tx of publicTxs) { - const txHash = tx.getTxHash().toString(); - const privateInputs = getPublicChonkVerifierPrivateInputsFromTx(tx, this.proverId.toField()); - const tubeProof = - promiseWithResolvers< - PublicInputsAndRecursiveProof< - PublicChonkVerifierPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - > - >(); - this.logger.debug(`Starting chonk verifier circuit for tx ${txHash}`); - this.doEnqueueChonkVerifier(txHash, privateInputs, proof => { - tubeProof.resolve(proof); - }); - this.provingState.cachedChonkVerifierProofs.set(txHash, tubeProof.promise); - } - return Promise.resolve(); - } - - /** - * Marks the block as completed. - * Computes the block header and updates the archive tree. 
- */ - @trackSpan('ProvingOrchestrator.setBlockCompleted', (blockNumber: BlockNumber) => ({ - [Attributes.BLOCK_NUMBER]: blockNumber, - })) - public async setBlockCompleted(blockNumber: BlockNumber, expectedHeader?: BlockHeader): Promise { - const provingState = this.provingState?.getBlockProvingStateByBlockNumber(blockNumber); - if (!provingState) { - throw new Error(`Block proving state for ${blockNumber} not found`); - } - - // Abort with specific error for the block if there's one. - const error = provingState.getError(); - if (error) { - throw new Error(`Block proving failed: ${error}`); - } - - // Abort if the proving state is not valid due to errors occurred elsewhere. - if (!provingState.verifyState()) { - throw new Error(`Invalid proving state when completing block ${blockNumber}.`); - } - - if (provingState.isAcceptingTxs()) { - throw new Error( - `Block ${blockNumber} is still accepting txs. Call setBlockCompleted after all txs have been added.`, - ); - } - - // Given we've applied every change from this block, now assemble the block header: - this.logger.verbose(`Block ${blockNumber} completed. Assembling header.`); - const header = await provingState.buildBlockHeader(); - - if (expectedHeader && !header.equals(expectedHeader)) { - this.logger.error(`Block header mismatch: header=${header} expectedHeader=${expectedHeader}`); - throw new Error('Block header mismatch'); - } - - // Get db for this block and remove from map — no other code should use it after this point. - const db = this.getDbForBlock(provingState.blockNumber); - this.dbs.delete(provingState.blockNumber); - - // Update the archive tree, capture the snapshot, and close the fork deterministically. 
- try { - this.logger.verbose( - `Updating archive tree with block ${provingState.blockNumber} header ${(await header.hash()).toString()}`, - ); - await db.updateArchive(header); - provingState.setBuiltArchive(await getTreeSnapshot(MerkleTreeId.ARCHIVE, db)); - } finally { - await db.close(); - } - - await this.verifyBuiltBlockAgainstSyncedState(provingState); - - return header; - } - - // Flagged as protected to disable in certain unit tests - protected async verifyBuiltBlockAgainstSyncedState(provingState: BlockProvingState) { - const builtBlockHeader = provingState.getBuiltBlockHeader(); - if (!builtBlockHeader) { - this.logger.debug('Block header not built yet, skipping header check.'); - return; - } - - const output = provingState.getBlockRootRollupOutput(); - if (!output) { - this.logger.debug('Block root rollup proof not built yet, skipping header check.'); - return; - } - - const newArchive = provingState.getBuiltArchive(); - if (!newArchive) { - this.logger.debug('Archive snapshot not yet captured, skipping header check.'); - return; - } - - const header = await buildHeaderFromCircuitOutputs(output); - - if (!(await header.hash()).equals(await builtBlockHeader.hash())) { - this.logger.error(`Block header mismatch.\nCircuit: ${inspect(header)}\nComputed: ${inspect(builtBlockHeader)}`); - provingState.reject(`Block header hash mismatch.`); - return; - } - - const blockNumber = provingState.blockNumber; - const syncedArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, this.dbProvider.getSnapshot(blockNumber)); - if (!syncedArchive.equals(newArchive)) { - this.logger.error( - `Archive tree mismatch for block ${blockNumber}: world state synced to ${inspect( - syncedArchive, - )} but built ${inspect(newArchive)}`, - ); - provingState.reject(`Archive tree mismatch.`); - return; - } - - const circuitArchive = output.newArchive; - if (!newArchive.equals(circuitArchive)) { - this.logger.error(`New archive mismatch.\nCircuit: ${output.newArchive}\nComputed: 
${newArchive}`); - provingState.reject(`New archive mismatch.`); - return; - } - } - - /** - * Cancel any further proving. - * If cancelJobsOnStop is true, aborts all pending jobs with the broker (which marks them as 'Aborted'). - * If cancelJobsOnStop is false (default), jobs remain in the broker queue and can be reused on restart/reorg. - */ - public cancel() { - this.resetSchedulerState(this.cancelJobsOnStop); - - this.provingState?.cancel(); - - for (const [blockNumber, db] of this.dbs.entries()) { - void db.close().catch(err => this.logger.error(`Error closing db for block ${blockNumber}`, err)); - } - this.dbs.clear(); - } - - private getDbForBlock(blockNumber: BlockNumber): MerkleTreeWriteOperations { - const db = this.dbs.get(blockNumber); - if (!db) { - throw new Error(`World state fork for block ${blockNumber} not found.`); - } - return db; - } - - /** - * Returns the proof for the current epoch. - */ - public async finalizeEpoch(): Promise<{ - publicInputs: RootRollupPublicInputs; - proof: Proof; - batchedBlobInputs: BatchedBlob; - }> { - if (!this.provingState || !this.provingPromise) { - throw new Error(`Invalid proving state, an epoch must be proven before it can be finalized`); - } - - const result = await this.provingPromise!; - if (result.status === 'failure') { - throw new Error(`Epoch proving failed: ${result.reason}`); - } - - await this.provingState.finalizeBatchedBlob(); - - const epochProofResult = this.provingState.getEpochProofResult(); - - pushTestData('epochProofResult', { - proof: epochProofResult.proof.toString(), - publicInputs: epochProofResult.publicInputs.toString(), - }); - - return epochProofResult; - } - - private async updateL1ToL2MessageTree(l1ToL2Messages: Fr[], db: MerkleTreeWriteOperations) { - const lastL1ToL2MessageTreeSnapshot = await getTreeSnapshot(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, db); - const lastL1ToL2MessageSubtreeRootSiblingPath = assertLength( - await getSubtreeSiblingPath(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, 
L1_TO_L2_MSG_SUBTREE_HEIGHT, db), - L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, - ); - - // Update the local trees to include the new l1 to l2 messages - await appendL1ToL2MessagesToTree(db, l1ToL2Messages); - - const newL1ToL2MessageTreeSnapshot = await getTreeSnapshot(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, db); - const newL1ToL2MessageSubtreeRootSiblingPath = assertLength( - await getSubtreeSiblingPath(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, L1_TO_L2_MSG_SUBTREE_HEIGHT, db), - L1_TO_L2_MSG_SUBTREE_ROOT_SIBLING_PATH_LENGTH, - ); - - return { - lastL1ToL2MessageTreeSnapshot, - lastL1ToL2MessageSubtreeRootSiblingPath, - newL1ToL2MessageTreeSnapshot, - newL1ToL2MessageSubtreeRootSiblingPath, - }; - } - - // Updates the merkle trees for a transaction. The first enqueued job for a transaction - @trackSpan('ProvingOrchestrator.prepareBaseRollupInputs', tx => ({ - [Attributes.TX_HASH]: tx.hash.toString(), - })) - private async prepareBaseRollupInputs( - tx: ProcessedTx, - lastArchive: AppendOnlyTreeSnapshot, - newL1ToL2MessageTreeSnapshot: AppendOnlyTreeSnapshot, - startSpongeBlob: SpongeBlob, - db: MerkleTreeWriteOperations, - ): Promise<[BaseRollupHints, TreeSnapshots]> { - // We build the base rollup inputs using a mock proof and verification key. 
- // These will be overwritten later once we have proven the chonk verifier circuit and any public kernels - const [ms, hints] = await elapsed( - insertSideEffectsAndBuildBaseRollupHints( - tx, - lastArchive, - newL1ToL2MessageTreeSnapshot, - startSpongeBlob, - this.proverId.toField(), - db, - ), - ); - - this.metrics.recordBaseRollupInputs(ms); - - const promises = [MerkleTreeId.NOTE_HASH_TREE, MerkleTreeId.NULLIFIER_TREE, MerkleTreeId.PUBLIC_DATA_TREE].map( - async (id: MerkleTreeId) => { - return { key: id, value: await getTreeSnapshot(id, db) }; - }, - ); - const treeSnapshots: TreeSnapshots = new Map((await Promise.all(promises)).map(obj => [obj.key, obj.value])); - - return [hints, treeSnapshots]; - } - - // Executes the base rollup circuit and stored the output as intermediate state for the parent merge/root circuit - // Executes the next level of merge if all inputs are available - private enqueueBaseRollup(provingState: BlockProvingState, txIndex: number) { - if (!provingState.verifyState()) { - this.logger.debug('Not running base rollup, state invalid'); - return; - } - - if (!provingState.tryStartProvingBase(txIndex)) { - this.logger.debug(`Base rollup for tx ${txIndex} already started.`); - return; - } - - const txProvingState = provingState.getTxProvingState(txIndex); - const { processedTx } = txProvingState; - const { rollupType, inputs } = txProvingState.getBaseRollupTypeAndInputs(); - - this.logger.debug(`Enqueuing deferred proving base rollup for ${processedTx.hash.toString()}`); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - `ProvingOrchestrator.prover.${ - inputs instanceof PrivateTxBaseRollupPrivateInputs - ? 
'getPrivateTxBaseRollupProof' - : 'getPublicTxBaseRollupProof' - }`, - { - [Attributes.TX_HASH]: processedTx.hash.toString(), - [Attributes.PROTOCOL_CIRCUIT_NAME]: rollupType, - }, - signal => { - if (inputs instanceof PrivateTxBaseRollupPrivateInputs) { - return this.prover.getPrivateTxBaseRollupProof(inputs, signal, provingState.epochNumber); - } else { - return this.prover.getPublicTxBaseRollupProof(inputs, signal, provingState.epochNumber); - } - }, - ), - result => { - this.logger.debug(`Completed proof for ${rollupType} for tx ${processedTx.hash.toString()}`); - validatePartialState(result.inputs.endTreeSnapshots, txProvingState.treeSnapshots); - const leafLocation = provingState.setBaseRollupProof(txIndex, result); - if (provingState.totalNumTxs === 1) { - this.checkAndEnqueueBlockRootRollup(provingState); - } else { - this.checkAndEnqueueNextMergeRollup(provingState, leafLocation); - } - }, - ); - } - - // Enqueues the public chonk verifier circuit for a given transaction index, or reuses the one already enqueued. - // Once completed, will enqueue the the public tx base rollup. 
- protected getOrEnqueueChonkVerifier(provingState: BlockProvingState, txIndex: number) { - if (!provingState.verifyState()) { - this.logger.debug('Not running chonk verifier circuit, state invalid'); - return; - } - - const txProvingState = provingState.getTxProvingState(txIndex); - const txHash = txProvingState.processedTx.hash.toString(); - const handleResult = ( - result: PublicInputsAndRecursiveProof< - PublicChonkVerifierPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ) => { - this.logger.debug(`Got chonk verifier proof for tx index: ${txIndex}`, { txHash }); - txProvingState.setPublicChonkVerifierProof(result); - this.provingState?.cachedChonkVerifierProofs.delete(txHash); - this.checkAndEnqueueBaseRollup(provingState, txIndex); - }; - - if (this.provingState?.cachedChonkVerifierProofs.has(txHash)) { - this.logger.debug(`Chonk verifier proof already enqueued for tx index: ${txIndex}`, { txHash }); - void this.provingState!.cachedChonkVerifierProofs.get(txHash)!.then(handleResult); - return; - } - - this.logger.debug(`Enqueuing chonk verifier circuit for tx index: ${txIndex}`); - this.doEnqueueChonkVerifier(txHash, txProvingState.getPublicChonkVerifierPrivateInputs(), handleResult); - } - - private doEnqueueChonkVerifier( - txHash: string, - inputs: PublicChonkVerifierPrivateInputs, - handler: ( - result: PublicInputsAndRecursiveProof< - PublicChonkVerifierPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH - >, - ) => void, - provingState: EpochProvingState | BlockProvingState = this.provingState!, - ) { - if (!provingState.verifyState()) { - this.logger.debug('Not running chonk verifier circuit, state invalid'); - return; - } - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getPublicChonkVerifierProof', - { - [Attributes.TX_HASH]: txHash, - [Attributes.PROTOCOL_CIRCUIT_NAME]: 'chonk-verifier-public' satisfies CircuitName, - }, - signal => 
this.prover.getPublicChonkVerifierProof(inputs, signal, provingState.epochNumber), - ), - handler, - ); - } - - // Executes the merge rollup circuit and stored the output as intermediate state for the parent merge/block root circuit - // Enqueues the next level of merge if all inputs are available - private enqueueMergeRollup(provingState: BlockProvingState, location: TreeNodeLocation) { - if (!provingState.verifyState()) { - this.logger.debug('Not running merge rollup. State no longer valid.'); - return; - } - - if (!provingState.tryStartProvingMerge(location)) { - this.logger.debug('Merge rollup already started.'); - return; - } - - const inputs = provingState.getMergeRollupInputs(location); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getTxMergeRollupProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: 'rollup-tx-merge' satisfies CircuitName, - }, - signal => this.prover.getTxMergeRollupProof(inputs, signal, provingState.epochNumber), - ), - result => { - provingState.setMergeRollupProof(location, result); - this.checkAndEnqueueNextMergeRollup(provingState, location); - }, - ); - } - - // Executes the block root rollup circuit - private enqueueBlockRootRollup(provingState: BlockProvingState) { - if (!provingState.verifyState()) { - this.logger.debug('Not running block root rollup, state no longer valid'); - return; - } - - if (!provingState.tryStartProvingBlockRoot()) { - this.logger.debug('Block root rollup already started.'); - return; - } - - const { rollupType, inputs } = provingState.getBlockRootRollupTypeAndInputs(); - - this.logger.debug(`Enqueuing ${rollupType} for block ${provingState.blockNumber}.`); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getBlockRootRollupProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: rollupType, - }, - signal => { - if (inputs instanceof BlockRootFirstRollupPrivateInputs) { - return 
this.prover.getBlockRootFirstRollupProof(inputs, signal, provingState.epochNumber); - } else if (inputs instanceof BlockRootSingleTxFirstRollupPrivateInputs) { - return this.prover.getBlockRootSingleTxFirstRollupProof(inputs, signal, provingState.epochNumber); - } else if (inputs instanceof BlockRootEmptyTxFirstRollupPrivateInputs) { - return this.prover.getBlockRootEmptyTxFirstRollupProof(inputs, signal, provingState.epochNumber); - } else if (inputs instanceof BlockRootSingleTxRollupPrivateInputs) { - return this.prover.getBlockRootSingleTxRollupProof(inputs, signal, provingState.epochNumber); - } else { - return this.prover.getBlockRootRollupProof(inputs, signal, provingState.epochNumber); - } - }, - ), - async result => { - this.logger.debug(`Completed ${rollupType} proof for block ${provingState.blockNumber}`, { - blockNumber: provingState.blockNumber, - checkpointIndex: provingState.parentCheckpoint.index, - ...result.inputs.toInspect(), - }); - - const leafLocation = provingState.setBlockRootRollupProof(result); - const checkpointProvingState = provingState.parentCheckpoint; - - // Verification is called from both here and setBlockCompleted. Whichever runs last - // will be the first to see all three pieces (header, proof output, archive) and run the checks. - await this.verifyBuiltBlockAgainstSyncedState(provingState); - - if (checkpointProvingState.totalNumBlocks === 1) { - await this.checkAndEnqueueCheckpointRootRollup(checkpointProvingState); - } else { - await this.checkAndEnqueueNextBlockMergeRollup(checkpointProvingState, leafLocation); - } - }, - ); - } - - // Executes the base parity circuit and stores the intermediate state for the root parity circuit - // Enqueues the root parity circuit if all inputs are available - private enqueueBaseParityCircuit( - checkpointProvingState: CheckpointProvingState, - provingState: BlockProvingState, - baseParityIndex: number, - ) { - if (!provingState.verifyState()) { - this.logger.debug('Not running base parity. 
State no longer valid.'); - return; - } - - if (!provingState.tryStartProvingBaseParity(baseParityIndex)) { - this.logger.warn(`Base parity ${baseParityIndex} already started.`); - return; - } - - const inputs = checkpointProvingState.getBaseParityInputs(baseParityIndex); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getBaseParityProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: 'parity-base' satisfies CircuitName, - }, - signal => this.prover.getBaseParityProof(inputs, signal, provingState.epochNumber), - ), - provingOutput => { - provingState.setBaseParityProof(baseParityIndex, provingOutput); - this.checkAndEnqueueRootParityCircuit(provingState); - }, - ); - } - - private checkAndEnqueueRootParityCircuit(provingState: BlockProvingState) { - if (!provingState.isReadyForRootParity()) { - return; - } - - this.enqueueRootParityCircuit(provingState); - } - - // Runs the root parity circuit ans stored the outputs - // Enqueues the root rollup proof if all inputs are available - private enqueueRootParityCircuit(provingState: BlockProvingState) { - if (!provingState.verifyState()) { - this.logger.debug('Not running root parity. 
State no longer valid.'); - return; - } - - if (!provingState.tryStartProvingRootParity()) { - this.logger.debug('Root parity already started.'); - return; - } - - const inputs = provingState.getParityRootInputs(); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getRootParityProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: 'parity-root' satisfies CircuitName, - }, - signal => this.prover.getRootParityProof(inputs, signal, provingState.epochNumber), - ), - result => { - provingState.setRootParityProof(result); - this.checkAndEnqueueBlockRootRollup(provingState); - }, - ); - } - - // Executes the block merge rollup circuit and stored the output as intermediate state for the parent merge/block root circuit - // Enqueues the next level of merge if all inputs are available - private enqueueBlockMergeRollup(provingState: CheckpointProvingState, location: TreeNodeLocation) { - if (!provingState.verifyState()) { - this.logger.debug('Not running block merge rollup. 
State no longer valid.'); - return; - } - - if (!provingState.tryStartProvingBlockMerge(location)) { - this.logger.debug('Block merge rollup already started.'); - return; - } - - const inputs = provingState.getBlockMergeRollupInputs(location); - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getBlockMergeRollupProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: 'rollup-block-merge' satisfies CircuitName, - }, - signal => this.prover.getBlockMergeRollupProof(inputs, signal, provingState.epochNumber), - ), - async result => { - this.logger.debug(`Completed block merge rollup proof for checkpoint ${provingState.index}`, { - checkpointIndex: provingState.index, - mergeLocation: location, - ...result.inputs.toInspect(), - }); - provingState.setBlockMergeRollupProof(location, result); - await this.checkAndEnqueueNextBlockMergeRollup(provingState, location); - }, - ); - } - - private async enqueueCheckpointRootRollup(provingState: CheckpointProvingState) { - if (!provingState.verifyState()) { - this.logger.debug('Not running checkpoint root rollup. 
State no longer valid.'); - return; - } - - if (!provingState.tryStartProvingCheckpointRoot()) { - this.logger.debug('Checkpoint root rollup already started.'); - return; - } - - const rollupType = provingState.getCheckpointRootRollupType(); - - this.logger.debug(`Enqueuing ${rollupType} for checkpoint ${provingState.index}.`); - - const inputs = await provingState.getCheckpointRootRollupInputs(); - - this.deferredProving( - provingState, - wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getCheckpointRootRollupProof', - { - [Attributes.PROTOCOL_CIRCUIT_NAME]: rollupType, - }, - signal => { - if (inputs instanceof CheckpointRootSingleBlockRollupPrivateInputs) { - return this.prover.getCheckpointRootSingleBlockRollupProof(inputs, signal, provingState.epochNumber); - } else { - return this.prover.getCheckpointRootRollupProof(inputs, signal, provingState.epochNumber); - } - }, - ), - result => { - const computedEndBlobAccumulatorState = provingState.getEndBlobAccumulator()!.toBlobAccumulator(); - const circuitEndBlobAccumulatorState = result.inputs.endBlobAccumulator; - if (!circuitEndBlobAccumulatorState.equals(computedEndBlobAccumulatorState)) { - this.logger.error( - `Blob accumulator state mismatch.\nCircuit: ${inspect(circuitEndBlobAccumulatorState)}\nComputed: ${inspect( - computedEndBlobAccumulatorState, - )}`, - ); - provingState.reject(`Blob accumulator state mismatch.`); - return; - } - - this.logger.debug(`Completed ${rollupType} proof for checkpoint ${provingState.index}`, { - checkpointIndex: provingState.index, - ...result.inputs.toInspect(), - }); - - const leafLocation = provingState.setCheckpointRootRollupProof(result); - const epochProvingState = provingState.parentEpoch; - - if (epochProvingState.totalNumCheckpoints === 1) { - this.enqueueEpochPadding(epochProvingState); - } else { - this.checkAndEnqueueNextCheckpointMergeRollup(epochProvingState, leafLocation); - } - }, - ); - } - - private 
checkAndEnqueueNextMergeRollup(provingState: BlockProvingState, currentLocation: TreeNodeLocation) { - if (!provingState.isReadyForMergeRollup(currentLocation)) { - return; - } - - const parentLocation = provingState.getParentLocation(currentLocation); - if (parentLocation.level === 0) { - this.checkAndEnqueueBlockRootRollup(provingState); - } else { - this.enqueueMergeRollup(provingState, parentLocation); - } - } - - private checkAndEnqueueBlockRootRollup(provingState: BlockProvingState) { - if (!provingState.isReadyForBlockRootRollup()) { - this.logger.debug('Not ready for block root rollup'); - return; - } - - this.enqueueBlockRootRollup(provingState); - } - - private async checkAndEnqueueNextBlockMergeRollup( - provingState: CheckpointProvingState, - currentLocation: TreeNodeLocation, - ) { - if (!provingState.isReadyForBlockMerge(currentLocation)) { - return; - } - - const parentLocation = provingState.getParentLocation(currentLocation); - if (parentLocation.level === 0) { - await this.checkAndEnqueueCheckpointRootRollup(provingState); - } else { - this.enqueueBlockMergeRollup(provingState, parentLocation); - } - } - - protected async checkAndEnqueueCheckpointRootRollup(provingState: CheckpointProvingState) { - if (!provingState.isReadyForCheckpointRoot()) { - return; - } - - await this.enqueueCheckpointRootRollup(provingState); - } - - /** - * Executes the VM circuit for a public function, will enqueue the corresponding kernel if the - * previous kernel is ready - * @param provingState - The proving state being operated on - * @param txIndex - The index of the transaction being proven - */ - private enqueueVM(provingState: BlockProvingState, txIndex: number) { - if (!provingState.verifyState()) { - this.logger.debug(`Not running VM circuit as state is no longer valid`); - return; - } - - const txProvingState = provingState.getTxProvingState(txIndex); - - const doAvmProving = wrapCallbackInSpan( - this.tracer, - 'ProvingOrchestrator.prover.getAvmProof', - { - 
[Attributes.TX_HASH]: txProvingState.processedTx.hash.toString(), - }, - async (signal: AbortSignal) => { - const inputs = txProvingState.getAvmInputs(); - return await this.prover.getAvmProof(inputs, signal, provingState.epochNumber); - }, - ); - - this.deferredProving(provingState, doAvmProving, proof => { - this.logger.debug(`Proven VM for tx index: ${txIndex}`); - txProvingState.setAvmProof(proof); - this.checkAndEnqueueBaseRollup(provingState, txIndex); - }); - } - - protected checkAndEnqueueBaseRollup(provingState: BlockProvingState, txIndex: number) { - const txProvingState = provingState.getTxProvingState(txIndex); - if (!txProvingState.ready()) { - return; - } - - // We must have completed all proving (chonk verifier proof and (if required) vm proof are generated), we now move to the base rollup. - this.logger.debug(`Public functions completed for tx ${txIndex} enqueueing base rollup`); - - this.enqueueBaseRollup(provingState, txIndex); - } -} diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_errors.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_errors.test.ts deleted file mode 100644 index 3d148f6beca7..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_errors.test.ts +++ /dev/null @@ -1,172 +0,0 @@ -import type { FinalBlobBatchingChallenges } from '@aztec/blob-lib/types'; -import { NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP } from '@aztec/constants'; -import { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import { createLogger } from '@aztec/foundation/log'; -import type { CheckpointConstantData } from '@aztec/stdlib/rollup'; -import type { BlockHeader, ProcessedTx } from '@aztec/stdlib/tx'; - -import { TestContext } from '../mocks/test_context.js'; -import type { ProvingOrchestrator } from './orchestrator.js'; - -const logger = createLogger('prover-client:test:orchestrator-errors'); - 
-describe('prover/orchestrator/errors', () => { - let context: TestContext; - let orchestrator: ProvingOrchestrator; - let constants: CheckpointConstantData; - let block: { header: BlockHeader; txs: ProcessedTx[] }; - let previousBlockHeader: BlockHeader; - let finalBlobChallenges: FinalBlobBatchingChallenges; - const numBlocks = 1; - - beforeEach(async () => { - context = await TestContext.new(logger); - orchestrator = context.orchestrator; - ({ - constants, - blocks: [block], - previousBlockHeader, - } = await context.makeCheckpoint(numBlocks, { numTxsPerBlock: 1 })); - finalBlobChallenges = await context.getFinalBlobChallenges(); - }); - - afterEach(async () => { - await context.cleanup(); - }); - - afterAll(async () => {}); - - describe('errors', () => { - it('throws if adding too many transactions', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1 /* numCheckpoints */, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], // l1ToL2Messages - numBlocks, - previousBlockHeader, - ); - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await orchestrator.addTxs(block.txs); - - await expect(async () => await orchestrator.addTxs(block.txs)).rejects.toThrow( - `Block ${blockNumber} has been initialized with transactions.`, - ); - }); - - it('throws if adding too many blocks', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], // l1ToL2Messages - numBlocks, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await orchestrator.addTxs(block.txs); - await orchestrator.setBlockCompleted(blockNumber); - - await expect( - async () => await 
orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length), - ).rejects.toThrow('Checkpoint not accepting further blocks'); - }); - - it('throws if adding empty block as non-first block', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], // l1ToL2Messages - 2, // numBlocks - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await orchestrator.addTxs(block.txs); - - await expect( - async () => await orchestrator.startNewBlock(BlockNumber(blockNumber + 1), timestamp + 1n, 0 /* numTxs */), - ).rejects.toThrow(`Cannot create a block with 0 txs, unless it's the first block.`); - }); - - it('throws if adding a transaction before starting epoch', async () => { - await expect(async () => await orchestrator.addTxs(block.txs)).rejects.toThrow(/Empty epoch proving state./); - }); - - it('throws if adding a transaction before starting checkpoint', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await expect(async () => await orchestrator.addTxs(block.txs)).rejects.toThrow( - /Proving state for block 1 not found/, - ); - }); - - it('throws if adding a transaction before starting block', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - await expect(async () => await orchestrator.addTxs(block.txs)).rejects.toThrow( - /Proving state for block 1 not found/, - ); - }); - - it('throws if completing a block before start', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - await 
expect(async () => await orchestrator.setBlockCompleted(block.header.getBlockNumber())).rejects.toThrow( - /Block proving state for 1 not found/, - ); - }); - - it('throws if adding to a cancelled block', async () => { - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, 1); - orchestrator.cancel(); - - await expect(async () => await orchestrator.addTxs(block.txs)).rejects.toThrow('World state fork for block'); - }); - - it('rejects if too many l1 to l2 messages are provided', async () => { - const l1ToL2Messages = new Array(NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP + 1).fill(new Fr(0n)); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - await expect( - async () => - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - l1ToL2Messages, - numBlocks, - previousBlockHeader, - ), - ).rejects.toThrow('Too many L1 to L2 messages'); - }); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_failures.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_failures.test.ts deleted file mode 100644 index 0c127406b133..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_failures.test.ts +++ /dev/null @@ -1,191 +0,0 @@ -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { timesAsync } from '@aztec/foundation/collection'; -import { createLogger } from '@aztec/foundation/log'; -import type { ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; - -import { jest } from '@jest/globals'; - -import { TestContext } from '../mocks/test_context.js'; -import type { ProvingOrchestrator } from './orchestrator.js'; - -const logger = createLogger('prover-client:test:orchestrator-failures'); -const 
LONG_TIMEOUT = 600_000; - -describe('prover/orchestrator/failures', () => { - let context: TestContext; - let orchestrator: ProvingOrchestrator; - let prover: ServerCircuitProver; - - beforeEach(async () => { - context = await TestContext.new(logger); - }); - - afterEach(async () => { - await context.cleanup(); - }); - - describe('error handling', () => { - beforeEach(() => { - ({ prover, orchestrator } = context); - }); - - const run = async ( - message: string, - { - numCheckpoints = 1, - numBlocksPerCheckpoint = 1, - numTxsPerBlock = 0, - numL1ToL2Messages = 0, - privateOnly = true, - }: { - numCheckpoints?: number; - numBlocksPerCheckpoint?: number; - numTxsPerBlock?: number; - numL1ToL2Messages?: number; - privateOnly?: boolean; - } = {}, - ) => { - const checkpoints = await timesAsync(numCheckpoints, () => - context.makeCheckpoint(numBlocksPerCheckpoint, { - numTxsPerBlock, - numL1ToL2Messages, - makeProcessedTxOpts: () => ({ privateOnly }), - }), - ); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - for (let checkpointIndex = 0; checkpointIndex < checkpoints.length; checkpointIndex++) { - const { constants, blocks, l1ToL2Messages, previousBlockHeader } = checkpoints[checkpointIndex]; - // these operations could fail if the target circuit fails before adding all blocks or txs - try { - await orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - blocks.length, - previousBlockHeader, - ); - - for (const block of blocks) { - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - - let allTxsAdded = true; - try { - await orchestrator.addTxs(block.txs); - } catch { - allTxsAdded = false; - break; - } - - if (!allTxsAdded) { - await expect(orchestrator.setBlockCompleted(blockNumber)).rejects.toThrow( - `Block proving failed: ${message}`, - ); - } else 
{ - await orchestrator.setBlockCompleted(blockNumber); - } - } - } catch { - break; - } - } - }; - - it( - 'succeeds without failed proof', - async () => { - await run('successful case'); - await expect(orchestrator.finalizeEpoch()).resolves.not.toThrow(); - }, - LONG_TIMEOUT, - ); - - it.each([ - [ - 'Private Base Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getPrivateTxBaseRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 1, privateOnly: true }, - ], - [ - 'Public Base Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getPublicTxBaseRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 1, privateOnly: false }, - ], - [ - 'Tx Merge Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getTxMergeRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 3 }, // Need at least 3 txs to use a tx merge rollup. - ], - [ - 'Block Root First Empty Tx Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getBlockRootEmptyTxFirstRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 0, numL1ToL2Messages: 1 }, - ], - [ - 'Block Root First Single Tx Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getBlockRootSingleTxFirstRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 1 }, - ], - [ - 'Block Root First Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getBlockRootFirstRollupProof').mockRejectedValue(msg), - { numTxsPerBlock: 2 }, // Need at least 2 txs to use a block root first rollup. 
- ], - [ - 'Checkpoint Root Single Block Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getCheckpointRootSingleBlockRollupProof').mockRejectedValue(msg), - ], - [ - 'Checkpoint Root Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getCheckpointRootRollupProof').mockRejectedValue(msg), - { numBlocksPerCheckpoint: 2, numTxsPerBlock: 1 }, - ], - [ - 'Checkpoint Merge Rollup Failed', - (msg: string) => jest.spyOn(prover, 'getCheckpointMergeRollupProof').mockRejectedValue(msg), - { numCheckpoints: 3 }, // Need at least 3 checkpoints to use a checkpoint merge rollup. - ], - ['Root Rollup Failed', (msg: string) => jest.spyOn(prover, 'getRootRollupProof').mockRejectedValue(msg)], - [ - 'Base Parity Failed', - (msg: string) => jest.spyOn(prover, 'getBaseParityProof').mockRejectedValue(msg), - { - numL1ToL2Messages: 1, - }, - ], - [ - 'Root Parity Failed', - (msg: string) => jest.spyOn(prover, 'getRootParityProof').mockRejectedValue(msg), - { - numL1ToL2Messages: 1, - }, - ], - ] as const)( - 'handles a %s error', - async ( - message: string, - makeFailedProof: (msg: string) => void, - opts: Partial[1]> = {}, - ) => { - /** - * NOTE: these tests start a new epoch with N blocks. Each block will have M txs in it. - * Txs are proven in parallel and as soon as one fails (which is what this test is setting up to happen) - * the orchestrator stops accepting txs in a block. - * This means we have to be careful with our assertions as the order in which things happen is non-deterministic. 
- * We need to expect - * - addTx to fail (because a block's provingState became invalid) - * - addTx to work fine (because we haven't hit the error in the test setup) but the epoch to fail - */ - makeFailedProof(message); - - await run(message, opts); - - await expect(() => orchestrator.finalizeEpoch()).rejects.toThrow(); - }, - LONG_TIMEOUT, - ); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_lifecycle.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_lifecycle.test.ts deleted file mode 100644 index 687f294711ed..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_lifecycle.test.ts +++ /dev/null @@ -1,111 +0,0 @@ -import { TestCircuitProver } from '@aztec/bb-prover'; -import { NUM_BASE_PARITY_PER_ROOT_PARITY } from '@aztec/constants'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { EthAddress } from '@aztec/foundation/eth-address'; -import { createLogger } from '@aztec/foundation/log'; -import { type PromiseWithResolvers, promiseWithResolvers } from '@aztec/foundation/promise'; -import { sleep } from '@aztec/foundation/sleep'; -import type { ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; - -import { jest } from '@jest/globals'; - -import { TestContext } from '../mocks/test_context.js'; -import { ProvingOrchestrator } from './orchestrator.js'; - -const logger = createLogger('prover-client:test:orchestrator-lifecycle'); - -describe('prover/orchestrator/lifecycle', () => { - let context: TestContext; - - beforeEach(async () => { - context = await TestContext.new(logger); - }); - - afterEach(async () => { - await context.cleanup(); - }); - - describe('lifecycle', () => { - it('cancels proving requests', async () => { - const prover: ServerCircuitProver = new TestCircuitProver(); - // Pass cancelJobsOnStop=true to test that cancellation actually aborts jobs - const orchestrator = new ProvingOrchestrator(context.worldState, prover, EthAddress.ZERO, true, 
10); - - const spy = jest.spyOn(prover, 'getBaseParityProof'); - const deferredPromises: PromiseWithResolvers[] = []; - spy.mockImplementation(() => { - const deferred = promiseWithResolvers(); - deferredPromises.push(deferred); - return deferred.promise; - }); - - const { - constants, - blocks: [block], - previousBlockHeader, - } = await context.makeCheckpoint(1, { - numTxsPerBlock: 0, - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - 1, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, 0); - - await sleep(1); - - expect(spy).toHaveBeenCalledTimes(NUM_BASE_PARITY_PER_ROOT_PARITY); - expect(spy.mock.calls.every(([_, signal]) => !signal?.aborted)).toBeTruthy(); - - orchestrator.cancel(); - expect(spy.mock.calls.every(([_, signal]) => signal?.aborted)).toBeTruthy(); - }); - - it('does not abort proving requests when cancelJobsOnStop is false (default)', async () => { - const prover: ServerCircuitProver = new TestCircuitProver(); - // Default behavior: cancelJobsOnStop=false, jobs remain in queue for reuse - const orchestrator = new ProvingOrchestrator(context.worldState, prover, EthAddress.ZERO, false, 10); - - const spy = jest.spyOn(prover, 'getBaseParityProof'); - const deferredPromises: PromiseWithResolvers[] = []; - spy.mockImplementation(() => { - const deferred = promiseWithResolvers(); - deferredPromises.push(deferred); - return deferred.promise; - }); - - const { - constants, - blocks: [block], - previousBlockHeader, - } = await context.makeCheckpoint(1, { - numTxsPerBlock: 0, - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint(0, 
constants, [], 1, previousBlockHeader); - - const { blockNumber, timestamp } = block.header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, 0); - - await sleep(1); - - expect(spy).toHaveBeenCalledTimes(NUM_BASE_PARITY_PER_ROOT_PARITY); - expect(spy.mock.calls.every(([_, signal]) => !signal?.aborted)).toBeTruthy(); - - orchestrator.cancel(); - expect(spy.mock.calls.every(([_, signal]) => !signal?.aborted)).toBeTruthy(); - }); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_multiple_checkpoints.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_multiple_checkpoints.test.ts deleted file mode 100644 index 7b013bf0a62a..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_multiple_checkpoints.test.ts +++ /dev/null @@ -1,135 +0,0 @@ -import { MAX_CHECKPOINTS_PER_EPOCH } from '@aztec/constants'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { padArrayEnd, timesAsync } from '@aztec/foundation/collection'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import { createLogger } from '@aztec/foundation/log'; - -import { TestContext } from '../mocks/test_context.js'; - -const logger = createLogger('prover-client:test:orchestrator-multi-checkpoints'); - -const LONG_TIMEOUT = 600_000; - -describe('prover/orchestrator/multi-checkpoints', () => { - let context: TestContext; - - beforeEach(async () => { - context = await TestContext.new(logger); - context.orchestrator.isVerifyBuiltBlockAgainstSyncedStateEnabled = true; - }); - - afterEach(async () => { - await context.cleanup(); - }); - - describe('multiple checkpoints ', () => { - it.each([4, 5, MAX_CHECKPOINTS_PER_EPOCH])( - 'builds an epoch with %s checkpoints in sequence', - async (numCheckpoints: number) => { - const numBlocksPerCheckpoint = 1; - const numTxsPerBlock = 1; - logger.info(`Seeding world state with ${numCheckpoints * numBlocksPerCheckpoint} blocks`); - const checkpoints = await 
timesAsync(numCheckpoints, () => - context.makeCheckpoint(numBlocksPerCheckpoint, { numTxsPerBlock }), - ); - - logger.info(`Starting new epoch with ${numCheckpoints} checkpoints`); - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - for (let i = 0; i < checkpoints.length; i++) { - const { - constants, - blocks: [block], - previousBlockHeader, - } = checkpoints[i]; - await context.orchestrator.startNewCheckpoint( - i, // checkpointIndex - constants, - [], // l1ToL2Messages - numBlocksPerCheckpoint, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await context.orchestrator.addTxs(block.txs); - await context.orchestrator.setBlockCompleted(blockNumber, block.header); - } - - logger.info('Finalizing epoch'); - const epoch = await context.orchestrator.finalizeEpoch(); - expect(epoch.proof).toBeDefined(); - - const headerHashes = checkpoints.map(c => c.header.hash()); - expect(epoch.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd(headerHashes, Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - }, - LONG_TIMEOUT, - ); - - it( - 'builds two consecutive epochs', - async () => { - const numEpochs = 2; - const numCheckpointsPerEpoch = 3; - const numBlocksPerCheckpoint = 1; - const numTxsPerBlock = 1; - logger.info(`Seeding world state with ${numEpochs * numCheckpointsPerEpoch * numBlocksPerCheckpoint} blocks`); - const epochs = await timesAsync(numEpochs, async () => { - const checkpoints = await timesAsync(numCheckpointsPerEpoch, () => - context.makeCheckpoint(numBlocksPerCheckpoint, { numTxsPerBlock }), - ); - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.startNewEpoch(); - return { checkpoints, finalBlobChallenges }; - }); - - for (let epochIndex = 0; epochIndex < numEpochs; epochIndex++) { 
- const epochNumber = epochIndex + 1; - const { checkpoints, finalBlobChallenges } = epochs[epochIndex]; - logger.info(`Starting epoch ${epochNumber} with ${checkpoints.length} checkpoints`); - context.orchestrator.startNewEpoch(EpochNumber(epochNumber), checkpoints.length, finalBlobChallenges); - - for (let i = 0; i < checkpoints.length; i++) { - const { - constants, - blocks: [block], - previousBlockHeader, - } = checkpoints[i]; - await context.orchestrator.startNewCheckpoint( - i, // checkpointIndex - constants, - [], - 1 /* numBlocks */, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - // txs must be added for each block sequentially. - await context.orchestrator.addTxs(block.txs); - } - - // setBlockCompleted may be called in parallel, but it must be called after all txs have been added. - await Promise.all( - checkpoints.map(({ blocks: [block] }) => - context.orchestrator.setBlockCompleted(block.header.globalVariables.blockNumber, block.header), - ), - ); - - logger.info('Finalizing epoch'); - const epoch = await context.orchestrator.finalizeEpoch(); - expect(epoch.proof).toBeDefined(); - - const headerHashes = checkpoints.map(c => c.header.hash()); - expect(epoch.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd(headerHashes, Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - } - }, - LONG_TIMEOUT, - ); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_rollup_structure.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_rollup_structure.test.ts deleted file mode 100644 index 3ceb5715adc2..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_rollup_structure.test.ts +++ /dev/null @@ -1,248 +0,0 @@ -import { BatchedBlobAccumulator } from '@aztec/blob-lib'; -import { MAX_CHECKPOINTS_PER_EPOCH, MAX_L2_TO_L1_MSGS_PER_TX } from '@aztec/constants'; -import { 
asyncMap } from '@aztec/foundation/async-map'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { padArrayEnd } from '@aztec/foundation/collection'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import { EthAddress } from '@aztec/foundation/eth-address'; -import { createLogger } from '@aztec/foundation/log'; -import { Gas, GasFees } from '@aztec/stdlib/gas'; -import { ScopedL2ToL1Message, computeEpochOutHash } from '@aztec/stdlib/messaging'; -import { FeeRecipient } from '@aztec/stdlib/rollup'; -import type { ServerCircuitName } from '@aztec/stdlib/stats'; -import { makeScopedL2ToL1Message } from '@aztec/stdlib/testing'; -import { MerkleTreeId } from '@aztec/stdlib/trees'; -import type { GlobalVariables } from '@aztec/stdlib/tx'; - -import { jest } from '@jest/globals'; - -import { TestContext } from '../mocks/test_context.js'; -import { getTreeSnapshot } from './block-building-helpers.js'; - -const logger = createLogger('prover-client:test:orchestrator-single-blocks'); - -describe('prover/orchestrator/rollup-structure', () => { - let context: TestContext; - let proverSpy: Record any>>; - - const mockCoinbase = (checkpointIndex: number) => { - return EthAddress.fromNumber(checkpointIndex + 9876); - }; - - const mockCheckpointGasFees = (checkpointIndex: number) => { - return new GasFees(checkpointIndex + 2, checkpointIndex + 3); - }; - - const mockTxGasUsed = (txIndex: number, blockNumber: number) => { - return Gas.from({ - daGas: (txIndex + 1) * (blockNumber + 2), - l2Gas: (txIndex + 3) * blockNumber, - }); - }; - - const makeL2ToL1Messages = (blockGlobalVariables: GlobalVariables, txIndex: number) => { - // Tweak the l2-to-l1 messages to have different amounts for building the out hashes in various tree shapes. 
- const numL2ToL1Messages = (blockGlobalVariables.blockNumber + txIndex) % (MAX_L2_TO_L1_MSGS_PER_TX + 1); - const messages = Array.from({ length: numL2ToL1Messages }, () => makeScopedL2ToL1Message((txIndex + 1) * 456)); - return padArrayEnd(messages, ScopedL2ToL1Message.empty(), MAX_L2_TO_L1_MSGS_PER_TX); - }; - - beforeEach(async () => { - context = await TestContext.new(logger); - - proverSpy = { - 'parity-base': jest.spyOn(context.prover, 'getBaseParityProof'), - 'parity-root': jest.spyOn(context.prover, 'getRootParityProof'), - 'chonk-verifier-public': jest.spyOn(context.prover, 'getPublicChonkVerifierProof'), - 'avm-circuit': jest.spyOn(context.prover, 'getAvmProof'), - 'rollup-tx-base-public': jest.spyOn(context.prover, 'getPublicTxBaseRollupProof'), - 'rollup-tx-base-private': jest.spyOn(context.prover, 'getPrivateTxBaseRollupProof'), - 'rollup-tx-merge': jest.spyOn(context.prover, 'getTxMergeRollupProof'), - 'rollup-block-root-first': jest.spyOn(context.prover, 'getBlockRootFirstRollupProof'), - 'rollup-block-root-first-single-tx': jest.spyOn(context.prover, 'getBlockRootSingleTxFirstRollupProof'), - 'rollup-block-root-first-empty-tx': jest.spyOn(context.prover, 'getBlockRootEmptyTxFirstRollupProof'), - 'rollup-block-root': jest.spyOn(context.prover, 'getBlockRootRollupProof'), - 'rollup-block-root-single-tx': jest.spyOn(context.prover, 'getBlockRootSingleTxRollupProof'), - 'rollup-block-merge': jest.spyOn(context.prover, 'getBlockMergeRollupProof'), - 'rollup-checkpoint-root': jest.spyOn(context.prover, 'getCheckpointRootRollupProof'), - 'rollup-checkpoint-root-single-block': jest.spyOn(context.prover, 'getCheckpointRootSingleBlockRollupProof'), - 'rollup-checkpoint-padding': jest.spyOn(context.prover, 'getCheckpointPaddingRollupProof'), - 'rollup-checkpoint-merge': jest.spyOn(context.prover, 'getCheckpointMergeRollupProof'), - 'rollup-root': jest.spyOn(context.prover, 'getRootRollupProof'), - }; - }); - - afterEach(async () => { - await 
context.cleanup(); - }); - - describe('rollups the txs/blocks/checkpoints correctly to produce the expected public inputs', () => { - it('wonky checkpoint tree', async () => { - const numTxsPerBlockInCheckpoints = [ - [1, 5, 2], // Checkpoint 0 has 3 blocks, with 1, 5 and 2 txs respectively. - [3], // Checkpoint 1 has 1 block with 3 txs. - [0, 4, 6, 1], // Checkpoint 2 has 4 blocks, with 0, 4, 6, and 1 txs respectively. - ]; - const numBlocksInCheckpoints = numTxsPerBlockInCheckpoints.map(c => c.length); - const numCheckpoints = numTxsPerBlockInCheckpoints.length; - const numL1ToL2Messages = 2; - - const l1ToL2MessagesInEpoch: Fr[][][][] = []; - const epochStartArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, await context.worldState.fork()); - - const expectedFees: FeeRecipient[] = []; - const checkpoints = await asyncMap(numBlocksInCheckpoints, async (numBlocks, checkpointIndex) => { - const numTxsPerBlock = numTxsPerBlockInCheckpoints[checkpointIndex]; - const coinbase = mockCoinbase(checkpointIndex); - const checkpoint = await context.makeCheckpoint(numBlocks, { - numTxsPerBlock, - numL1ToL2Messages, - gasFees: mockCheckpointGasFees(checkpointIndex), - coinbase, - makeProcessedTxOpts: (blockGlobalVariables: GlobalVariables, txIndex: number) => ({ - gasUsed: mockTxGasUsed(txIndex, blockGlobalVariables.blockNumber), - privateOnly: txIndex % 2 === 0, - avmAccumulatedData: { - l2ToL1Msgs: makeL2ToL1Messages(blockGlobalVariables, txIndex), - }, - }), - }); - - l1ToL2MessagesInEpoch[checkpointIndex] = checkpoint.blocks.map(b => b.txs.map(tx => tx.txEffect.l2ToL1Msgs)); - - // Accumulate the fees for the checkpoint, to be compared with the values from the root rollup's public inputs. 
- const totalFee = checkpoint.blocks - .map(b => b.txs) - .flat() - .reduce((acc, tx) => acc.add(tx.txEffect.transactionFee), Fr.ZERO); - expect(totalFee).not.toEqual(Fr.ZERO); - expectedFees.push(new FeeRecipient(coinbase, totalFee)); - - return checkpoint; - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1) /* epochNumber */, numCheckpoints, finalBlobChallenges); - - for (let checkpointIndex = 0; checkpointIndex < checkpoints.length; checkpointIndex++) { - const { constants, blocks, l1ToL2Messages, previousBlockHeader } = checkpoints[checkpointIndex]; - - await context.orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - blocks.length, - previousBlockHeader, - ); - - for (const block of blocks) { - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await context.orchestrator.addTxs(block.txs); - await context.orchestrator.setBlockCompleted(blockNumber, block.header); - } - } - - const result = await context.orchestrator.finalizeEpoch(); - - expect(result.publicInputs.previousArchiveRoot).toEqual(epochStartArchive.root); - - const epochEndArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, await context.worldState.fork()); - expect(result.publicInputs.endArchiveRoot).toEqual(epochEndArchive.root); - - const epochOutHash = computeEpochOutHash(l1ToL2MessagesInEpoch); - expect(result.publicInputs.outHash).toEqual(epochOutHash); - - const expectedCheckpointHeaderHashes = checkpoints.map(c => c.header.hash()); - expect(result.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd(expectedCheckpointHeaderHashes, Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - - expect(result.publicInputs.fees).toEqual( - padArrayEnd(expectedFees, FeeRecipient.empty(), MAX_CHECKPOINTS_PER_EPOCH), - ); - - const batchedBlob = await 
BatchedBlobAccumulator.batch(context.getBlobFields()); - const expectedFinalBlobAccumulator = batchedBlob.toFinalBlobAccumulator(); - expect(result.publicInputs.blobPublicInputs).toEqual(expectedFinalBlobAccumulator); - - // Make sure all the circuits are called except for the checkpoint padding. - for (const circuitName of Object.keys(proverSpy) as ServerCircuitName[]) { - if (circuitName === 'rollup-checkpoint-padding') { - expect(proverSpy[circuitName]).not.toHaveBeenCalled(); - } else { - expect(proverSpy[circuitName]).toHaveBeenCalled(); - } - } - }); - - it('builds a checkpoint with l1 to l2 messages but no txs', async () => { - const numBlocks = 1; - const numL1ToL2Messages = 5; - - const epochStartArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, await context.worldState.fork()); - - const { - constants, - header, - blocks: [block], - l1ToL2Messages, - previousBlockHeader, - } = await context.makeCheckpoint(numBlocks, { - numTxsPerBlock: 0, - numL1ToL2Messages, - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - l1ToL2Messages, - numBlocks, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await context.orchestrator.setBlockCompleted(blockNumber, block.header); - - const result = await context.orchestrator.finalizeEpoch(); - - expect(result.publicInputs.previousArchiveRoot).toEqual(epochStartArchive.root); - - const epochEndArchive = await getTreeSnapshot(MerkleTreeId.ARCHIVE, await context.worldState.fork()); - expect(result.publicInputs.endArchiveRoot).toEqual(epochEndArchive.root); - - expect(result.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd([header.hash()], Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - - 
expect(result.publicInputs.fees).toEqual(Array.from({ length: MAX_CHECKPOINTS_PER_EPOCH }, FeeRecipient.empty)); - - const batchedBlob = await BatchedBlobAccumulator.batch(context.getBlobFields()); - const expectedFinalBlobAccumulator = batchedBlob.toFinalBlobAccumulator(); - expect(result.publicInputs.blobPublicInputs).toEqual(expectedFinalBlobAccumulator); - - const expectedProvenCircuits = [ - 'parity-base', - 'parity-root', - 'rollup-block-root-first-empty-tx', - 'rollup-checkpoint-root-single-block', - 'rollup-checkpoint-padding', - 'rollup-root', - ]; - for (const circuitName of Object.keys(proverSpy) as ServerCircuitName[]) { - if (!expectedProvenCircuits.includes(circuitName)) { - expect(proverSpy[circuitName]).not.toHaveBeenCalled(); - } else if (circuitName === 'parity-base') { - // 1 proof with messages, 1 proof with empty messages. - expect(proverSpy[circuitName]).toHaveBeenCalledTimes(2); - } else { - expect(proverSpy[circuitName]).toHaveBeenCalledTimes(1); - } - } - }); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_single_blocks.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_single_blocks.test.ts deleted file mode 100644 index 79fd868b090e..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_single_blocks.test.ts +++ /dev/null @@ -1,107 +0,0 @@ -import { NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP } from '@aztec/constants'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { createLogger } from '@aztec/foundation/log'; - -import { TestContext } from '../mocks/test_context.js'; - -const logger = createLogger('prover-client:test:orchestrator-single-blocks'); - -describe('prover/orchestrator/blocks', () => { - let context: TestContext; - - beforeEach(async () => { - context = await TestContext.new(logger); - }); - - afterEach(async () => { - await context.cleanup(); - }); - - describe('blocks', () => { - it('builds an empty L2 block', async () => { - const { - 
constants, - blocks: [emptyBlock], - previousBlockHeader, - } = await context.makeCheckpoint(1, { numTxsPerBlock: 0 }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - 1, // numBlocks - previousBlockHeader, - ); - - const { blockNumber, timestamp } = emptyBlock.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, 0 /* numTxs */); - - const header = await context.orchestrator.setBlockCompleted(blockNumber, emptyBlock.header); - await context.orchestrator.finalizeEpoch(); - expect(header).toEqual(emptyBlock.header); - }); - - it('builds a block with 1 transaction', async () => { - const { - constants, - blocks: [block], - previousBlockHeader, - } = await context.makeCheckpoint(1, { numTxsPerBlock: 1 }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - 1, // numBlocks - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await context.orchestrator.addTxs(block.txs); - - const header = await context.orchestrator.setBlockCompleted(blockNumber, block.header); - await context.orchestrator.finalizeEpoch(); - expect(header).toEqual(block.header); - }); - - it('builds a block concurrently with transaction simulation', async () => { - const { - constants, - blocks: [block], - l1ToL2Messages, - previousBlockHeader, - } = await context.makeCheckpoint(1, { - numTxsPerBlock: 4, - numL1ToL2Messages: NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP, - makeProcessedTxOpts: (_, txIndex) => ({ privateOnly: txIndex % 2 === 0 }), - }); - - const 
finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - l1ToL2Messages, - 1, // numBlocks - previousBlockHeader, - ); - - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - - await context.orchestrator.addTxs(block.txs); - - const header = await context.orchestrator.setBlockCompleted(blockNumber, block.header); - await context.orchestrator.finalizeEpoch(); - expect(header).toEqual(block.header); - }); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_single_checkpoint.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_single_checkpoint.test.ts deleted file mode 100644 index be7a9b017843..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_single_checkpoint.test.ts +++ /dev/null @@ -1,115 +0,0 @@ -import { MAX_CHECKPOINTS_PER_EPOCH } from '@aztec/constants'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { padArrayEnd } from '@aztec/foundation/collection'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import { createLogger } from '@aztec/foundation/log'; -import { getTestData, isGenerateTestDataEnabled } from '@aztec/foundation/testing'; -import { updateProtocolCircuitSampleInputs } from '@aztec/foundation/testing/files'; - -import TOML from '@iarna/toml'; - -import { TestContext } from '../mocks/test_context.js'; - -const logger = createLogger('prover-client:test:orchestrator-single-blocks'); - -describe('prover/orchestrator/single-checkpoint', () => { - let context: TestContext; - - beforeEach(async () => { - context = await TestContext.new(logger); - }); - - afterEach(async () => { - await context.cleanup(); - }); - - it('builds a checkpoint with l1 to l2 messages but no txs, followed by a 
block with 3 txs', async () => { - const numCheckpoints = 1; - const numBlocks = 2; - const numTxsPerBlock = [0, 3]; - const numL1ToL2Messages = 2; - const { constants, blocks, l1ToL2Messages, previousBlockHeader, header } = await context.makeCheckpoint(numBlocks, { - numTxsPerBlock, - numL1ToL2Messages, - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - l1ToL2Messages, - numBlocks, - previousBlockHeader, - ); - - for (const block of blocks) { - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - if (block.txs.length > 0) { - await context.orchestrator.addTxs(block.txs); - } - await context.orchestrator.setBlockCompleted(blockNumber, block.header); - } - - const epoch = await context.orchestrator.finalizeEpoch(); - expect(epoch.proof).toBeDefined(); - - expect(epoch.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd([header.hash()], Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - - if (isGenerateTestDataEnabled()) { - // These are the circuits that are not executed in prover/full.test.ts - ['rollup-tx-merge', 'rollup-block-root-first-empty-tx', 'rollup-block-root'].forEach(circuitName => { - const data = getTestData(circuitName); - updateProtocolCircuitSampleInputs(circuitName, TOML.stringify(data[0] as any)); - }); - } - }); - - it('builds a checkpoint with multiple blocks', async () => { - const numCheckpoints = 1; - const numBlocks = 3; - const numTxsPerBlock = 1; - const numL1ToL2Messages = 2; - const { constants, blocks, l1ToL2Messages, previousBlockHeader, header } = await context.makeCheckpoint(numBlocks, { - numTxsPerBlock, - numL1ToL2Messages, - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - 
context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - await context.orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - l1ToL2Messages, - numBlocks, - previousBlockHeader, - ); - - for (const block of blocks) { - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, block.txs.length); - await context.orchestrator.addTxs(block.txs); - await context.orchestrator.setBlockCompleted(blockNumber, block.header); - } - - const epoch = await context.orchestrator.finalizeEpoch(); - expect(epoch.proof).toBeDefined(); - - expect(epoch.publicInputs.checkpointHeaderHashes).toEqual( - padArrayEnd([header.hash()], Fr.ZERO, MAX_CHECKPOINTS_PER_EPOCH), - ); - - if (isGenerateTestDataEnabled()) { - // These are the circuits that are not executed in prover/full.test.ts - ['rollup-block-root-single-tx', 'rollup-block-merge', 'rollup-checkpoint-root'].forEach(circuitName => { - const data = getTestData(circuitName); - updateProtocolCircuitSampleInputs(circuitName, TOML.stringify(data[0] as any)); - }); - } - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/orchestrator_workflow.test.ts b/yarn-project/prover-client/src/orchestrator/orchestrator_workflow.test.ts deleted file mode 100644 index f62184a6dbfd..000000000000 --- a/yarn-project/prover-client/src/orchestrator/orchestrator_workflow.test.ts +++ /dev/null @@ -1,318 +0,0 @@ -import { NESTED_RECURSIVE_PROOF_LENGTH, RECURSIVE_PROOF_LENGTH } from '@aztec/constants'; -import { EpochNumber } from '@aztec/foundation/branded-types'; -import { timesAsync } from '@aztec/foundation/collection'; -import { createLogger } from '@aztec/foundation/log'; -import { promiseWithResolvers } from '@aztec/foundation/promise'; -import { retryFastUntil } from '@aztec/foundation/retry'; -import { ProtocolCircuitVks } from '@aztec/noir-protocol-circuits-types/server/vks'; -import { - type 
PublicInputsAndRecursiveProof, - type ServerCircuitProver, - makePublicInputsAndRecursiveProof, -} from '@aztec/stdlib/interfaces/server'; -import type { ParityPublicInputs } from '@aztec/stdlib/parity'; -import { ChonkProof, makeRecursiveProof } from '@aztec/stdlib/proofs'; -import { makeParityPublicInputs } from '@aztec/stdlib/testing'; -import { Tx } from '@aztec/stdlib/tx'; - -import { jest } from '@jest/globals'; -import { type MockProxy, mock } from 'jest-mock-extended'; - -import { TestContext } from '../mocks/test_context.js'; -import type { ProvingOrchestrator } from './orchestrator.js'; - -const logger = createLogger('prover-client:test:orchestrator-workflow'); - -describe('prover/orchestrator', () => { - describe('workflow', () => { - let orchestrator: ProvingOrchestrator; - let context: TestContext; - - afterEach(async () => { - await context.cleanup(); - }); - - describe('with mock prover', () => { - let mockProver: MockProxy; - - beforeEach(async () => { - mockProver = mock(); - context = await TestContext.new(logger, { - proverCount: 4, - createProver: () => Promise.resolve(mockProver), - }); - ({ orchestrator } = context); - }); - - it('calls root parity circuit only when ready', async () => { - // create a custom L2 to L1 message - const numL1ToL2Messages = 1; - const { - constants, - blocks: [{ header }], - l1ToL2Messages, - previousBlockHeader, - } = await context.makeCheckpoint(1, { numTxsPerBlock: 0, numL1ToL2Messages }); - - const message = l1ToL2Messages[0]; - - // and delay its proof - const pendingBaseParityResult = - promiseWithResolvers>(); - const expectedBaseParityResult = makePublicInputsAndRecursiveProof( - makeParityPublicInputs(0xff), - makeRecursiveProof(RECURSIVE_PROOF_LENGTH), - ProtocolCircuitVks.ParityBaseArtifact, - ); - - mockProver.getRootParityProof.mockResolvedValue( - makePublicInputsAndRecursiveProof( - makeParityPublicInputs(), - makeRecursiveProof(NESTED_RECURSIVE_PROOF_LENGTH), - ProtocolCircuitVks.ParityRootArtifact, 
- ), - ); - - mockProver.getBaseParityProof.mockImplementation(inputs => { - if (inputs.msgs[0].equals(message)) { - return pendingBaseParityResult.promise; - } else { - return Promise.resolve( - makePublicInputsAndRecursiveProof( - makeParityPublicInputs(), - makeRecursiveProof(RECURSIVE_PROOF_LENGTH), - ProtocolCircuitVks.ParityBaseArtifact, - ), - ); - } - }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [message], - 1, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, 0 /* numTxs */); - - // the prover broker deduplicates jobs, so the base parity proof - // for the three sets empty messages is called only once. so total - // calls is one for the empty messages and one for the custom message. - await retryFastUntil( - () => mockProver.getBaseParityProof.mock.calls.length === 2, - 'base parity proofs to be called', - ); - expect(mockProver.getRootParityProof).not.toHaveBeenCalled(); - - // only after the base parity proof is resolved, the root parity should be called - pendingBaseParityResult.resolve(expectedBaseParityResult); - - // give the orchestrator a chance to calls its callbacks - await retryFastUntil( - () => mockProver.getRootParityProof.mock.calls.length === 1, - 'root parity proof to be called', - ); - - orchestrator.cancel(); - }); - }); - - describe('with simulated prover', () => { - let prover: ServerCircuitProver; - - beforeEach(async () => { - context = await TestContext.new(logger); - ({ prover, orchestrator } = context); - }); - - it('waits for block to be completed before enqueueing block root proof', async () => { - const numBlocks = 1; - const { - constants, - blocks: [{ header, txs }], - previousBlockHeader, - } = await context.makeCheckpoint(numBlocks); - - const 
finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - - // now finish the block - await orchestrator.setBlockCompleted(blockNumber); - - const result = await orchestrator.finalizeEpoch(); - expect(result.proof).toBeDefined(); - }); - - it('cleans up all world state forks', async () => { - const numBlocks = 1; - const { - constants, - blocks: [{ header, txs }], - previousBlockHeader, - } = await context.makeCheckpoint(numBlocks); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - - const { blockNumber, timestamp } = header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - - // now finish the block - await orchestrator.setBlockCompleted(blockNumber); - - const result = await orchestrator.finalizeEpoch(); - expect(result.proof).toBeDefined(); - // Forks are closed deterministically in setBlockCompleted, so no cancel() needed. - expect(orchestrator.getNumActiveForks()).toEqual(0); - }); - - it('can start chonk verifier proofs before adding processed txs', async () => { - const getChonkVerifierSpy = jest.spyOn(prover, 'getPublicChonkVerifierProof'); - - const numBlocks = 1; - const { - constants, - blocks: [{ header, txs }], - previousBlockHeader, - } = await context.makeCheckpoint(numBlocks, { - numTxsPerBlock: 2, - makeProcessedTxOpts: () => ({ privateOnly: false }), // The chonk verifier circuit is only used for public txs. 
- }); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - orchestrator.startNewEpoch(EpochNumber(1), 1, finalBlobChallenges); - - await orchestrator.startNewCheckpoint( - 0, // checkpointIndex - constants, - [], - numBlocks, - previousBlockHeader, - ); - - txs.forEach(tx => (tx.chonkProof = ChonkProof.random())); - await orchestrator.startChonkVerifierCircuits( - txs.map(tx => - Tx.from({ - txHash: tx.hash, - data: tx.data, - chonkProof: tx.chonkProof, - contractClassLogFields: [], - publicFunctionCalldata: [], - }), - ), - ); - - await retryFastUntil(() => getChonkVerifierSpy.mock.calls.length === 2, 'chonk verifier proofs to be called'); - getChonkVerifierSpy.mockReset(); - - const { blockNumber, timestamp } = header.globalVariables; - await orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - await orchestrator.addTxs(txs); - await orchestrator.setBlockCompleted(blockNumber); - - const result = await orchestrator.finalizeEpoch(); - expect(result.proof).toBeDefined(); - expect(getChonkVerifierSpy).toHaveBeenCalledTimes(0); - }); - - it('can add checkpoints in arbitrary order', async () => { - const numCheckpoints = 3; - const numBlocksPerCheckpoint = 2; - const numTxsPerBlock = 2; - const checkpoints = await timesAsync(numCheckpoints, () => - context.makeCheckpoint(numBlocksPerCheckpoint, { - numTxsPerBlock, - }), - ); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - // Start checkpoint in reverse order. - for (let checkpointIndex = numCheckpoints - 1; checkpointIndex >= 0; checkpointIndex--) { - const { constants, blocks, l1ToL2Messages, previousBlockHeader } = checkpoints[checkpointIndex]; - await context.orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - blocks.length, - previousBlockHeader, - ); - - // Blocks in a checkpoint need to be started in order. 
- for (const block of blocks) { - const { txs } = block; - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - await context.orchestrator.addTxs(txs); - await context.orchestrator.setBlockCompleted(blockNumber); - } - } - - logger.info('Finalizing epoch'); - const epoch = await context.orchestrator.finalizeEpoch(); - expect(epoch.proof).toBeDefined(); - }); - - it('can add checkpoints asynchronously', async () => { - const numCheckpoints = 4; - const numBlocksPerCheckpoint = 2; - const numTxsPerBlock = 1; - const checkpoints = await timesAsync(numCheckpoints, () => - context.makeCheckpoint(numBlocksPerCheckpoint, { numTxsPerBlock }), - ); - - const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - await Promise.all( - checkpoints.map(async (checkpoint, checkpointIndex) => { - const { constants, blocks, l1ToL2Messages, previousBlockHeader } = checkpoint; - await context.orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - blocks.length, - previousBlockHeader, - ); - - // Blocks in a checkpoint need to be added in order. 
- for (const block of blocks) { - const { txs } = block; - const { blockNumber, timestamp } = block.header.globalVariables; - await context.orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - await context.orchestrator.addTxs(txs); - await context.orchestrator.setBlockCompleted(blockNumber); - } - }), - ); - }); - }); - }); -}); diff --git a/yarn-project/prover-client/src/orchestrator/proving-scheduler.ts b/yarn-project/prover-client/src/orchestrator/proving-scheduler.ts index 75561d9e911a..dd608474da94 100644 --- a/yarn-project/prover-client/src/orchestrator/proving-scheduler.ts +++ b/yarn-project/prover-client/src/orchestrator/proving-scheduler.ts @@ -4,9 +4,9 @@ import { SerialQueue } from '@aztec/foundation/queue'; import { sleep } from '@aztec/foundation/sleep'; /** - * Minimal surface a deferred-proving state must expose. Both `EpochProvingState` / - * `CheckpointProvingState` / `BlockProvingState` (used by `ProvingOrchestrator`) and - * `TopTreeProvingState` (used by `TopTreeOrchestrator`) satisfy it. + * Minimal surface a deferred-proving state must expose. Both `CheckpointProvingState` / + * `BlockProvingState` (used by `CheckpointSubTreeOrchestrator`) and `TopTreeProvingState` + * (used by `TopTreeOrchestrator`) satisfy it. */ export interface ProvingStateLike { /** Returns false once the state has been cancelled or otherwise invalidated. */ @@ -19,7 +19,16 @@ export interface ProvingStateLike { * Common scheduling infrastructure shared by every orchestrator that drives broker * proving jobs: * - * - One `SerialQueue` (`deferredJobQueue`) acting as the enqueue-throttle. + * - A shared `SerialQueue` (`deferredJobQueue`) that serialises the *act of handing a + * job to the broker*, one initiation per event-loop tick. Each queue task kicks off + * `safeJob` (which submits to the broker) without awaiting it, then yields with + * `sleep(0)`; the next task therefore runs on the following macrotask. 
This does NOT + * cap how many broker jobs are concurrently in flight — the broker's own queue absorbs + * that. What it bounds is the burst rate: a sub-tree that synchronously discovers + * thousands of ready jobs can't flood the broker (and monopolise the event loop) in a + * single tick. The queue is owned by the `ProverClient` and shared across every + * orchestrator (every sub-tree and top-tree across every concurrent epoch session), so + * this pacing is applied once globally rather than once-per-orchestrator. * - A list of `AbortController`s (`pendingProvingJobs`) so a `cancel()` can abort * in-flight broker jobs when needed. * - A `deferredProving(state, request, callback, isCancelled?)` method that wraps @@ -28,23 +37,20 @@ export interface ProvingStateLike { * * Subclasses own their own concrete proving state and define `cancelInternal()` for * the rest of the cleanup work (closing world-state forks, marking sub-trees - * cancelled, etc.). `stop()` lives on the base class and follows the standard pattern - * of grabbing the old queue, calling `cancelInternal()` (which recreates the queue), - * and awaiting the old queue's drain. + * cancelled, etc.). Because the queue is shared, neither `cancel()` nor `stop()` touch + * it — they only abort this orchestrator's in-flight broker jobs. Queued-but-unrun jobs + * for a cancelled orchestrator no-op via the guards in `deferredProving`. */ export abstract class ProvingScheduler { protected pendingProvingJobs: AbortController[] = []; protected logger: Logger; - private deferredJobQueue: SerialQueue; constructor( - private readonly enqueueConcurrency: number, + private readonly deferredJobQueue: SerialQueue, loggerName = 'prover-client:proving-scheduler', bindings?: LoggerBindings, ) { this.logger = createLogger(loggerName, bindings); - this.deferredJobQueue = new SerialQueue(); - this.deferredJobQueue.start(this.enqueueConcurrency); } /** Number of broker jobs currently in flight. 
*/ @@ -53,16 +59,14 @@ export abstract class ProvingScheduler { } /** - * Drains the deferred-job queue, recreates it (so the subclass can be reused), and - * optionally aborts every in-flight broker job. Aborting is the right choice on + * Optionally aborts every in-flight broker job. Aborting is the right choice on * reorg-driven cancel (where the in-flight inputs are no longer valid) and the * wrong choice on shutdown (where leaving jobs in the broker queue lets a restart - * pick them up). + * pick them up). The shared queue is not touched — queued-but-unrun jobs belonging + * to this orchestrator no-op once their controller is aborted (or once their state + * is marked invalid by the subclass). */ protected resetSchedulerState(abortJobs: boolean): void { - void this.deferredJobQueue.cancel(); - this.deferredJobQueue = new SerialQueue(); - this.deferredJobQueue.start(this.enqueueConcurrency); if (abortJobs) { for (const controller of this.pendingProvingJobs) { controller.abort(); @@ -78,14 +82,12 @@ export abstract class ProvingScheduler { protected abstract cancelInternal(): void; /** - * Standard stop: grab the old queue, cancel (which recreates the queue), then - * await the old queue's drain so any final job tear-down has unwound before we - * return. + * Standard stop: cancel this orchestrator's work. The shared queue is owned by the + * `ProverClient` and outlives every orchestrator, so it is not drained here. */ - public async stop(): Promise { - const oldQueue = this.deferredJobQueue; + public stop(): Promise { this.cancelInternal(); - await oldQueue.cancel(); + return Promise.resolve(); } /** @@ -147,9 +149,11 @@ export abstract class ProvingScheduler { }; void this.deferredJobQueue.put(async () => { + // Kick off the broker submission without awaiting it — awaiting here would serialise all + // proving (one job at a time) and kill parallelism. 
The `sleep(0)` yields the event loop + // so the next queued job initiates on the following macrotask, pacing bursts rather than + // bounding in-flight broker concurrency (the broker's own queue handles that). void safeJob(); - // Yield to the macrotask queue so Node has a chance to interleave other work - // between enqueues. await sleep(0); }); } diff --git a/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.test.ts b/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.test.ts index 0f52cd656e24..9be914f034b2 100644 --- a/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.test.ts +++ b/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.test.ts @@ -1,15 +1,29 @@ +import { MAX_L2_TO_L1_MSGS_PER_TX } from '@aztec/constants'; import { EpochNumber } from '@aztec/foundation/branded-types'; +import { padArrayEnd } from '@aztec/foundation/collection'; import { EthAddress } from '@aztec/foundation/eth-address'; import { createLogger } from '@aztec/foundation/log'; import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { retryUntil } from '@aztec/foundation/retry'; +import { sleep } from '@aztec/foundation/sleep'; +import { ScopedL2ToL1Message, computeEpochOutHash } from '@aztec/stdlib/messaging'; +import { makeScopedL2ToL1Message } from '@aztec/stdlib/testing'; -import { TestContext } from '../mocks/test_context.js'; +import { TestContext, makeTestDeferredJobQueue } from '../mocks/test_context.js'; import { CheckpointSubTreeOrchestrator } from './checkpoint-sub-tree-orchestrator.js'; -import { EpochProvingContext } from './epoch-proving-context.js'; +import { ChonkCache } from './chonk-cache.js'; import { type CheckpointTopTreeData, TopTreeCancelledError, TopTreeOrchestrator } from './top-tree-orchestrator.js'; const logger = createLogger('prover-client:test:top-tree-orchestrator'); +/** A full tx-worth of L2-to-L1 messages, padded to the per-tx maximum. 
*/ +const makeL2ToL1Messages = (count: number) => + padArrayEnd( + Array.from({ length: count }, (_, i) => makeScopedL2ToL1Message((i + 1) * 321)), + ScopedL2ToL1Message.empty(), + MAX_L2_TO_L1_MSGS_PER_TX, + ); + /** * End-to-end exercises for `TopTreeOrchestrator`. Each test drives one or more * `CheckpointSubTreeOrchestrator`s to produce block proofs, then feeds them into a @@ -31,17 +45,24 @@ describe('prover/orchestrator/top-tree', () => { * Drives a single checkpoint through `CheckpointSubTreeOrchestrator` and returns * the assembled `CheckpointTopTreeData` plus the originating checkpoint metadata. */ - async function driveSubTree(numBlocks: number, numTxsPerBlock: number, numL1ToL2Messages = 0) { - const fixture = await context.makeCheckpoint(numBlocks, { numTxsPerBlock, numL1ToL2Messages }); + async function driveSubTree(numBlocks: number, numTxsPerBlock: number, numL1ToL2Messages = 0, numL2ToL1Messages = 0) { + const fixture = await context.makeCheckpoint(numBlocks, { + numTxsPerBlock, + numL1ToL2Messages, + makeProcessedTxOpts: + numL2ToL1Messages > 0 + ? 
() => ({ privateOnly: false, avmAccumulatedData: { l2ToL1Msgs: makeL2ToL1Messages(numL2ToL1Messages) } }) + : undefined, + }); - const epochContext = new EpochProvingContext(context.prover, EpochNumber(1)); const subTree = await CheckpointSubTreeOrchestrator.start( context.worldState, context.prover, EthAddress.ZERO, - epochContext, + new ChonkCache(), + EpochNumber(1), false, - 10, + makeTestDeferredJobQueue(), fixture.constants, fixture.l1ToL2Messages, numBlocks, @@ -60,7 +81,6 @@ describe('prover/orchestrator/top-tree', () => { const result = await resultPromise; await subTree.stop(); - epochContext.stop(); const topTreeData: CheckpointTopTreeData = { blockProofs: Promise.resolve(result.blockProofOutputs), @@ -77,7 +97,7 @@ describe('prover/orchestrator/top-tree', () => { const { topTreeData } = await driveSubTree(1, 1); const challenges = await context.getFinalBlobChallenges(); - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); try { const result = await topTree.prove(EpochNumber(1), 1, challenges, [topTreeData]); expect(result.proof).toBeDefined(); @@ -93,7 +113,7 @@ describe('prover/orchestrator/top-tree', () => { const b = await driveSubTree(1, 1); const challenges = await context.getFinalBlobChallenges(); - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); try { const result = await topTree.prove(EpochNumber(1), 2, challenges, [a.topTreeData, b.topTreeData]); expect(result.proof).toBeDefined(); @@ -102,6 +122,43 @@ describe('prover/orchestrator/top-tree', () => { } }); + it('produces an epoch proof for a checkpoint carrying L1-to-L2 messages', async () => { + // L1-to-L2 (cross-chain) messages must survive the full sub-tree → top-tree path (A-1039). 
+ const { topTreeData } = await driveSubTree(1, 1, 3); + const challenges = await context.getFinalBlobChallenges(); + + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); + try { + const result = await topTree.prove(EpochNumber(1), 1, challenges, [topTreeData]); + expect(result.proof).toBeDefined(); + expect(result.publicInputs).toBeDefined(); + } finally { + await topTree.stop(); + } + }); + + it('produces an epoch proof for a checkpoint emitting L2-to-L1 messages', async () => { + // L2-to-L1 messages feed the epoch out-hash assembled at the top tree (A-1039). + const { fixture, topTreeData } = await driveSubTree(1, 1, 0, 2); + expect(fixture.blocks[0].txs[0].txEffect.l2ToL1Msgs.length).toBe(2); + const challenges = await context.getFinalBlobChallenges(); + + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); + try { + const result = await topTree.prove(EpochNumber(1), 1, challenges, [topTreeData]); + expect(result.proof).toBeDefined(); + expect(result.publicInputs).toBeDefined(); + + // The messages flow all the way through to the epoch out-hash on the root-rollup proof. + const messagesPerEpoch = [fixture.blocks.map(b => b.txs.map(tx => tx.txEffect.l2ToL1Msgs))]; + const expectedEpochOutHash = computeEpochOutHash(messagesPerEpoch); + expect(expectedEpochOutHash.isZero()).toBe(false); // sanity: the fixture really did carry messages + expect(result.publicInputs.outHash).toEqual(expectedEpochOutHash); + } finally { + await topTree.stop(); + } + }); + it('pipelines: starts ckpt0 root rollup before ckpt1 sub-tree resolves', async () => { // Drive both sub-trees synchronously (still no top tree running). const a = await driveSubTree(1, 1); @@ -112,7 +169,7 @@ describe('prover/orchestrator/top-tree', () => { const deferred = promiseWithResolvers ? 
T : never>(); const ckpt1 = { ...b.topTreeData, blockProofs: deferred.promise } as CheckpointTopTreeData; - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); try { // Top tree proves in the background; it should be able to advance ckpt0's root // rollup before we resolve ckpt1's promise. @@ -139,7 +196,7 @@ describe('prover/orchestrator/top-tree', () => { const stuck = new Promise ? T : never>(() => {}); const stuckData = { ...topTreeData, blockProofs: stuck } as CheckpointTopTreeData; - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); const provePromise = topTree.prove(EpochNumber(1), 1, challenges, [stuckData]); // Yield then cancel. @@ -159,7 +216,7 @@ describe('prover/orchestrator/top-tree', () => { const { topTreeData } = await driveSubTree(1, 1); const challenges = await context.getFinalBlobChallenges(); - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); topTree.cancel({ abortJobs: true }); let actual: unknown; @@ -176,7 +233,7 @@ describe('prover/orchestrator/top-tree', () => { const { topTreeData } = await driveSubTree(1, 1); const challenges = await context.getFinalBlobChallenges(); - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); try { const first = topTree.prove(EpochNumber(1), 1, challenges, [topTreeData]); // Second call before first settles should throw synchronously inside the function @@ -187,11 +244,86 @@ describe('prover/orchestrator/top-tree', () => { } }); + it('rejects (does not hang) when building checkpoint-root inputs 
fails', async () => { + // A-1036: if input-building throws (bad block proof, blob-hint failure, etc.) the failure + // must reach state.reject(). Otherwise the completion promise never settles and prove() hangs. + const { topTreeData } = await driveSubTree(1, 1); + const challenges = await context.getFinalBlobChallenges(); + + // A malformed block proof makes toProofData (inside buildCheckpointRootInputs) throw. + const badData = { ...topTreeData, blockProofs: Promise.resolve([{} as any]) } as CheckpointTopTreeData; + + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); + try { + const provePromise = topTree.prove(EpochNumber(1), 1, challenges, [badData]); + const hung = Symbol('hung'); + const outcome = await Promise.race([ + provePromise.then( + () => 'resolved' as const, + err => err, + ), + sleep(5000).then(() => hung), + ]); + expect(outcome).not.toBe(hung); + expect(outcome).toBeInstanceOf(Error); + expect((outcome as Error).message).toMatch(/checkpoint root inputs/i); + } finally { + topTree.cancel({ abortJobs: true }); + await topTree.stop(); + } + }); + + it('surfaces a genuine proving failure even when a cancel races in', async () => { + // A-1035: a real failure rejects the completion promise first, then a reorg cancel arrives + // before prove()'s catch observes it. The genuine error must survive, not be masked as + // TopTreeCancelledError. + const { topTreeData } = await driveSubTree(1, 1); + const challenges = await context.getFinalBlobChallenges(); + + const deferred = promiseWithResolvers ? T : never>(); + // Observe exactly when prove() attaches its blockProofs handler, so we can sequence the + // genuine rejection and the cancel deterministically rather than racing a fixed timeout. 
+ let handlerAttached = false; + const observableBlockProofs = { + then: (onF: any, onR: any) => { + handlerAttached = true; + return deferred.promise.then(onF, onR); + }, + }; + const failingData = { ...topTreeData, blockProofs: observableBlockProofs as any } as CheckpointTopTreeData; + + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); + const provePromise = topTree.prove(EpochNumber(1), 1, challenges, [failingData]); + + // Wait until prove() has finished its pre-loop setup and registered the blockProofs handler. + await retryUntil(() => handlerAttached, 'prove() attaches blockProofs handler', 5, 0.005); + + // Register a cancel reaction on the rejection, after prove()'s own handler (registered + // first, so it runs first). On rejection the ordering is: prove's handler rejects the + // completion promise with the genuine error → our reaction cancels → prove's catch runs. + // That places the cancel in the exact one-microtask window where A-1035's masking occurs. + const cancelOnReject = deferred.promise.catch(() => topTree.cancel({ abortJobs: true })); + + // Genuine failure rejects the completion promise while cancelled is still false. 
+ deferred.reject(new Error('REAL CIRCUIT FAILURE')); + await cancelOnReject; + + const err = await provePromise.then( + () => undefined, + e => e, + ); + expect(err).toBeInstanceOf(Error); + expect(err).not.toBeInstanceOf(TopTreeCancelledError); + expect((err as Error).message).toContain('REAL CIRCUIT FAILURE'); + + await topTree.stop(); + }); + it('rejects when checkpointData length disagrees with totalNumCheckpoints', async () => { const { topTreeData } = await driveSubTree(1, 1); const challenges = await context.getFinalBlobChallenges(); - const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, 10); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); try { await expect(topTree.prove(EpochNumber(1), 2, challenges, [topTreeData])).rejects.toThrow( /does not match totalNumCheckpoints/, diff --git a/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.ts b/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.ts index c0479d15d8cd..5c3d9e66bc39 100644 --- a/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.ts +++ b/yarn-project/prover-client/src/orchestrator/top-tree-orchestrator.ts @@ -13,8 +13,9 @@ import { BLS12Point } from '@aztec/foundation/curves/bls12'; import { Fr } from '@aztec/foundation/curves/bn254'; import type { LoggerBindings } from '@aztec/foundation/log'; import { promiseWithResolvers } from '@aztec/foundation/promise'; +import type { SerialQueue } from '@aztec/foundation/queue'; import type { Tuple } from '@aztec/foundation/serialize'; -import { MerkleTreeCalculator, shaMerkleHash } from '@aztec/foundation/trees'; +import { MerkleTreeCalculator, type TreeNodeLocation, shaMerkleHash } from '@aztec/foundation/trees'; import type { EthAddress } from '@aztec/stdlib/block'; import type { PublicInputsAndRecursiveProof, ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; import { computeCheckpointOutHash } from 
'@aztec/stdlib/messaging'; @@ -31,7 +32,7 @@ import type { BlockHeader } from '@aztec/stdlib/tx'; import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-client'; import { buildBlobHints, toProofData } from './block-building-helpers.js'; -import { TopTreeProvingScheduler } from './top-tree-proving-scheduler.js'; +import { ProvingScheduler } from './proving-scheduler.js'; import { TopTreeProvingState } from './top-tree-proving-state.js'; /** Per-checkpoint data fed into the top tree. */ @@ -86,18 +87,18 @@ type OutHashHint = { * and each checkpoint's root rollup fires the moment its sub-tree's `blockProofs` * promise resolves. Later checkpoints can still be block-level proving in parallel. */ -export class TopTreeOrchestrator extends TopTreeProvingScheduler { +export class TopTreeOrchestrator extends ProvingScheduler { private state: TopTreeProvingState | undefined; private cancelled = false; constructor( - prover: ServerCircuitProver, + protected readonly prover: ServerCircuitProver, private readonly proverId: EthAddress, - enqueueConcurrency: number, + deferredJobQueue: SerialQueue, _telemetryClient: TelemetryClient = getTelemetryClient(), bindings?: LoggerBindings, ) { - super(prover, enqueueConcurrency, 'prover-client:top-tree-orchestrator', bindings); + super(deferredJobQueue, 'prover-client:top-tree-orchestrator', bindings); } public getProverId(): EthAddress { @@ -133,10 +134,9 @@ export class TopTreeOrchestrator extends TopTreeProvingScheduler { } const { promise: completionPromise, resolve, reject } = promiseWithResolvers(); - // The completion promise is awaited inside the try/catch below. Attach a no-op catch - // here as well so any spurious unhandled-rejection detection during cancellation - // (where reject() can fire synchronously before the await microtask installs a handler) - // is silenced. + // The completion promise is awaited below. 
Attach a no-op catch here as well so any + // spurious unhandled-rejection detection during cancellation (where reject() can fire + // synchronously before the await microtask installs a handler) is silenced. completionPromise.catch(() => {}); const startBlobAccumulator = BatchedBlobAccumulator.newWithChallenges(finalBlobBatchingChallenges); @@ -189,16 +189,13 @@ export class TopTreeOrchestrator extends TopTreeProvingScheduler { ); } - try { - await completionPromise; - await this.state.finalizeBatchedBlob(); - return this.state.getEpochProofResult(); - } catch (err: any) { - if (this.cancelled) { - throw new TopTreeCancelledError(); - } - throw err; - } + // The error type is stamped atomically at rejection time by the rejectionCallback above + // (TopTreeCancelledError iff cancel() drove the rejection). Re-deriving it here from the + // live `cancelled` flag would mask a genuine failure that lost a race with a late cancel + // (A-1035), so let the original error propagate untouched. + await completionPromise; + await this.state.finalizeBatchedBlob(); + return this.state.getEpochProofResult(); } /** @@ -220,10 +217,6 @@ export class TopTreeOrchestrator extends TopTreeProvingScheduler { // --- internal: per-checkpoint enqueue path --- - protected override onRootRollupComplete(state: TopTreeProvingState) { - state.resolve(); - } - private enqueueCheckpointRoot( state: TopTreeProvingState, checkpointIndex: number, @@ -235,26 +228,36 @@ export class TopTreeOrchestrator extends TopTreeProvingScheduler { outHashHint: OutHashHint, startBlobAccumulator: BatchedBlobAccumulator, ) { - void this.buildCheckpointRootInputs(blockProofs, cd, outHashHint, startBlobAccumulator).then(inputs => { - this.deferredProving( - state, - signal => { - if (inputs instanceof CheckpointRootSingleBlockRollupPrivateInputs) { - return this.prover.getCheckpointRootSingleBlockRollupProof(inputs, signal, state.epochNumber); - } - return this.prover.getCheckpointRootRollupProof(inputs, signal, 
state.epochNumber); - }, - result => { - this.logger.debug(`Completed checkpoint root proof for checkpoint ${checkpointIndex}`); - const leafLocation = state.setCheckpointRootRollupProof(checkpointIndex, result); - if (state.totalNumCheckpoints === 1) { - this.enqueueEpochPadding(state); - } else { - this.checkAndEnqueueNextCheckpointMergeRollup(state, leafLocation); - } - }, - ); - }); + void this.buildCheckpointRootInputs(blockProofs, cd, outHashHint, startBlobAccumulator).then( + inputs => { + this.deferredProving( + state, + signal => { + if (inputs instanceof CheckpointRootSingleBlockRollupPrivateInputs) { + return this.prover.getCheckpointRootSingleBlockRollupProof(inputs, signal, state.epochNumber); + } + return this.prover.getCheckpointRootRollupProof(inputs, signal, state.epochNumber); + }, + result => { + this.logger.debug(`Completed checkpoint root proof for checkpoint ${checkpointIndex}`); + const leafLocation = state.setCheckpointRootRollupProof(checkpointIndex, result); + if (state.totalNumCheckpoints === 1) { + this.enqueueEpochPadding(state); + } else { + this.checkAndEnqueueNextCheckpointMergeRollup(state, leafLocation); + } + }, + ); + }, + // Without this, an input-building failure rejects a discarded promise, state.reject() is + // never called, and prove() hangs forever on its completion promise (A-1036). 
+ err => { + if (this.cancelled) { + return; + } + state.reject(`Building checkpoint root inputs for checkpoint ${checkpointIndex} failed: ${err}`); + }, + ); } private async buildCheckpointRootInputs( @@ -286,6 +289,73 @@ export class TopTreeOrchestrator extends TopTreeProvingScheduler { : new CheckpointRootRollupPrivateInputs([proofDatas[0], proofDatas[1]], hints); } + // --- internal: top-tree proof orchestration (formerly TopTreeProvingScheduler) --- + + private enqueueCheckpointMergeRollup(state: TopTreeProvingState, location: TreeNodeLocation) { + if (!state.verifyState() || !state.tryStartProvingCheckpointMerge(location)) { + return; + } + const inputs = state.getCheckpointMergeRollupInputs(location); + this.deferredProving( + state, + signal => this.prover.getCheckpointMergeRollupProof(inputs, signal, state.epochNumber), + result => { + state.setCheckpointMergeRollupProof(location, result); + this.checkAndEnqueueNextCheckpointMergeRollup(state, location); + }, + ); + } + + private enqueueEpochPadding(state: TopTreeProvingState) { + if (!state.verifyState() || !state.tryStartProvingPaddingCheckpoint()) { + return; + } + const inputs = state.getPaddingCheckpointInputs(); + this.deferredProving( + state, + signal => this.prover.getCheckpointPaddingRollupProof(inputs, signal, state.epochNumber), + result => { + state.setCheckpointPaddingProof(result); + this.checkAndEnqueueRootRollup(state); + }, + ); + } + + private enqueueRootRollup(state: TopTreeProvingState) { + if (!state.verifyState() || !state.tryStartProvingRootRollup()) { + return; + } + const inputs = state.getRootRollupInputs(); + this.deferredProving( + state, + signal => this.prover.getRootRollupProof(inputs, signal, state.epochNumber), + result => { + this.logger.verbose(`Completed root rollup for epoch ${state.epochNumber}`); + state.setRootRollupProof(result); + state.resolve(); + }, + ); + } + + private checkAndEnqueueNextCheckpointMergeRollup(state: TopTreeProvingState, currentLocation: 
TreeNodeLocation) { + if (!state.isReadyForCheckpointMerge(currentLocation)) { + return; + } + const parentLocation = state.getParentLocation(currentLocation); + if (parentLocation.level === 0) { + this.checkAndEnqueueRootRollup(state); + } else { + this.enqueueCheckpointMergeRollup(state, parentLocation); + } + } + + private checkAndEnqueueRootRollup(state: TopTreeProvingState) { + if (!state.isReadyForRootRollup()) { + return; + } + this.enqueueRootRollup(state); + } + private async computeOutHashHints(checkpointData: CheckpointTopTreeData[]): Promise { const treeCalculator = await MerkleTreeCalculator.create(OUT_HASH_TREE_HEIGHT, undefined, (left, right) => Promise.resolve(shaMerkleHash(left, right)), diff --git a/yarn-project/prover-client/src/orchestrator/top-tree-proving-scheduler.ts b/yarn-project/prover-client/src/orchestrator/top-tree-proving-scheduler.ts deleted file mode 100644 index 0d65cd2b1b62..000000000000 --- a/yarn-project/prover-client/src/orchestrator/top-tree-proving-scheduler.ts +++ /dev/null @@ -1,154 +0,0 @@ -import type { NESTED_RECURSIVE_PROOF_LENGTH, NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH } from '@aztec/constants'; -import type { EpochNumber } from '@aztec/foundation/branded-types'; -import type { LoggerBindings } from '@aztec/foundation/log'; -import type { TreeNodeLocation } from '@aztec/foundation/trees'; -import type { PublicInputsAndRecursiveProof, ServerCircuitProver } from '@aztec/stdlib/interfaces/server'; -import type { - CheckpointMergeRollupPrivateInputs, - CheckpointPaddingRollupPrivateInputs, - CheckpointRollupPublicInputs, - RootRollupPrivateInputs, - RootRollupPublicInputs, -} from '@aztec/stdlib/rollup'; - -import { ProvingScheduler, type ProvingStateLike } from './proving-scheduler.js'; - -type CheckpointRollupProof = PublicInputsAndRecursiveProof< - CheckpointRollupPublicInputs, - typeof NESTED_RECURSIVE_ROLLUP_HONK_PROOF_LENGTH ->; - -type RootRollupProof = PublicInputsAndRecursiveProof; - -/** - * State interface 
required by the top-tree proving drivers (checkpoint-merge → padding → - * root-rollup). Both `EpochProvingState` and `TopTreeProvingState` satisfy it structurally; - * the per-checkpoint state in `EpochProvingState` (block/tx proving, world-state forks) - * is owned outside this surface. - */ -export interface TopTreeStateLike extends ProvingStateLike { - readonly epochNumber: EpochNumber; - readonly totalNumCheckpoints: number; - - tryStartProvingCheckpointMerge(location: TreeNodeLocation): boolean; - setCheckpointMergeRollupProof(location: TreeNodeLocation, provingOutput: CheckpointRollupProof): void; - isReadyForCheckpointMerge(location: TreeNodeLocation): boolean; - getParentLocation(location: TreeNodeLocation): TreeNodeLocation; - getCheckpointMergeRollupInputs(location: TreeNodeLocation): CheckpointMergeRollupPrivateInputs; - - tryStartProvingPaddingCheckpoint(): boolean; - setCheckpointPaddingProof(provingOutput: CheckpointRollupProof): void; - getPaddingCheckpointInputs(): CheckpointPaddingRollupPrivateInputs; - - tryStartProvingRootRollup(): boolean; - setRootRollupProof(provingOutput: RootRollupProof): void; - isReadyForRootRollup(): boolean; - getRootRollupInputs(): RootRollupPrivateInputs; -} - -/** - * Shared scheduling for the top-tree section of epoch proving — checkpoint-merge, - * padding (single-checkpoint case), and root rollup. Both `ProvingOrchestrator` and - * `TopTreeOrchestrator` extend this; their per-checkpoint-root drivers diverge (one - * drains state-derived inputs once block-merge is done, the other builds inputs from - * caller-supplied checkpoint data), but the rest of the tree is identical. - * - * Subclasses provide a `wrapCircuitCall` hook for telemetry (the orchestrator wraps - * each call in a span; the top-tree leaves it as identity), and an - * `onRootRollupComplete` hook to invoke the right shape of `state.resolve()` — - * `EpochProvingState.resolve` takes a `ProvingResult`, `TopTreeProvingState.resolve` - * is no-arg. 
- */ -export abstract class TopTreeProvingScheduler extends ProvingScheduler { - constructor( - protected readonly prover: ServerCircuitProver, - enqueueConcurrency: number, - loggerName?: string, - bindings?: LoggerBindings, - ) { - super(enqueueConcurrency, loggerName, bindings); - } - - /** - * Wraps a circuit call for telemetry. Default is identity; the orchestrator overrides - * to wrap with `wrapCallbackInSpan`. - */ - protected wrapCircuitCall( - _circuitName: string, - fn: (signal: AbortSignal) => Promise, - ): (signal: AbortSignal) => Promise { - return fn; - } - - /** Called once the root rollup proof has been set; subclasses call `state.resolve(...)` with the right shape. */ - protected abstract onRootRollupComplete(state: TopTreeStateLike): void; - - protected enqueueCheckpointMergeRollup(state: TopTreeStateLike, location: TreeNodeLocation) { - if (!state.verifyState() || !state.tryStartProvingCheckpointMerge(location)) { - return; - } - const inputs = state.getCheckpointMergeRollupInputs(location); - this.deferredProving( - state, - this.wrapCircuitCall('rollup-checkpoint-merge', signal => - this.prover.getCheckpointMergeRollupProof(inputs, signal, state.epochNumber), - ), - result => { - state.setCheckpointMergeRollupProof(location, result); - this.checkAndEnqueueNextCheckpointMergeRollup(state, location); - }, - ); - } - - protected enqueueEpochPadding(state: TopTreeStateLike) { - if (!state.verifyState() || !state.tryStartProvingPaddingCheckpoint()) { - return; - } - const inputs = state.getPaddingCheckpointInputs(); - this.deferredProving( - state, - this.wrapCircuitCall('rollup-checkpoint-padding', signal => - this.prover.getCheckpointPaddingRollupProof(inputs, signal, state.epochNumber), - ), - result => { - state.setCheckpointPaddingProof(result); - this.checkAndEnqueueRootRollup(state); - }, - ); - } - - protected enqueueRootRollup(state: TopTreeStateLike) { - if (!state.verifyState() || !state.tryStartProvingRootRollup()) { - return; - } - 
const inputs = state.getRootRollupInputs(); - this.deferredProving( - state, - this.wrapCircuitCall('rollup-root', signal => this.prover.getRootRollupProof(inputs, signal, state.epochNumber)), - result => { - this.logger.verbose(`Completed root rollup for epoch ${state.epochNumber}`); - state.setRootRollupProof(result); - this.onRootRollupComplete(state); - }, - ); - } - - protected checkAndEnqueueNextCheckpointMergeRollup(state: TopTreeStateLike, currentLocation: TreeNodeLocation) { - if (!state.isReadyForCheckpointMerge(currentLocation)) { - return; - } - const parentLocation = state.getParentLocation(currentLocation); - if (parentLocation.level === 0) { - this.checkAndEnqueueRootRollup(state); - } else { - this.enqueueCheckpointMergeRollup(state, parentLocation); - } - } - - protected checkAndEnqueueRootRollup(state: TopTreeStateLike) { - if (!state.isReadyForRootRollup()) { - return; - } - this.enqueueRootRollup(state); - } -} diff --git a/yarn-project/prover-client/src/orchestrator/top-tree-proving-state.ts b/yarn-project/prover-client/src/orchestrator/top-tree-proving-state.ts index add5b990fb5c..eef365d86b92 100644 --- a/yarn-project/prover-client/src/orchestrator/top-tree-proving-state.ts +++ b/yarn-project/prover-client/src/orchestrator/top-tree-proving-state.ts @@ -41,7 +41,6 @@ export class TopTreeProvingState { private endBlobAccumulator: BatchedBlobAccumulator | undefined; private finalBatchedBlob: BatchedBlob | undefined; private lifecycle = TOP_TREE_LIFECYCLE.CREATED; - constructor( public readonly epochNumber: EpochNumber, public readonly totalNumCheckpoints: number, diff --git a/yarn-project/prover-client/src/prover-client/prover-client.ts b/yarn-project/prover-client/src/prover-client/prover-client.ts index 727ed3858363..f8260d704fbc 100644 --- a/yarn-project/prover-client/src/prover-client/prover-client.ts +++ b/yarn-project/prover-client/src/prover-client/prover-client.ts @@ -4,10 +4,10 @@ import { times } from '@aztec/foundation/collection'; 
import type { Fr } from '@aztec/foundation/curves/bn254'; import type { EthAddress } from '@aztec/foundation/eth-address'; import { type Logger, createLogger } from '@aztec/foundation/log'; +import { SerialQueue } from '@aztec/foundation/queue'; import { NativeACVMSimulator } from '@aztec/simulator/server'; import { type ActualProverConfig, - type EpochProver, type EpochProverManager, type ForkMerkleTreeOperations, type ProvingJobBroker, @@ -23,13 +23,11 @@ import { type TelemetryClient, getTelemetryClient } from '@aztec/telemetry-clien import type { ProverClientConfig } from '../config.js'; import { CheckpointSubTreeOrchestrator } from '../orchestrator/checkpoint-sub-tree-orchestrator.js'; -import { EpochProvingContext } from '../orchestrator/epoch-proving-context.js'; -import { ProvingOrchestrator } from '../orchestrator/orchestrator.js'; +import type { ChonkCache } from '../orchestrator/chonk-cache.js'; import { TopTreeOrchestrator } from '../orchestrator/top-tree-orchestrator.js'; import { BrokerCircuitProverFacade } from '../proving_broker/broker_prover_facade.js'; import { InlineProofStore, type ProofStore, createProofStore } from '../proving_broker/proof_store/index.js'; import { ProvingAgent } from '../proving_broker/proving_agent.js'; -import { ServerEpochProver } from './server-epoch-prover.js'; /** * The factory surface that `EpochProvingJob` (in `prover-node`) depends on. Implemented @@ -49,14 +47,13 @@ import { ServerEpochProver } from './server-epoch-prover.js'; export interface EpochProverFactory { getProverId(): EthAddress; /** - * Constructs a per-epoch shared context for the caching of e.g. chonk verifier results - */ - createEpochProvingContext(epochNumber: EpochNumber): EpochProvingContext; - /** - * Constructs and starts a `CheckpointSubTreeOrchestrator` for a single checkpoint. + * Constructs and starts a `CheckpointSubTreeOrchestrator` for a single checkpoint + * against the supplied shared `chonkCache`. 
The cache is owned by the prover-node + * and survives across epochs / sessions. */ createCheckpointSubTreeOrchestrator( - epochContext: EpochProvingContext, + chonkCache: ChonkCache, + epochNumber: EpochNumber, checkpointConstants: CheckpointConstantData, l1ToL2Messages: Fr[], totalNumBlocks: number, @@ -75,6 +72,13 @@ export class ProverClient implements EpochProverManager, EpochProverFactory { * `EpochProverFactory` for why a single shared facade is required. */ private facade: BrokerCircuitProverFacade | undefined; + /** + * Single deferred-proving-job queue shared across every orchestrator (sub-trees and + * top-trees, across every concurrent epoch session). Throttles the total rate of job + * submission to the broker once, rather than once per orchestrator. Started lazily + * alongside the facade and cancelled on `stop()`. + */ + private deferredJobQueue: SerialQueue | undefined; private constructor( private config: ProverClientConfig, @@ -112,40 +116,21 @@ export class ProverClient implements EpochProverManager, EpochProverFactory { return this.facade; } - /** - * Legacy single-class epoch prover. Each call constructs its own - * `BrokerCircuitProverFacade`; the new factory methods (`createCheckpointSubTreeOrchestrator`, - * `createTopTreeOrchestrator`, `createEpochProvingContext`) share a single facade - * owned by `ProverClient`. Both APIs coexist while the prover-node migrates onto - * the new pair. 
- */ - public createEpochProver(): EpochProver { - const bindings = this.log.getBindings(); - const facade = new BrokerCircuitProverFacade( - this.orchestratorClient, - this.proofStore, - this.failedProofStore, - undefined, - bindings, - ); - const orchestrator = new ProvingOrchestrator( - this.worldState, - facade, - this.config.proverId, - this.config.cancelJobsOnStop, - this.config.enqueueConcurrency, - this.telemetry, - bindings, - ); - return new ServerEpochProver(facade, orchestrator); - } - - public createEpochProvingContext(epochNumber: EpochNumber): EpochProvingContext { - return new EpochProvingContext(this.getFacade(), epochNumber, this.log.getBindings()); + /** Lazy-init the shared deferred-job queue, started with the configured enqueue concurrency. */ + private getDeferredJobQueue(): SerialQueue { + if (!this.running) { + throw new Error('ProverClient is not running; call start() before constructing orchestrators.'); + } + if (!this.deferredJobQueue) { + this.deferredJobQueue = new SerialQueue(); + this.deferredJobQueue.start(this.config.enqueueConcurrency); + } + return this.deferredJobQueue; } public createCheckpointSubTreeOrchestrator( - epochContext: EpochProvingContext, + chonkCache: ChonkCache, + epochNumber: EpochNumber, checkpointConstants: CheckpointConstantData, l1ToL2Messages: Fr[], totalNumBlocks: number, @@ -155,9 +140,10 @@ export class ProverClient implements EpochProverManager, EpochProverFactory { this.worldState, this.getFacade(), this.config.proverId, - epochContext, + chonkCache, + epochNumber, this.config.cancelJobsOnStop, - this.config.enqueueConcurrency, + this.getDeferredJobQueue(), checkpointConstants, l1ToL2Messages, totalNumBlocks, @@ -171,7 +157,7 @@ export class ProverClient implements EpochProverManager, EpochProverFactory { return new TopTreeOrchestrator( this.getFacade(), this.config.proverId, - this.config.enqueueConcurrency, + this.getDeferredJobQueue(), this.telemetry, this.log.getBindings(), ); @@ -216,12 +202,12 @@ 
export class ProverClient implements EpochProverManager, EpochProverFactory { } this.running = false; await this.stopAgents(); + if (this.deferredJobQueue) { + await this.deferredJobQueue.cancel(); + this.deferredJobQueue = undefined; + } if (this.facade) { - try { - await this.facade.stop(); - } catch (err) { - this.log.error('Error stopping shared broker facade', err); - } + await tryStop(this.facade, this.log); this.facade = undefined; } await tryStop(this.orchestratorClient); diff --git a/yarn-project/prover-client/src/prover-client/server-epoch-prover.ts b/yarn-project/prover-client/src/prover-client/server-epoch-prover.ts deleted file mode 100644 index dd1715757d6c..000000000000 --- a/yarn-project/prover-client/src/prover-client/server-epoch-prover.ts +++ /dev/null @@ -1,69 +0,0 @@ -import type { BatchedBlob, FinalBlobBatchingChallenges } from '@aztec/blob-lib/types'; -import { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import type { Fr } from '@aztec/foundation/curves/bn254'; -import type { EthAddress } from '@aztec/stdlib/block'; -import type { EpochProver } from '@aztec/stdlib/interfaces/server'; -import type { Proof } from '@aztec/stdlib/proofs'; -import type { CheckpointConstantData, RootRollupPublicInputs } from '@aztec/stdlib/rollup'; -import type { BlockHeader, ProcessedTx, Tx } from '@aztec/stdlib/tx'; -import type { UInt64 } from '@aztec/stdlib/types'; - -import type { ProvingOrchestrator } from '../orchestrator/orchestrator.js'; -import type { BrokerCircuitProverFacade } from '../proving_broker/broker_prover_facade.js'; - -/** Encapsulates the proving orchestrator and the broker facade */ -export class ServerEpochProver implements EpochProver { - constructor( - private facade: BrokerCircuitProverFacade, - private orchestrator: ProvingOrchestrator, - ) {} - - startNewEpoch( - epochNumber: EpochNumber, - totalNumCheckpoints: number, - finalBlobBatchingChallenges: FinalBlobBatchingChallenges, - ): void { - 
this.orchestrator.startNewEpoch(epochNumber, totalNumCheckpoints, finalBlobBatchingChallenges); - this.facade.start(); - } - startNewCheckpoint( - checkpointIndex: number, - constants: CheckpointConstantData, - l1ToL2Messages: Fr[], - totalNumBlocks: number, - headerOfLastBlockInPreviousCheckpoint: BlockHeader, - ): Promise { - return this.orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - totalNumBlocks, - headerOfLastBlockInPreviousCheckpoint, - ); - } - startChonkVerifierCircuits(txs: Tx[]): Promise { - return this.orchestrator.startChonkVerifierCircuits(txs); - } - setBlockCompleted(blockNumber: BlockNumber, expectedBlockHeader?: BlockHeader): Promise { - return this.orchestrator.setBlockCompleted(blockNumber, expectedBlockHeader); - } - finalizeEpoch(): Promise<{ publicInputs: RootRollupPublicInputs; proof: Proof; batchedBlobInputs: BatchedBlob }> { - return this.orchestrator.finalizeEpoch(); - } - cancel(): void { - this.orchestrator.cancel(); - } - getProverId(): EthAddress { - return this.orchestrator.getProverId(); - } - async stop(): Promise { - await this.facade.stop(); - await this.orchestrator.stop(); - } - startNewBlock(blockNumber: BlockNumber, timestamp: UInt64, totalNumTxs: number): Promise { - return this.orchestrator.startNewBlock(blockNumber, timestamp, totalNumTxs); - } - addTxs(txs: ProcessedTx[]): Promise { - return this.orchestrator.addTxs(txs); - } -} diff --git a/yarn-project/prover-client/src/test/bb_prover_full_rollup.test.ts b/yarn-project/prover-client/src/test/bb_prover_full_rollup.test.ts index bc66a883cc0c..db00fb35bbfb 100644 --- a/yarn-project/prover-client/src/test/bb_prover_full_rollup.test.ts +++ b/yarn-project/prover-client/src/test/bb_prover_full_rollup.test.ts @@ -3,12 +3,16 @@ import { NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP, PAIRING_POINTS_SIZE } from '@aztec import { EpochNumber } from '@aztec/foundation/branded-types'; import { timesAsync } from '@aztec/foundation/collection'; import { 
parseBooleanEnv } from '@aztec/foundation/config'; +import { EthAddress } from '@aztec/foundation/eth-address'; import { type Logger, createLogger } from '@aztec/foundation/log'; import { getTestData, isGenerateTestDataEnabled } from '@aztec/foundation/testing'; import { writeTestData } from '@aztec/foundation/testing/files'; import { getTelemetryClient } from '@aztec/telemetry-client'; -import { TestContext } from '../mocks/test_context.js'; +import { TestContext, makeTestDeferredJobQueue } from '../mocks/test_context.js'; +import { CheckpointSubTreeOrchestrator } from '../orchestrator/checkpoint-sub-tree-orchestrator.js'; +import { ChonkCache } from '../orchestrator/chonk-cache.js'; +import { type CheckpointTopTreeData, TopTreeOrchestrator } from '../orchestrator/top-tree-orchestrator.js'; describe('prover/bb_prover/full-rollup', () => { const FAKE_PROOFS = parseBooleanEnv(process.env.FAKE_PROOFS); @@ -53,49 +57,80 @@ describe('prover/bb_prover/full-rollup', () => { ); const finalBlobChallenges = await context.getFinalBlobChallenges(); - context.orchestrator.startNewEpoch(EpochNumber(1), numCheckpoints, finalBlobChallenges); - - for (let checkpointIndex = 0; checkpointIndex < numCheckpoints; checkpointIndex++) { - const { constants, blocks, l1ToL2Messages, previousBlockHeader } = checkpoints[checkpointIndex]; - - log.info(`Starting new checkpoint #${checkpointIndex}`); - await context.orchestrator.startNewCheckpoint( - checkpointIndex, - constants, - l1ToL2Messages, - EpochNumber(1), - previousBlockHeader, - ); - - for (let i = 0; i < numBlockPerCheckpoint; i++) { - const { header, txs } = blocks[i]; - const { blockNumber, timestamp } = header.globalVariables; - - log.info(`Starting new block #${blockNumber}`); - await context.orchestrator.startNewBlock(blockNumber, timestamp, txs.length); - await context.orchestrator.addTxs(txs); - - log.info(`Setting block as completed`); - await context.orchestrator.setBlockCompleted(blockNumber, header); + const chonkCache = 
new ChonkCache(); + const subTrees: CheckpointSubTreeOrchestrator[] = []; + const topTreeData: CheckpointTopTreeData[] = []; + + try { + // Drive each checkpoint through its own sub-tree, mirroring the production + // CheckpointProver flow. The top tree starts proving as each sub-tree completes. + for (let checkpointIndex = 0; checkpointIndex < numCheckpoints; checkpointIndex++) { + const { constants, blocks, l1ToL2Messages, previousBlockHeader, checkpoint } = checkpoints[checkpointIndex]; + + log.info(`Starting new checkpoint #${checkpointIndex}`); + const subTree = await CheckpointSubTreeOrchestrator.start( + context.worldState, + context.prover, + EthAddress.ZERO, + chonkCache, + EpochNumber(1), + /* cancelJobsOnStop */ false, + makeTestDeferredJobQueue(), + constants, + l1ToL2Messages, + numBlockPerCheckpoint, + previousBlockHeader, + ); + subTrees.push(subTree); + + for (let i = 0; i < numBlockPerCheckpoint; i++) { + const { header, txs } = blocks[i]; + const { blockNumber, timestamp } = header.globalVariables; + + log.info(`Starting new block #${blockNumber}`); + await subTree.startNewBlock(blockNumber, timestamp, txs.length); + if (txs.length > 0) { + await subTree.addTxs(txs); + } + + log.info(`Setting block as completed`); + await subTree.setBlockCompleted(blockNumber, header); + } + + topTreeData.push({ + blockProofs: subTree.getSubTreeResult().then(r => r.blockProofOutputs), + l2ToL1MsgsPerBlock: blocks.map(b => b.txs.map(tx => tx.txEffect.l2ToL1Msgs)), + blobFields: checkpoint.toBlobFields(), + previousBlockHeader, + previousArchiveSiblingPath: subTree.getPreviousArchiveSiblingPath(), + }); } - } - log.info(`Awaiting proofs`); - const epochResult = await context.orchestrator.finalizeEpoch(); + log.info(`Awaiting top-tree proof`); + const topTree = new TopTreeOrchestrator(context.prover, EthAddress.ZERO, makeTestDeferredJobQueue()); + let epochResult; + try { + epochResult = await topTree.prove(EpochNumber(1), numCheckpoints, finalBlobChallenges, 
topTreeData); + } finally { + await topTree.stop(); + } - if (prover) { - // TODO(https://github.com/AztecProtocol/aztec-packages/issues/13188): Handle the pairing point object without these hacks. - epochResult.proof.numPublicInputs -= PAIRING_POINTS_SIZE; - await expect(prover.verifyProof('RootRollupArtifact', epochResult.proof)).resolves.not.toThrow(); - } + if (prover) { + // TODO(https://github.com/AztecProtocol/aztec-packages/issues/13188): Handle the pairing point object without these hacks. + epochResult.proof.numPublicInputs -= PAIRING_POINTS_SIZE; + await expect(prover.verifyProof('RootRollupArtifact', epochResult.proof)).resolves.not.toThrow(); + } - // Generate test data for the 1/1 blocks epoch scenario - if (numCheckpoints === 1 && numBlockPerCheckpoint === 1 && isGenerateTestDataEnabled()) { - const epochProof = getTestData('epochProofResult').at(-1); - writeTestData( - 'yarn-project/end-to-end/src/fixtures/dumps/epoch_proof_result.json', - JSON.stringify(epochProof!), - ); + // Generate test data for the 1/1 blocks epoch scenario. + if (numCheckpoints === 1 && numBlockPerCheckpoint === 1 && isGenerateTestDataEnabled()) { + const epochProof = getTestData('epochProofResult').at(-1); + writeTestData( + 'yarn-project/end-to-end/src/fixtures/dumps/epoch_proof_result.json', + JSON.stringify(epochProof!), + ); + } + } finally { + await Promise.all(subTrees.map(s => s.stop())); } }, FAKE_PROOFS ? undefined : 900_000, diff --git a/yarn-project/prover-node/README.md b/yarn-project/prover-node/README.md index e31824c879af..2e985f8cbab3 100644 --- a/yarn-project/prover-node/README.md +++ b/yarn-project/prover-node/README.md @@ -1 +1,507 @@ # Prover Node + +The prover node turns sequenced checkpoints into epoch proofs that get submitted to the L1 +rollup contract. 
It runs alongside an Aztec validator/full-node and consumes the +canonical chain view those nodes emit, proving epochs **optimistically** — sub-tree +work begins the moment a checkpoint lands on L1, not when the epoch closes. + +This document describes the internal architecture: the state held by the prover-node, +the events that drive it, and the data flow from a fresh `chain-checkpointed` event +through to a `submitEpochRootProof` on L1. + +## Contents + +1. [Architecture](#architecture) +2. [CheckpointProver lifecycle](#checkpointprover-lifecycle) +3. [EpochSession lifecycle](#epochsession-lifecycle) +4. [ProofPublishingService](#proofpublishingservice) +5. [Event flow](#event-flow) +6. [Walkthroughs](#walkthroughs) +7. [Design rationale](#design-rationale) +8. [Configuration](#configuration) +9. [Failure handling and observability](#failure-handling-and-observability) + +## Architecture + +```mermaid +flowchart TB + L2BlockStream -->|chain-checkpointed| ProverNode + L2BlockStream -->|chain-pruned| ProverNode + L2BlockStream -->|chain-proven| ProverNode + L2BlockStream -->|any event| ProverNode + ProverNode --> CheckpointStore + ProverNode --> ChonkCache + ProverNode --> SessionManager + ProverNode --> ProofPublishingService + SessionManager --> EpochTicker[(periodic tick)] + SessionManager --> FullSessions[(fullSessions)] + SessionManager --> PartialSessions[(partialSessions)] + CheckpointStore --> SlotWatcher + FullSessions -.referenced checkpoints.-> CheckpointStore + PartialSessions -.referenced checkpoints.-> CheckpointStore + FullSessions --> TopTreeJob + PartialSessions --> TopTreeJob + TopTreeJob -->|PublishCandidate| ProofPublishingService + ProofPublishingService -->|fresh per publish| ProverNodePublisher + ProverNodePublisher --> L1[L1 Rollup] +``` + +The prover-node splits responsibility between four classes: + +- **`ProverNode`** — owns the long-lived collections, wires the L2BlockStream, and + translates each chain event into a single method call on the `SessionManager` or + `ProofPublishingService`.
It also performs the per-event side effects that don't + belong on an `EpochSession` (registering new checkpoints with the store, sweeping + expired epochs out of the cache and the store, etc.) and runs the failure-upload + action when an `EpochSession` exits with `failed`. +- **`CheckpointStore`** — a registry of `CheckpointProver` instances keyed by + `(checkpointNumber, slot, archiveRoot)`. Each `CheckpointProver` runs its own sub-tree pipeline + (tx gather → block processing → block-rollup proofs), starting eagerly the moment a + checkpoint is registered. The store is the single source of canonical-vs-pruned + checkpoint content that `EpochSession`s query when assembling their subsets. +- **`SessionManager`** — owns every live `EpochSession`, the serial reconcile queue, + the periodic tick, and all `EpochSession` lifecycle decisions. `ProverNode` calls into it + via `onCheckpointAdded`, `onPrune`, and `startProof`. Every trigger it receives is + translated into a `reconcile(trigger)` call, a single idempotent function that walks + all `EpochSession`s, cancels any whose canonical content has shifted, re-creates them with + the new content, and opens fresh full `EpochSession`s for any epoch that has become provable. + Reconcile runs on a `SerialQueue` (from `@aztec/foundation/queue`), so two concurrent + triggers can never interleave on an `await` and race on the `EpochSession` maps. +- **`ProofPublishingService`** — central owner of L1 proof submission. `EpochSession`s hand + their top-tree proofs to the service as `PublishCandidate`s; the service serialises + one publish at a time against a freshly-created `ProverNodePublisher`, gates eligibility + on the proven block tip, picks the longest candidate per epoch as the winner + (others resolve `'superseded'`), and enforces a per-candidate `deadline`. 
It runs its + own `drain()` on a separate `SerialQueue`: submits, withdrawals, chain-proven advances, + and per-candidate deadline expiries all enqueue a drain pass, so the eligibility + re-check and the L1 publish never interleave with each other. + +## CheckpointProver lifecycle + +A `CheckpointProver` is content-addressed by `(checkpoint.number, slot, archiveRoot)`, +where `archiveRoot` is the checkpoint's own archive root (its post-state). Keying on the +post-state makes the identity precise: two checkpoints are "the same" iff they produce +the same archive — so a reorg branch, or a replacement built on the same predecessor but +with different content, yields a different archive root and a distinct `CheckpointProver`, while an +identical re-add collapses to the same `CheckpointProver` and reuses its in-flight work. + +```mermaid +stateDiagram-v2 + [*] --> Created + Created --> Proving: gather + execute + Proving --> Proven: sub-tree resolves blockProofs + Proving --> Cancelled: cancel() + Proven --> Reaped: reapExpired(epoch) + Cancelled --> [*] + + state "Pruned (side)" as Pruned + Proving --> Pruned: markPruned() + Pruned --> Proving: markCanonical() + Proven --> Pruned: markPruned() + Pruned --> Reaped: SlotWatcher (slot < syncedSlot) +``` + +The **`Pruned`** state is a side flag, not a place in the main lifecycle: sub-tree +proving keeps running underneath, so a brief reorg that prunes and immediately +re-adds the same checkpoint avoids any re-proving. The flag only gates *eligibility* +to be included in an `EpochSession` — `EpochSession`s ask the store for *canonical* (non-pruned) +checkpoints when assembling their subsets. + +### Reaping rules + +- **Pruned**: the `SlotWatcher` (a `RunningPromise` polling + `l2BlockSource.getSyncedL2SlotNumber`) reaps a pruned `CheckpointProver` when the chain's + synced slot has moved past the `CheckpointProver`'s slot. Once the chain is past that slot, + a re-add with the same content is impossible. 
+- **Canonical**: `CheckpointStore.reapExpired(expiredEpoch)` drops any canonical + `CheckpointProver` whose epoch is at or below the supplied expired epoch. Once an epoch's + proof-submission window has closed, its proof can no longer be accepted on L1, + so the `CheckpointProver` is no longer needed. +- **Cancelled**: removed immediately by whichever path called `cancel()` (store + shutdown, prune past-slot, `EpochSession` error). + +### Eager tx gathering + +A `CheckpointProver` starts its tx gather + sub-tree pipeline **in its constructor**. +The tx provider is injected as a dependency, +and the `CheckpointProver` pulls its own txs via `txProvider.getTxsForBlock(block)` for each +block in its checkpoint. + +This means the moment a checkpoint lands on L1, sub-tree proving is already in flight. +By the time the epoch closes (and the `EpochSession` is constructed), most or all of the +block-rollup proofs are already done — the `EpochSession` only has to drive the top tree. + +## EpochSession lifecycle + +An `EpochSession` is identified by a slot-based **spec**: + +```ts +interface SessionSpec { + kind: 'full' | 'partial'; + epochNumber: EpochNumber; + fromSlot: SlotNumber; + toSlot: SlotNumber; +} +``` + +The spec declares *what to prove* (a slot range). The concrete checkpoint set the +`EpochSession` holds is the *implementation* of the spec — frozen at construction time, +derived from the canonical content for that slot range. 
+ +```mermaid +stateDiagram-v2 + [*] --> initialized + initialized --> awaiting_checkpoints: start() + awaiting_checkpoints --> completed: publish succeeds + awaiting_checkpoints --> superseded: longer same-epoch candidate wins + awaiting_checkpoints --> failed: L1 submission errored + awaiting_checkpoints --> cancelled: cancel() + initialized --> timed_out: deadline + awaiting_checkpoints --> timed_out: deadline (EpochSession or candidate) + completed --> [*] + superseded --> [*] + cancelled --> [*] + timed_out --> [*] + failed --> [*] +``` + +The `awaiting-checkpoints` state covers the window between `start()` and the L1 +submission: a `TopTreeJob` is running over the `EpochSession`'s frozen checkpoint set, +awaiting each checkpoint's sub-tree result (`CheckpointProver.whenBlockProofsReady`) +and assembling the epoch proof. + +The `EpochSession` does three sequential things: (1) run a `TopTreeJob` over the frozen +checkpoint subset, (2) hand the resulting proof to `ProofPublishingService` as a +`PublishCandidate`, (3) translate the service's outcome into a terminal state. +Predecessor gating, same-epoch dedup, deadline enforcement, and the L1 tx are all +the `ProofPublishingService`'s concern; the `EpochSession` is just the producer of one +candidate and the observer of its outcome. + +Outcome → state mapping: + +| `PublishOutcome` | `EpochSession` state | +|---|---| +| `published` | `completed` | +| `superseded` | `superseded` | +| `failed` | `failed` | +| `expired` | `timed-out` | +| `withdrawn` | `cancelled` | + +There is a single deadline — the proof submission window — that applies across both +proving and publishing. Before submission, the `EpochSession` arms its own timer against +it: if proving doesn't finish in time, the `EpochSession` enters `timed-out` via +`cancel('deadline')`. After submission, the publishing service enforces the same deadline +on the candidate. 
It's the same instant throughout; only which component enforces it +changes once the candidate has been handed off. + +### Full vs partial + +Every `EpochSession` — full or partial — has `fromSlot = firstSlotOfEpoch(N)`. The L1 rollup +contract requires every proof to extend from the previous epoch's proven tip, so +there's no value in starting later than the epoch boundary. The two kinds differ +only in `toSlot` and in how the publishing service treats their candidate: + +- **Full** `EpochSession`s are opened by reconcile when the epoch is complete on L1 *and* + every archiver-reported checkpoint is present in the store. Their `toSlot` is + the epoch's last slot. The publishing service never auto-supersedes a `full` + candidate on proven-tip subsumption — the L1 contract records a `(epoch, prover-id)` + submission for every full-epoch proof, so even after another prover-node has + landed first, this prover's submission is still worthwhile. +- **Partial** `EpochSession`s are constructed by an explicit `startProof(epochNumber)` API + call. Their `toSlot` is the last canonical slot present at request time, which may + be earlier than the epoch's last slot. Partial candidates are an early-finish + optimisation: if the proven chain has caught up to or past `endBlock` by the time + the publishing service picks the winner, the partial resolves `'superseded'` + without spending L1 gas. Dedup: if the partial's spec collapses to the full's spec + (canonical content already covers the whole epoch), `startProof` awaits the + existing full `EpochSession` instead of opening a duplicate. + +## ProofPublishingService + +The service is a single per-prover-node owner of L1 submission. `EpochSession`s call +`submit(candidate)` and await one of five outcomes: + +| Outcome | Meaning | +|---|---| +| `published` | L1 accepted the proof. | +| `superseded` | A longer same-epoch candidate won, or (for `partial` candidates) the proven tip has caught up to `endBlock`. 
| +| `failed` | L1 submission errored. | +| `expired` | The candidate's `deadline` elapsed before publishing started. | +| `withdrawn` | An `EpochSession` called `withdraw(uuid)` on a still-queued candidate. | + +Key invariants: + +- **One publish at a time** via a `SerialQueue` drain. +- **Fresh publisher per publish.** Each drain call constructs a new `ProverNodePublisher` + via the factory. There is no shared in-memory state across publishes. +- **Once an L1 publish starts, it runs to completion.** `withdraw` is a queue-only + operation: it removes a candidate that hasn't started publishing. An in-flight + candidate is left alone and its outcome reports whatever L1 returned. The + originating `EpochSession` has already moved to a terminal state via `cancel()` and + ignores the late outcome. +- **Drain reads the proven block number afresh** from `l2BlockSource` inside the + serial queue, so the eligibility + check is consistent with the publish that follows it on the same drain pass. +- **Per-candidate `deadline`** arms a `setTimeout` (against the injected `DateProvider`). + When it fires, a still-queued candidate resolves `'expired'`. An in-flight publish + is left alone (its outcome reports the natural L1 result). +- **Transient `publisherFactory.create()` failures are retried.** Instead of resolving + the candidate as `'failed'`, the service schedules another drain after a 1s backoff + and leaves the candidate in the queue. The candidate's `deadline` caps the total + retry window — persistent acquire failure resolves as `'expired'`. + +### Eligibility + +A candidate is eligible to publish when its **predecessor block is proven** +(`startBlock - 1 <= proven`). Among eligible candidates for the same epoch, the +one with the **highest `endBlock`** wins; the others resolve `'superseded'`. 
+Partial candidates whose `endBlock <= proven` are dropped before this check +(early-finish optimisation no longer helps); full candidates are never +auto-superseded on the proven tip. + +## Event flow + +### chain-checkpointed + +```mermaid +sequenceDiagram + participant L2 as L2BlockStream + participant PN as ProverNode + participant CS as CheckpointStore + participant CP as CheckpointProver + participant SM as SessionManager + + L2->>PN: chain-checkpointed{checkpoint} + PN->>PN: collectRegisterData (prev-header, l1ToL2 messages, sibling path) + PN->>CS: addOrUpdate(checkpoint, data) + alt content key new + CS->>CP: new CheckpointProver(args) + CP->>CP: eager gather + sub-tree start + else content key matches + CS->>CP: markCanonical() + end + PN->>SM: onCheckpointAdded(epoch) + SM->>SM: queue reconcile({kind:'checkpoint', epoch}) + SM->>SM: walk EpochSessions, recreate invalid + SM->>SM: open full EpochSession if epoch ready +``` + +### chain-pruned + +```mermaid +sequenceDiagram + participant L2 as L2BlockStream + participant PN as ProverNode + participant CS as CheckpointStore + participant SM as SessionManager + + L2->>PN: chain-pruned{checkpoint} + PN->>CS: markPrunedAfter(checkpoint.number) + CS->>CS: flip every CheckpointProver above threshold to pruned (sub-tree keeps running) + PN->>SM: onPrune(affectedEpochs) + SM->>SM: queue reconcile({kind:'prune', affectedEpochs}) + SM->>SM: walk EpochSessions, cancel-and-recreate those with shifted content +``` + +### chain-proven + +```mermaid +sequenceDiagram + participant L2 as L2BlockStream + participant PN as ProverNode + participant PS as ProofPublishingService + + L2->>PN: chain-proven{block} + PN->>PS: onChainProven(blockNumber) + PS->>PS: scheduleDrain (wake-up only, no state cached) + PS->>PS: drain reads proven afresh, re-checks eligibility +``` + +### Per-event expiry sweep + +```mermaid +sequenceDiagram + participant L2 as L2BlockStream + participant PN as ProverNode + participant CC as ChonkCache + 
participant CS as CheckpointStore + + L2->>PN: any event + PN->>L2: getSyncedL2SlotNumber() + PN->>PN: latestEpoch = getEpochAtSlot(latestSlot) + PN->>PN: newlyExpiredUpTo = latestEpoch - (proofSubmissionEpochs + 1) + loop for each newly-expired epoch + PN->>L2: getCheckpointsData({epoch}) + getBlocks(...) + PN->>CC: releaseForBlocks(blocks) + PN->>CS: reapExpired(epoch) + end +``` + +Expiry runs at the end of every `handleBlockStreamEvent` call (not on any specific +event type). An epoch `E` is expired once the chain reaches the start of epoch +`E + proofSubmissionEpochs + 1` — the deadline beyond which an L1 submission for +`E` would be rejected. A monotonic high-water mark (`lastExpiredEpoch`) makes the +sweep cheap: it advances per event and never revisits an epoch. It is seeded at +`start()` from the last fully-proven epoch (computed in `computeStartupState`), +so on a restart we never re-sweep epochs that already reached L1. + +### Periodic tick + +`SessionManager.start()` arms a `RunningPromise` that fires +`reconcile({ kind: 'tick' })` every `tickIntervalMs`. The tick picks up epochs that +became complete by time alone (no fresh checkpoint event) and advances to the +next unproven epoch once the previous one lands on L1. A monotonic high-water +mark (`lastTickEpoch`) prevents the tick from re-opening an epoch whose `EpochSession` +already terminated; the mark advances only after an `EpochSession` actually exists for +the epoch, so transient blockers (max-pending-jobs reached, archiver still +indexing) leave the mark in place and the next tick retries. + +## Walkthroughs + +### checkpoint-added → prune → checkpoint-added (reorg resilience) + +State: epoch N has checkpoints c1..c4 all canonical (slots s1..s4). `fullSessions[N]` +holds `EpochSession` **A** with spec `{kind:'full', N, fromSlot:s1, toSlot:s4}`, referencing +checkpoints `[c1, c2, c3, c4]`. + +1. **chain-pruned arrives, target c3.** Store flips c4 to pruned. 
Reconcile fires: + for `EpochSession` A, canonical content for `(s1, s4)` is now `[c1, c2, c3]` (c4 pruned). + The frozen set `[c1, c2, c3, c4]` no longer matches → `A.cancel('canonical content + changed')`. Epoch N still complete on L1 → reconcile constructs `EpochSession` **B** with + the same spec `{full, N, s1, s4}` but checkpoints `[c1, c2, c3]`. + +2. **`EpochSession` B starts top-tree proving over [c1, c2, c3].** + +3. **chain-checkpointed arrives, target c4_re (same content key as old c4).** The + store finds the existing `CheckpointProver` at `(c4.number, s4, c4.archive.root)` + and calls `markCanonical()`. The sub-tree work that never stopped is visible to + `EpochSession`s again. (A re-add with *different* content would have a different archive + root and so get a fresh `CheckpointProver` instead.) + +4. **Reconcile fires.** `EpochSession` B's canonical content for `(s1, s4)` is now `[c1, c2, + c3, c4]`, doesn't match its frozen `[c1, c2, c3]` → `B.cancel(...)`. Construct + `EpochSession` **C** with same spec but checkpoints `[c1, c2, c3, c4]`. + +5. **`EpochSession` C reuses the long-lived c1..c4 `CheckpointProver` instances.** Sub-tree + work may already be complete; only the top-tree is recomputed. The chonk cache + survived the reorg because no epoch in this range has expired yet. + + +### Partial request dedups against a running full `EpochSession` + +The operator calls `startProof(N)` while the full `EpochSession` for epoch N is running with +c1..c4. Current canonical slot range is `(s1, s4)`, so the partial's computed spec is +`{partial, N, s1, s4}` — its `fromSlot`/`toSlot` exactly match the running full `EpochSession`'s. `startProof` +detects this and awaits the existing full instead of opening a duplicate: no partial +`EpochSession` is created and no second `TopTreeJob` is built. The caller simply blocks on the +full session's result and the epoch is proven once. 
+ +### True partial proof + +The operator calls `startProof(N)` when only c1, c2 are canonical (epoch incomplete). +`fromSlot` is the epoch's first slot; `toSlot` is `s2` (the last canonical slot). +Partial `EpochSession` created with spec `{partial, N, firstSlotOfEpoch(N), s2}` and +checkpoints `[c1, c2]`. + +When c3 later arrives in slot s3, the partial is **not** invalidated — c3's slot is +outside its range. If c2 is then pruned, the partial **is** invalidated (canonical +content for the same slot range is now just `[c1]`) and recreated with the same +spec but checkpoints `[c1]`. If c2 re-adds, the partial is invalidated again and +recreated with `[c1, c2]`. + +## Design rationale + +### Why slot-based specs (not checkpoint-based)? + +A spec like "prove checkpoints 7..10" is invalidated by any reorg that renumbers +those checkpoints. A spec like "prove slots 350..399" survives renumbering — the +slot range is determined by epoch math and L1 constants, not by which checkpoints +happen to be canonical at the moment. Reconciliation preserves the slot range +across cancel-and-recreate cycles. + +### Why does every `EpochSession` start at the epoch's first slot? + +The L1 rollup contract validates that every submitted proof extends from the previous +proven tip — the `fromCheckpoint` of any submission must be the checkpoint immediately +after the current L1 proven head. Starting a partial `EpochSession` at a later slot would +mean the partial's `fromCheckpoint` lies past the proven tip, which the contract +rejects. Fixing `fromSlot` to `firstSlotOfEpoch(N)` for both kinds means partials and +fulls always share the same starting point; they differ only in `toSlot` and in the +submission decision. + +### Why does a publishing service own L1 submission instead of the `EpochSession`? + +Concentrating L1 submission gives us three properties for free that were awkward +or impossible when each `EpochSession` called the publisher directly: + +1. 
**Atomic same-epoch dedup.** Multiple candidates for the same epoch (full + + partial, or partial-then-full as canonical content extends) can be in flight + at once; the service picks the winner under the serial drain so only one L1 + tx is ever sent for the longer candidate. +2. **One source of truth for the proven tip.** Reading the proven block number + inside the drain means the eligibility check and the publish that follows are + guaranteed to use the same value. `EpochSession`s can't race each other on stale + reads. +3. **Per-candidate deadline and retry.** The service owns expiry timers and the + `publisherFactory.create()` retry loop. `EpochSession`s don't need to know about + either — they just await the outcome. + +### Why is the chonk cache keyed by tx hash and released on finality? + +Chonk-verifier proofs are tx-scoped: they prove a transaction's chonk circuit is +valid, independently of which block or epoch the tx lands in. A tx that gets +reorged out of one block and re-mined into another should not need to be re-proved. +Keying by tx hash makes the cache survive any reorg up to finality; releasing on +finality means we don't grow the cache indefinitely while still keeping every +reorg-relevant proof. + +### Why does the slot watcher only reap pruned `CheckpointProver`s? + +Canonical `CheckpointProver`s can't be reaped on a slot heuristic — they're still part of the +proven-chain story. Pruned `CheckpointProver`s, on the other hand, are only kept around in +case the chain re-adds the same content; once the synced slot has moved past, that +re-add is impossible, and the `CheckpointProver` can go. Finality is the right signal for +canonical reaping, because finality is the only state that rules out future reorgs. + +## Configuration + +| Env var | Description | +|---|---| +| `PROVER_NODE_POLLING_INTERVAL_MS` | Polling interval for the L2BlockStream, the checkpoint-store slot watcher, and the SessionManager periodic tick. Default 1000 ms. 
| +| `PROVER_NODE_MAX_PENDING_JOBS` | Cap on the number of non-terminal `EpochSession`s (full + partial). When at limit, reconcile defers opening new full `EpochSession`s; explicit `startProof` calls throw. | +| `PROVER_NODE_EPOCH_PROVING_DELAY_MS` | Optional sleep at the start of each `EpochSession`, before the TopTreeJob is constructed. Used in tests to give late events time to land. | +| `TX_GATHERING_TIMEOUT_MS` | Per-block tx gather deadline used by each `CheckpointProver`. | +| `PROVER_NODE_FAILED_EPOCH_STORE` | If set, failed `EpochSession`s upload their proving data (every `CheckpointProver`'s txs + register-time data, regardless of sub-tree completion) to this file store. | +| `PROVER_NODE_DISABLE_PROOF_PUBLISH` | If true, the publishing service runs `analyzeEpochProofSubmission` (estimates L1 fees) instead of actually submitting. | + +## Failure handling and observability + +Loggers: + +- `prover-node` — `ProverNode` itself (event dispatch, lifecycle). +- `prover-node:session-manager` — reconcile decisions, `EpochSession` opens / drops, tick. +- `prover-node:epoch-session` — per-`EpochSession` lifecycle (`Created EpochSession`, + `Top-tree proof ready`, `Submitted proof for epoch N`, etc.). +- `prover-node:proof-publishing-service` — candidate submit / withdraw / expire, + drain, publish attempts, transient acquire retries. +- `prover-node:l1-tx-publisher` — the per-publish `ProverNodePublisher`'s L1 work. +- `prover-node:checkpoint-store` — content-key collisions, reap decisions. +- `prover-node:checkpoint-prover` — sub-tree pipeline (gather, block processing). +- `prover-client:chonk-cache` — chonk-verifier cache enqueue / release events. + +On `failed` exit, `SessionManager.runSession` invokes the `onSessionFailed` callback +the manager was constructed with. 
`ProverNode` wires this to `tryUploadSessionFailure`, +which calls `SessionManager.buildSessionProvingData(session)` to walk every `CheckpointProver` +referenced by the `EpochSession` and assemble an `EpochProvingJobData` snapshot — including +every `CheckpointProver`'s txs and register-time data even if its sub-tree never reached +`isCompleted()`. This snapshot is what `uploadEpochProofFailure` ships to the +configured file store along with a world-state + archiver backup, so the failure +can be reproduced offline via `rerunEpochProvingJob`. + +Metrics emitted by `EpochSession`s: + +- `aztec.prover_node.execution_duration` — wall-clock time from `EpochSession` start to terminal. +- `aztec.prover_node.job_duration` — same, in seconds. +- `aztec.prover_node.job_checkpoints` / `_blocks` / `_transactions` — sizes of the + proven range. +- `aztec.prover_node.block_processing_duration` / + `aztec.prover_node.checkpoint_processing_duration` — sub-tree breakdown. diff --git a/yarn-project/prover-node/src/actions/rerun-epoch-proving-job.ts b/yarn-project/prover-node/src/actions/rerun-epoch-proving-job.ts index 486b53c3026b..ef80162cd613 100644 --- a/yarn-project/prover-node/src/actions/rerun-epoch-proving-job.ts +++ b/yarn-project/prover-node/src/actions/rerun-epoch-proving-job.ts @@ -1,23 +1,33 @@ import { createArchiverStore, createContractDataSource } from '@aztec/archiver'; import type { L1ContractsConfig } from '@aztec/ethereum/config'; +import { BlockNumber } from '@aztec/foundation/branded-types'; import type { Logger } from '@aztec/foundation/log'; +import { DateProvider } from '@aztec/foundation/timer'; import { type ProverClientConfig, createProverClient } from '@aztec/prover-client'; import { ProverBrokerConfig, createAndStartProvingBroker } from '@aztec/prover-client/broker'; +import { getLastSiblingPath } from '@aztec/prover-client/helpers'; +import { ChonkCache } from '@aztec/prover-client/orchestrator'; import { PublicProcessorFactory } from 
'@aztec/simulator/server'; +import type { L2Block } from '@aztec/stdlib/block'; +import { getEpochAtSlot, getSlotRangeForEpoch } from '@aztec/stdlib/epoch-helpers'; +import type { ITxProvider } from '@aztec/stdlib/interfaces/server'; import type { DataStoreConfig } from '@aztec/stdlib/kv-store'; +import { MerkleTreeId } from '@aztec/stdlib/trees'; +import type { Tx, TxHash } from '@aztec/stdlib/tx'; import type { GenesisData } from '@aztec/stdlib/world-state'; import { getTelemetryClient } from '@aztec/telemetry-client'; import { createWorldState } from '@aztec/world-state'; import { readFileSync } from 'fs'; +import { CheckpointProver } from '../job/checkpoint-prover.js'; import { deserializeEpochProvingJobData } from '../job/epoch-proving-job-data.js'; -import { EpochProvingJob } from '../job/epoch-proving-job.js'; +import { EpochSession, type SessionSpec } from '../job/epoch-session.js'; import { ProverNodeJobMetrics } from '../metrics.js'; /** * Given a local folder where `downloadEpochProvingJob` was called, creates a new archiver and world state - * using the state snapshots, and creates a new epoch proving job to prove the downloaded proving job. + * using the state snapshots, and creates a new epoch proving session to prove the downloaded proving job. * Proving is done with a local proving broker and agents as specified by the config. */ export async function rerunEpochProvingJob( @@ -41,34 +51,99 @@ export async function rerunEpochProvingJob( log.getBindings(), ); - const publisher = { - submitEpochProof: () => Promise.resolve(true), - analyzeEpochProofSubmission: () => Promise.resolve(), + // Local rerun never publishes — stub the service so submit() always resolves 'published' + // and withdraw is a no-op. 
+ const publishingService = { + submit: () => Promise.resolve('published' as const), + withdraw: () => {}, }; - const l2BlockSourceForReorgDetection = undefined; - const deadline = undefined; - - // This starts a local proving broker that does not get exposed as a service. This should be good enough for - // smallish epochs to be proven if we run on a large machine, but as epochs grow larger, we may want to switch - // this out for a live proving broker with multiple agents that we can connect to. const broker = await createAndStartProvingBroker(config, telemetry); const prover = await createProverClient(config, worldState, broker, telemetry); + const chonkCache = new ChonkCache(log.getBindings()); + + const txProvider = makeReplayingTxProvider(jobData.txs); + + log.info(`Rerunning epoch proving for epoch ${jobData.epochNumber}`); - const provingJob = new EpochProvingJob( - jobData, - worldState, - prover.createEpochProver(), - publicProcessorFactory, - publisher, - l2BlockSourceForReorgDetection, + const provers: CheckpointProver[] = []; + for (let i = 0; i < jobData.checkpoints.length; i++) { + const checkpoint = jobData.checkpoints[i]; + const previousBlockHeader = + i === 0 ? jobData.previousBlockHeader : jobData.checkpoints[i - 1].blocks.at(-1)!.header; + const l1ToL2Messages = jobData.l1ToL2Messages[checkpoint.number] ?? []; + const previousArchiveSiblingPath = await getLastSiblingPath( + MerkleTreeId.ARCHIVE, + worldState.getSnapshot(BlockNumber(checkpoint.blocks[0].number - 1)), + ); + const attestations = checkpoint.number === jobData.checkpoints.at(-1)!.number ? 
jobData.attestations : []; + provers.push( + new CheckpointProver( + { + checkpoint, + epochNumber: jobData.epochNumber, + attestations, + previousBlockHeader, + l1ToL2Messages, + previousArchiveSiblingPath, + }, + { + proverFactory: prover, + chonkCache, + publicProcessorFactory, + dbProvider: worldState, + txProvider, + dateProvider: new DateProvider(), + proverId: prover.getProverId(), + metrics, + txGatheringTimeoutMs: 120_000, + deadline: undefined, + log, + }, + ), + ); + } + + const l1Constants = { epochDuration: config.aztecEpochDuration }; + const [fromSlot, toSlot] = getSlotRangeForEpoch(jobData.epochNumber, l1Constants); + const spec: SessionSpec = { kind: 'full', epochNumber: jobData.epochNumber, fromSlot, toSlot }; + + const session = new EpochSession(spec, provers, { + proverFactory: prover, + proverId: prover.getProverId(), + publishingService, metrics, - deadline, - { skipEpochCheck: true }, - log.getBindings(), - ); + dateProvider: new DateProvider(), + deadline: undefined, + config: {}, + bindings: log.getBindings(), + }); + + const finalState = await session.start(); + log.info(`Completed proving for epoch ${jobData.epochNumber} with status ${finalState}`, { + derivedEpoch: getEpochAtSlot(provers[0].slotNumber, l1Constants), + }); + return finalState; +} - log.info(`Rerunning epoch proving job for epoch ${jobData.epochNumber}`); - await provingJob.run(); - log.info(`Completed job for epoch ${jobData.epochNumber} with status ${provingJob.getState()}`); - return provingJob.getState(); +/** Build a synthetic ITxProvider that returns the supplied txs map by lookup. 
*/ +function makeReplayingTxProvider(txs: Map): ITxProvider { + const lookup = (hashes: TxHash[]) => { + const found: Tx[] = []; + const missing: TxHash[] = []; + for (const hash of hashes) { + const tx = txs.get(hash.toString()); + if (tx) { + found.push(tx); + } else { + missing.push(hash); + } + } + return { txs: found, missingTxs: missing }; + }; + return { + getAvailableTxs: hashes => Promise.resolve(lookup(hashes)), + hasTxs: hashes => Promise.resolve(hashes.map(h => txs.has(h.toString()))), + getTxsForBlockProposal: () => Promise.resolve({ txs: [], missingTxs: [] }), + getTxsForBlock: (block: L2Block) => Promise.resolve(lookup(block.body.txEffects.map(e => e.txHash))), + }; } diff --git a/yarn-project/prover-node/src/checkpoint-store.test.ts b/yarn-project/prover-node/src/checkpoint-store.test.ts new file mode 100644 index 000000000000..7c7e9bc841f1 --- /dev/null +++ b/yarn-project/prover-node/src/checkpoint-store.test.ts @@ -0,0 +1,272 @@ +import { ARCHIVE_HEIGHT } from '@aztec/constants'; +import { makeTuple } from '@aztec/foundation/array'; +import { CheckpointNumber, EpochNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { timesAsync } from '@aztec/foundation/collection'; +import { Fr } from '@aztec/foundation/curves/bn254'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import { Checkpoint } from '@aztec/stdlib/checkpoint'; +import { EmptyL1RollupConstants } from '@aztec/stdlib/epoch-helpers'; + +import { mock } from 'jest-mock-extended'; + +import { type CheckpointProverFactory, CheckpointStore } from './checkpoint-store.js'; +import type { CheckpointProver } from './job/checkpoint-prover.js'; + +describe('CheckpointStore', () => { + let store: TestCheckpointStore; + let blockSource: ReturnType>>; + /** Track stub provers we hand back from the factory. */ + const stubs: StubProver[] = []; + + // Single-slot epochs make every checkpoint live in its own epoch and slot range. 
+ const l1Constants = { ...EmptyL1RollupConstants, epochDuration: 1 }; + + beforeEach(() => { + blockSource = mock>(); + blockSource.getL1Constants.mockResolvedValue(l1Constants); + stubs.length = 0; + store = new TestCheckpointStore( + blockSource, + // The deps are not exercised — the factory below ignores them. + {} as any, + { slotWatcherPollIntervalMs: 100 }, + undefined, + (args, _deps) => { + const stub = makeStubProver(args.checkpoint, args.epochNumber); + stubs.push(stub); + return stub as unknown as CheckpointProver; + }, + ); + }); + + afterEach(async () => { + await store.stop(); + }); + + it('addOrUpdate creates a new prover for a fresh content key', async () => { + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1 }); + + const prover = await store.addOrUpdate(cp, makeRegisterData()); + expect(prover.checkpoint).toBe(cp); + expect(stubs.length).toBe(1); + }); + + it('addOrUpdate is idempotent for the same content key (re-add after prune)', async () => { + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1 }); + + const first = await store.addOrUpdate(cp, makeRegisterData()); + expect(first.isPruned()).toBe(false); + store.markPrunedAfter(CheckpointNumber(0)); + expect(first.isPruned()).toBe(true); + + // Re-adding the identical checkpoint (same archive root) reuses the existing prover. + const second = await store.addOrUpdate(cp, makeRegisterData()); + expect(second).toBe(first); + expect(second.isPruned()).toBe(false); + expect(stubs.length).toBe(1); + }); + + it('addOrUpdate refuses a conflicting canonical checkpoint at the same slot', async () => { + // Two canonical checkpoints sharing a slot would be a parallel chain. The store rejects + // the second; the caller must prune the first (via the chain-pruned event) before the + // replacement built on the same predecessor after a reorg can be added. 
+ const a = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(5) }); + const b = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(5) }); + expect(a.archive.root.equals(b.archive.root)).toBe(false); + + const proverA = await store.addOrUpdate(a, makeRegisterData()); + await expect(store.addOrUpdate(b, makeRegisterData())).rejects.toThrow( + /canonical checkpoint already occupies this slot/i, + ); + + // After the predecessor is pruned, the replacement is accepted and keys to a distinct + // prover (different archive root → different content id). + store.markPrunedAfter(CheckpointNumber(0)); + expect(proverA.isPruned()).toBe(true); + const proverB = await store.addOrUpdate(b, makeRegisterData()); + expect(proverB).not.toBe(proverA); + expect(proverB.isPruned()).toBe(false); + expect(stubs.length).toBe(2); + }); + + it('markPrunedAfter marks every prover above the threshold and returns them', async () => { + const cps = await timesAsync(4, i => Checkpoint.random(CheckpointNumber(i + 1), { numBlocks: 1 })); + for (const cp of cps) { + await store.addOrUpdate(cp, makeRegisterData()); + } + const affected = store.markPrunedAfter(CheckpointNumber(2)); + expect(affected.map(p => p.checkpoint.number)).toEqual([3, 4]); + expect(store.listCanonical().map(p => p.checkpoint.number)).toEqual([1, 2]); + }); + + it('reapExpired drops canonical provers whose epoch is ≤ expiredEpoch', async () => { + // With epochDuration=1 each checkpoint's slot is also its epoch number. 
+ const cps = await Promise.all([ + Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }), + Checkpoint.random(CheckpointNumber(2), { numBlocks: 1, slotNumber: SlotNumber(2) }), + Checkpoint.random(CheckpointNumber(3), { numBlocks: 1, slotNumber: SlotNumber(3) }), + ]); + for (const cp of cps) { + await store.addOrUpdate(cp, makeRegisterData()); + } + store.reapExpired(EpochNumber(2)); + const remainingNumbers = store.listAll().map(p => p.checkpoint.number); + expect(remainingNumbers).toEqual([3]); + }); + + it('reapExpired leaves pruned provers in place', async () => { + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }); + await store.addOrUpdate(cp, makeRegisterData()); + store.markPrunedAfter(CheckpointNumber(0)); + store.reapExpired(EpochNumber(10)); + expect(store.listAll().map(p => p.checkpoint.number)).toEqual([1]); + }); + + // ---------------- slot watcher ---------------- + + it('slot watcher reaps pruned provers whose slot is strictly before the synced slot', async () => { + const cp1 = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }); + const cp2 = await Checkpoint.random(CheckpointNumber(2), { numBlocks: 1, slotNumber: SlotNumber(2) }); + const cp3 = await Checkpoint.random(CheckpointNumber(3), { numBlocks: 1, slotNumber: SlotNumber(3) }); + for (const cp of [cp1, cp2, cp3]) { + await store.addOrUpdate(cp, makeRegisterData()); + } + // Prune everything above checkpoint 0 ⇒ all three flip to pruned. + store.markPrunedAfter(CheckpointNumber(0)); + blockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(3)); + + await store.triggerSlotWatcherTick(); + + // Slots 1 and 2 are < 3 and get reaped; slot 3 is not strictly less, so it stays. + expect(store.listAll().map(p => p.checkpoint.number)).toEqual([3]); + // Reaped stubs were cancelled by the watcher. 
+ expect(stubs.find(s => s.checkpoint.number === 1)!.cancelled).toBe(true); + expect(stubs.find(s => s.checkpoint.number === 2)!.cancelled).toBe(true); + expect(stubs.find(s => s.checkpoint.number === 3)!.cancelled).toBe(false); + }); + + it('slot watcher leaves canonical provers in place even when their slot is past the synced slot', async () => { + // Canonical provers must survive — only pruned provers are eligible for reaping. + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }); + await store.addOrUpdate(cp, makeRegisterData()); + blockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(10)); + + await store.triggerSlotWatcherTick(); + + expect(store.listAll().map(p => p.checkpoint.number)).toEqual([1]); + expect(stubs[0].cancelled).toBe(false); + }); + + it('slot watcher no-ops when getSyncedL2SlotNumber returns undefined', async () => { + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }); + await store.addOrUpdate(cp, makeRegisterData()); + store.markPrunedAfter(CheckpointNumber(0)); + blockSource.getSyncedL2SlotNumber.mockResolvedValue(undefined); + + await store.triggerSlotWatcherTick(); + + // No synced slot yet ⇒ watcher doesn't know whether the chain has moved past, so it + // keeps the pruned prover around for a possible re-add. 
+ expect(store.listAll().map(p => p.checkpoint.number)).toEqual([1]); + expect(stubs[0].cancelled).toBe(false); + }); + + it('slot watcher swallows getSyncedL2SlotNumber errors instead of crashing the tick', async () => { + const cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(1) }); + await store.addOrUpdate(cp, makeRegisterData()); + store.markPrunedAfter(CheckpointNumber(0)); + blockSource.getSyncedL2SlotNumber.mockRejectedValue(new Error('archiver unavailable')); + + await expect(store.triggerSlotWatcherTick()).resolves.toBeUndefined(); + expect(store.listAll().map(p => p.checkpoint.number)).toEqual([1]); + }); + + it('listCanonicalForEpoch returns only canonical provers in the epoch slot range', async () => { + // With epochDuration=1, each epoch's slot range is exactly [slot, slot]. + const cp1 = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 1, slotNumber: SlotNumber(10) }); + const cp2 = await Checkpoint.random(CheckpointNumber(2), { numBlocks: 1, slotNumber: SlotNumber(11) }); + await store.addOrUpdate(cp1, makeRegisterData()); + await store.addOrUpdate(cp2, makeRegisterData()); + + const epoch10 = await store.listCanonicalForEpoch(EpochNumber(10)); + const epoch11 = await store.listCanonicalForEpoch(EpochNumber(11)); + expect(epoch10.map(p => p.checkpoint.number)).toEqual([1]); + expect(epoch11.map(p => p.checkpoint.number)).toEqual([2]); + }); +}); + +/** Minimal CheckpointProver-shaped stub for store-level tests. 
*/ +type StubProver = { + id: string; + checkpoint: Checkpoint; + slotNumber: SlotNumber; + epochNumber: EpochNumber; + pruned: boolean; + cancelled: boolean; + isPruned(): boolean; + isCancelled(): boolean; + markPruned(): void; + markCanonical(): void; + cancel(opts?: { routine?: boolean }): void; + whenDone(): Promise; +}; + +function makeStubProver(checkpoint: Checkpoint, epochNumber: EpochNumber): StubProver { + const id = `${checkpoint.number}:${checkpoint.header.slotNumber}:${checkpoint.archive.root.toString()}`; + return { + id, + checkpoint, + slotNumber: checkpoint.header.slotNumber, + epochNumber, + pruned: false, + cancelled: false, + isPruned() { + return this.pruned; + }, + isCancelled() { + return this.cancelled; + }, + markPruned() { + this.pruned = true; + }, + markCanonical() { + this.pruned = false; + }, + cancel() { + this.cancelled = true; + }, + whenDone() { + return Promise.resolve(); + }, + }; +} + +function makeRegisterData() { + return { + attestations: [], + previousBlockHeader: {} as any, + l1ToL2Messages: [], + previousArchiveSiblingPath: makeTuple(ARCHIVE_HEIGHT, () => Fr.ZERO), + }; +} + +/** + * Subclass that exposes the protected `reapPrunedPastSlot` so tests can drive a single + * SlotWatcher tick directly — avoids spinning up the underlying `RunningPromise` and + * waiting on its polling interval. 
+ */ +class TestCheckpointStore extends CheckpointStore { + constructor( + blockSource: ConstructorParameters[0], + proverDeps: ConstructorParameters[1], + options: ConstructorParameters[2], + bindings: ConstructorParameters[3], + factory: CheckpointProverFactory, + ) { + super(blockSource, proverDeps, options, bindings, factory); + } + + public triggerSlotWatcherTick(): Promise { + return this.reapPrunedPastSlot(); + } +} diff --git a/yarn-project/prover-node/src/checkpoint-store.ts b/yarn-project/prover-node/src/checkpoint-store.ts new file mode 100644 index 000000000000..79e80a1d7efa --- /dev/null +++ b/yarn-project/prover-node/src/checkpoint-store.ts @@ -0,0 +1,213 @@ +import type { CheckpointNumber, EpochNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; +import { RunningPromise } from '@aztec/foundation/promise'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import type { Checkpoint } from '@aztec/stdlib/checkpoint'; +import { type L1RollupConstants, getEpochAtSlot, getSlotRangeForEpoch } from '@aztec/stdlib/epoch-helpers'; + +import { CheckpointProver, type CheckpointProverArgs, type CheckpointProverDeps } from './job/checkpoint-prover.js'; + +/** Register-time data needed to construct a `CheckpointProver` (everything except the checkpoint + epoch). */ +export type RegisterCheckpointData = Omit; + +/** Factory used by the store to construct new provers. Tests can inject a stub. */ +export type CheckpointProverFactory = (args: CheckpointProverArgs, deps: CheckpointProverDeps) => CheckpointProver; + +/** + * Prover-node-wide registry of `CheckpointProver` instances, content-addressed by + * `(checkpoint number, slot, checkpoint archive root)`. + * + * The store survives every epoch / session boundary. 
A prover lives from its first + * `addOrUpdate` call until either: + * - it has been pruned and the L2 chain has moved past its slot (no re-add possible), or + * - its epoch's proof-submission window has closed (`reapExpired`), so the proof could no + * longer be accepted on L1 even if produced. + * + * A re-add of a checkpoint that matches an existing prover's content key reuses the + * existing prover (and flips it back to canonical); the in-flight sub-tree work never + * stops, so a prune-then-re-add of the same content avoids re-proving entirely. + */ +export class CheckpointStore { + private readonly provers = new Map(); + private readonly slotWatcher: RunningPromise; + private readonly log: Logger; + + constructor( + private readonly l2BlockSource: Pick, + private readonly proverDeps: Omit, + private readonly options: { slotWatcherPollIntervalMs: number }, + bindings?: LoggerBindings, + private readonly proverFactoryFn: CheckpointProverFactory = (args, deps) => new CheckpointProver(args, deps), + ) { + this.log = createLogger('prover-node:checkpoint-store', bindings); + this.slotWatcher = new RunningPromise( + () => this.reapPrunedPastSlot(), + this.log, + this.options.slotWatcherPollIntervalMs, + ); + } + + public start(): Promise { + this.slotWatcher.start(); + return Promise.resolve(); + } + + public async stop(): Promise { + await this.slotWatcher.stop(); + // Cancel every live prover; await teardown. + const provers = Array.from(this.provers.values()); + this.provers.clear(); + for (const prover of provers) { + prover.cancel(); + } + await Promise.allSettled(provers.map(p => p.whenDone())); + } + + /** + * Registers a checkpoint with the store. If a prover already exists for the + * `(number, slot, archive root)` content key, it is reused and marked canonical; + * otherwise a new prover is constructed. 
+ */ + public async addOrUpdate(checkpoint: Checkpoint, data: RegisterCheckpointData): Promise { + const l1Constants = await this.l2BlockSource.getL1Constants(); + const epochNumber = getEpochAtSlot(checkpoint.header.slotNumber, l1Constants); + const id = CheckpointProver.idFor(checkpoint); + + const existing = this.provers.get(id); + if (existing) { + existing.markCanonical(); + return existing; + } + + // At most one canonical checkpoint per slot. A different canonical checkpoint at the + // same slot means the caller forgot to prune the old chain before adding the replacement + // — surface it rather than silently creating a parallel canonical chain. + for (const prover of this.provers.values()) { + if (prover.slotNumber === checkpoint.header.slotNumber && !prover.isPruned()) { + throw new Error( + `Cannot add checkpoint ${checkpoint.number} (archive ${checkpoint.archive.root}) at slot ${checkpoint.header.slotNumber}: ` + + `a different canonical checkpoint already occupies this slot. Prune it first.`, + ); + } + } + + const prover = this.proverFactoryFn({ ...data, checkpoint, epochNumber }, { ...this.proverDeps, log: this.log }); + this.provers.set(id, prover); + return prover; + } + + /** + * Marks every canonical prover whose checkpoint number is strictly greater than + * `prunedNumber` as pruned. Sub-tree work keeps running so a re-add of the same + * content can pick it up. Returns the affected provers. + */ + public markPrunedAfter(prunedNumber: CheckpointNumber): CheckpointProver[] { + const affected: CheckpointProver[] = []; + for (const prover of this.provers.values()) { + if (prover.checkpoint.number > prunedNumber && !prover.isPruned()) { + prover.markPruned(); + affected.push(prover); + } + } + return affected; + } + + /** + * Drops canonical (non-pruned) provers whose epoch is at or below the supplied expired + * epoch. 
Once an epoch's proof-submission window has closed, its proof can no longer be + * accepted on L1, so the prover is no longer needed. + */ + public reapExpired(expiredEpoch: EpochNumber): void { + const reaped: { id: string; checkpointNumber: CheckpointNumber; epochNumber: EpochNumber }[] = []; + for (const [id, prover] of Array.from(this.provers.entries())) { + if (prover.isPruned()) { + continue; + } + if (prover.epochNumber <= expiredEpoch) { + reaped.push({ id, checkpointNumber: prover.checkpoint.number, epochNumber: prover.epochNumber }); + prover.cancel({ routine: true }); + void prover.whenDone(); + this.provers.delete(id); + } + } + if (reaped.length > 0) { + this.log.info(`Reaped ${reaped.length} expired CheckpointProver(s) for expiredEpoch ${expiredEpoch}`, { + expiredEpoch, + reapedCount: reaped.length, + reaped, + }); + } + } + + /** Returns the prover with the supplied id, or undefined. */ + public get(id: string): CheckpointProver | undefined { + return this.provers.get(id); + } + + /** Returns the prover for the supplied checkpoint (by its content-addressed id), or undefined. */ + public getByCheckpoint(checkpoint: Checkpoint): CheckpointProver | undefined { + return this.provers.get(CheckpointProver.idFor(checkpoint)); + } + + /** Every prover currently in the store (canonical and pruned), in insertion order. */ + public listAll(): CheckpointProver[] { + return Array.from(this.provers.values()); + } + + /** Canonical (non-pruned) provers in the store, sorted by checkpoint number. */ + public listCanonical(): CheckpointProver[] { + return Array.from(this.provers.values()) + .filter(p => !p.isPruned()) + .sort((a, b) => a.checkpoint.number - b.checkpoint.number); + } + + /** + * Canonical provers whose slot is in the supplied epoch's slot range, sorted by + * checkpoint number. 
+ */ + public async listCanonicalForEpoch(epoch: EpochNumber): Promise { + const l1Constants = await this.l2BlockSource.getL1Constants(); + const [fromSlot, toSlot] = getSlotRangeForEpoch(epoch, l1Constants); + return this.listCanonicalInSlotRange(fromSlot, toSlot); + } + + /** Canonical provers whose slot falls within `[fromSlot, toSlot]`, sorted by checkpoint number. */ + public listCanonicalInSlotRange(fromSlot: SlotNumber, toSlot: SlotNumber): CheckpointProver[] { + return this.listCanonical().filter(p => p.slotNumber >= fromSlot && p.slotNumber <= toSlot); + } + + /** + * SlotWatcher tick: reap pruned provers whose slot is strictly before the chain's synced + * slot. Once the chain has moved past, no re-add can revive the prover and its + * content key is unique enough that an actual re-add would create a new entry. + * + * Protected so unit tests can drive a single tick without spinning up the + * `RunningPromise` and waiting on its interval. + */ + protected async reapPrunedPastSlot(): Promise { + let syncedSlot: SlotNumber | undefined; + try { + syncedSlot = await this.l2BlockSource.getSyncedL2SlotNumber(); + } catch (err) { + this.log.debug(`SlotWatcher could not read synced slot`, { error: `${err}` }); + return; + } + if (syncedSlot === undefined) { + return; + } + for (const [id, prover] of Array.from(this.provers.entries())) { + if (prover.isPruned() && prover.slotNumber < syncedSlot) { + this.log.info(`Reaping pruned CheckpointProver ${id}: slot ${prover.slotNumber} < synced ${syncedSlot}`, { + checkpointNumber: prover.checkpoint.number, + slotNumber: prover.slotNumber, + }); + prover.cancel(); + void prover.whenDone(); + this.provers.delete(id); + } + } + } +} + +/** Sub-set of `L1RollupConstants` actually consumed by the store's slot helpers.
*/ +export type CheckpointStoreL1Constants = Pick; diff --git a/yarn-project/prover-node/src/config.ts b/yarn-project/prover-node/src/config.ts index 61a39d5d90e2..304ce7a9c1a5 100644 --- a/yarn-project/prover-node/src/config.ts +++ b/yarn-project/prover-node/src/config.ts @@ -68,7 +68,8 @@ export const specificProverNodeConfigMappings: ConfigMappingsType { + let checkpoint: Checkpoint; + let deps: CheckpointProverDeps; + let txProvider: ReturnType>; + let proverFactory: ReturnType>; + let publicProcessorFactory: ReturnType>; + let dbProvider: ReturnType>>; + let chonkCache: ReturnType>; + let log: Logger; + + beforeEach(async () => { + checkpoint = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 2 }); + + txProvider = mock(); + proverFactory = mock(); + publicProcessorFactory = mock(); + dbProvider = mock>(); + chonkCache = mock(); + log = createLogger('test:checkpoint-prover'); + + // Default: gather rejects fast so the eager pipeline unwinds without hanging. The + // production prover doesn't propagate abort into the txProvider call (only a deadline) + // — a never-resolving mock would leave runPromise pending forever even after cancel, + // which would in turn hang whenDone(). Per-test overrides reconfigure the mock when a + // specific failure mode is under test. 
+ txProvider.getTxsForBlock.mockRejectedValue(new Error('test default: gather not configured')); + + deps = { + proverFactory, + chonkCache, + publicProcessorFactory, + dbProvider, + txProvider, + dateProvider: new DateProvider(), + proverId: EthAddress.ZERO, + metrics: new ProverNodeJobMetrics( + { createHistogram: noopMetric, createGauge: noopMetric, createCounter: noopMetric } as any, + { startActiveSpan: (_n: string, fn: any) => fn({ end: () => {} }) } as any, + ), + txGatheringTimeoutMs: 30_000, + deadline: undefined, + log, + }; + }); + + // ---------------- identity ---------------- + + describe('idFor', () => { + it('formats id as `${checkpointNumber}:${slot}:${archiveRoot}`', () => { + const id = CheckpointProver.idFor(checkpoint); + expect(id).toBe(`${checkpoint.number}:${checkpoint.header.slotNumber}:${checkpoint.archive.root.toString()}`); + }); + + it('two checkpoints with the same content key produce the same id', async () => { + // Same archive root + slot + number ⇒ same id, even if other fields differ. 
+ const a = checkpoint; + const b = await Checkpoint.random(a.number, { + numBlocks: 1, + slotNumber: a.header.slotNumber, + archive: a.archive, + }); + expect(CheckpointProver.idFor(a)).toBe(CheckpointProver.idFor(b)); + }); + }); + + // ---------------- construction ---------------- + + describe('construction', () => { + it('initializes readonly fields from args', async () => { + const prover = makeProver(); + expect(prover.id).toBe(CheckpointProver.idFor(checkpoint)); + expect(prover.checkpoint).toBe(checkpoint); + expect(prover.epochNumber).toEqual(EpochNumber(5)); + expect(prover.slotNumber).toEqual(checkpoint.header.slotNumber); + expect(prover.attestations).toEqual([]); + expect(prover.l1ToL2Messages).toEqual([]); + expect(prover.isCancelled()).toBe(false); + expect(prover.isCompleted()).toBe(false); + expect(prover.isPruned()).toBe(false); + await cleanup(prover); + }); + + it('eagerly starts tx gathering on construction', async () => { + const prover = makeProver(); + // The constructor kicks off gatherTxs which calls getTxsForBlock for every block. + expect(txProvider.getTxsForBlock).toHaveBeenCalledTimes(checkpoint.blocks.length); + await cleanup(prover); + }); + }); + + // ---------------- prune/canonical flag ---------------- + + describe('markPruned / markCanonical', () => { + it('markPruned flips isPruned() and is idempotent', async () => { + const prover = makeProver(); + expect(prover.isPruned()).toBe(false); + prover.markPruned(); + expect(prover.isPruned()).toBe(true); + prover.markPruned(); + expect(prover.isPruned()).toBe(true); + await cleanup(prover); + }); + + it('markCanonical restores isPruned() to false and is idempotent on a non-pruned prover', async () => { + const prover = makeProver(); + prover.markPruned(); + prover.markCanonical(); + expect(prover.isPruned()).toBe(false); + // No-op when already canonical. 
+ prover.markCanonical(); + expect(prover.isPruned()).toBe(false); + await cleanup(prover); + }); + }); + + // ---------------- cancellation ---------------- + + describe('cancel', () => { + it('flips isCancelled() and fires the abort signal', async () => { + const prover = makeProver(); + expect(prover.isCancelled()).toBe(false); + expect(prover.getAbortSignal().aborted).toBe(false); + + prover.cancel(); + expect(prover.isCancelled()).toBe(true); + expect(prover.getAbortSignal().aborted).toBe(true); + await prover.whenDone(); + }); + + it('is idempotent', async () => { + const prover = makeProver(); + prover.cancel(); + // Second call is a no-op — no throws, no extra side effects. + prover.cancel(); + expect(prover.isCancelled()).toBe(true); + await prover.whenDone(); + }); + + it('rejects whenBlockProofsReady()', async () => { + const prover = makeProver(); + const blockProofs = prover.whenBlockProofsReady(); + prover.cancel(); + await expect(blockProofs).rejects.toThrow(/cancelled/); + await prover.whenDone(); + }); + + it('whenDone resolves after cancel unwinds even when gather is still in flight', async () => { + // Hold gather pending until after cancel fires — gatherAndExecute's `cancelled` + // guard must swallow the resulting rejection so runPromise still resolves cleanly. 
+ const gate = promiseWithResolvers<{ txs: Tx[]; missingTxs: never[] }>(); + txProvider.getTxsForBlock.mockReset(); + txProvider.getTxsForBlock.mockReturnValue(gate.promise); + + const prover = makeProver(); + prover.cancel(); + gate.reject(new Error('gather aborted by test')); + await expect(prover.whenDone()).resolves.toBeUndefined(); + }); + + it('routine cancel still aborts and rejects block proofs (only log level differs)', async () => { + const prover = makeProver(); + const blockProofs = prover.whenBlockProofsReady(); + prover.cancel({ routine: true }); + expect(prover.isCancelled()).toBe(true); + expect(prover.getAbortSignal().aborted).toBe(true); + await expect(blockProofs).rejects.toThrow(/cancelled/); + await prover.whenDone(); + }); + }); + + // ---------------- gather failure ---------------- + + describe('gather failures', () => { + it('rejects whenBlockProofsReady when txProvider returns missing txs', async () => { + const missingHash = checkpoint.blocks[0].body.txEffects[0]?.txHash; + // Without a real missing hash the per-block payload would be empty and the prover + // would happily proceed; only checkpoints with txs can exercise this branch. + if (!missingHash) { + return; + } + txProvider.getTxsForBlock.mockReset(); + txProvider.getTxsForBlock.mockResolvedValue({ txs: [], missingTxs: [missingHash] }); + + const prover = makeProver(); + await expect(prover.whenBlockProofsReady()).rejects.toThrow(/Txs not found/); + await prover.whenDone(); + }); + + it('does not surface an error when cancel races ahead of gather', async () => { + // Hold gather pending until after cancel — the cancelled guard in gatherAndExecute + // swallows the abort-induced rejection silently; no unhandled rejection should + // escape whenDone(). 
+ const gate = promiseWithResolvers<{ txs: Tx[]; missingTxs: never[] }>(); + txProvider.getTxsForBlock.mockReset(); + txProvider.getTxsForBlock.mockReturnValue(gate.promise); + + const prover = makeProver(); + const blockProofs = prover.whenBlockProofsReady(); + prover.cancel(); + gate.reject(new Error('gather aborted by test')); + await expect(blockProofs).rejects.toThrow(/cancelled/); + await expect(prover.whenDone()).resolves.toBeUndefined(); + }); + + it('lets a second whenBlockProofsReady caller observe the same rejection', async () => { + // Two callers awaiting the same promise both see the rejection — neither leaks an + // unhandled rejection (the constructor pre-attaches a noop catch handler). + const prover = makeProver(); + const a = prover.whenBlockProofsReady(); + const b = prover.whenBlockProofsReady(); + prover.cancel(); + await Promise.all([expect(a).rejects.toThrow(/cancelled/), expect(b).rejects.toThrow(/cancelled/)]); + await prover.whenDone(); + }); + + it('cancel after a gather-failure rejection is still idempotent', async () => { + const missingHash = checkpoint.blocks[0].body.txEffects[0]?.txHash; + if (!missingHash) { + return; + } + const failure = promiseWithResolvers<{ txs: Tx[]; missingTxs: (typeof missingHash)[] }>(); + txProvider.getTxsForBlock.mockReset(); + txProvider.getTxsForBlock.mockReturnValue(failure.promise); + + const prover = makeProver(); + failure.resolve({ txs: [], missingTxs: [missingHash] }); + await expect(prover.whenBlockProofsReady()).rejects.toThrow(/Txs not found/); + // Subsequent cancel is a no-op; no throws. 
+ prover.cancel(); + expect(prover.isCancelled()).toBe(true); + await prover.whenDone(); + }); + }); + + // ---------------- helpers ---------------- + + function makeProver(overrides: Partial = {}): CheckpointProver { + const args: CheckpointProverArgs = { + checkpoint, + epochNumber: EpochNumber(5), + attestations: [], + previousBlockHeader: {} as BlockHeader, + l1ToL2Messages: [], + previousArchiveSiblingPath: makeTuple(ARCHIVE_HEIGHT, () => Fr.ZERO), + ...overrides, + }; + return new CheckpointProver(args, deps); + } + + async function cleanup(prover: CheckpointProver): Promise { + if (!prover.isCancelled()) { + prover.cancel({ routine: true }); + } + await prover.whenDone(); + } +}); + +/** Minimal Histogram/Gauge/Counter stub: only the methods ProverNodeJobMetrics records into. */ +function noopMetric() { + return { record: () => {}, add: () => {} }; +} diff --git a/yarn-project/prover-node/src/job/checkpoint-prover.ts b/yarn-project/prover-node/src/job/checkpoint-prover.ts new file mode 100644 index 000000000000..8c1d7859f69f --- /dev/null +++ b/yarn-project/prover-node/src/job/checkpoint-prover.ts @@ -0,0 +1,465 @@ +import { type ARCHIVE_HEIGHT, NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP } from '@aztec/constants'; +import { BlockNumber, type EpochNumber, type SlotNumber } from '@aztec/foundation/branded-types'; +import { padArrayEnd } from '@aztec/foundation/collection'; +import { Fr } from '@aztec/foundation/curves/bn254'; +import type { EthAddress } from '@aztec/foundation/eth-address'; +import type { Logger } from '@aztec/foundation/log'; +import { type PromiseWithResolvers, promiseWithResolvers } from '@aztec/foundation/promise'; +import type { Tuple } from '@aztec/foundation/serialize'; +import { type DateProvider, Timer } from '@aztec/foundation/timer'; +import { getVKTreeRoot } from '@aztec/noir-protocol-circuits-types/vk-tree'; +import { protocolContractsHash } from '@aztec/protocol-contracts'; +import type { EpochProverFactory } from 
'@aztec/prover-client'; +import type { CheckpointSubTreeOrchestrator, ChonkCache, SubTreeResult } from '@aztec/prover-client/orchestrator'; +import type { PublicProcessor, PublicProcessorFactory } from '@aztec/simulator/server'; +import { PublicSimulatorConfig } from '@aztec/stdlib/avm'; +import type { CommitteeAttestation, L2Block } from '@aztec/stdlib/block'; +import type { Checkpoint } from '@aztec/stdlib/checkpoint'; +import type { ForkMerkleTreeOperations, ITxProvider } from '@aztec/stdlib/interfaces/server'; +import { CheckpointConstantData } from '@aztec/stdlib/rollup'; +import { MerkleTreeId } from '@aztec/stdlib/trees'; +import type { BlockHeader, ProcessedTx, Tx } from '@aztec/stdlib/tx'; + +import type { ProverNodeJobMetrics } from '../metrics.js'; + +/** Dependencies a `CheckpointProver` needs at construction. */ +export type CheckpointProverDeps = { + proverFactory: EpochProverFactory; + /** Shared chonk-verifier cache. Survives across all sessions / epochs. */ + chonkCache: ChonkCache; + publicProcessorFactory: PublicProcessorFactory; + dbProvider: Pick; + txProvider: ITxProvider; + /** Clock the prover-node operates against — e2e fixtures inject a cheat-controlled one. */ + dateProvider: DateProvider; + proverId: EthAddress; + metrics: ProverNodeJobMetrics; + /** Tx gathering deadline. */ + txGatheringTimeoutMs: number; + /** Public processor deadline. */ + deadline: Date | undefined; + log: Logger; +}; + +/** Inputs that fully describe a checkpoint at register time. */ +export type CheckpointProverArgs = { + checkpoint: Checkpoint; + /** Epoch the checkpoint belongs to (derivable from slot + L1 constants; cached at register time). */ + epochNumber: EpochNumber; + attestations: CommitteeAttestation[]; + previousBlockHeader: BlockHeader; + l1ToL2Messages: Fr[]; + previousArchiveSiblingPath: Tuple; +}; + +/** + * Self-contained per-checkpoint prover, content-addressed by + * `(checkpoint number, slot number, checkpoint archive root)`. 
/**
 * Self-contained per-checkpoint prover, content-addressed by
 * `(checkpoint number, slot number, checkpoint archive root)`.
 *
 * The store creates a CheckpointProver once per content-key. Keying on the checkpoint's
 * own archive root (its post-state) means two checkpoints are "the same" iff they
 * produce the same archive — so a reorg branch, or a replacement built on the same
 * predecessor but with different content, keys to a distinct prover; an identical
 * re-add keys to the same one and reuses its in-flight sub-tree work.
 *
 * The prover eagerly starts its own tx gather and sub-tree work in the constructor, so
 * callers only need to call `whenBlockProofsReady()` to obtain the resulting block-rollup
 * proofs.
 *
 * The prover survives prune/re-add cycles via `markPruned()` / `markCanonical()` —
 * sub-tree proving keeps running underneath, so a checkpoint that is re-added after
 * a brief reorg can be re-consumed with no re-proving.
 *
 * `cancel()` is idempotent. It aborts the gather + sub-tree, rejects the block-proof
 * promise, and exposes a `whenDone()` that resolves once teardown has unwound.
 *
 * Failure reporting: when gathering or block processing throws, the block-proof promise
 * is rejected with that original error (unless a cancel already rejected it), so
 * consumers of `whenBlockProofsReady()` see the real cause.
 */
export class CheckpointProver {
  readonly id: string;
  readonly checkpoint: Checkpoint;
  readonly epochNumber: EpochNumber;
  readonly slotNumber: SlotNumber;
  readonly attestations: CommitteeAttestation[];
  readonly previousBlockHeader: BlockHeader;
  readonly l1ToL2Messages: Fr[];
  readonly previousArchiveSiblingPath: Tuple<Fr, typeof ARCHIVE_HEIGHT>;

  /** Per-prover tx map — populated by the internal gather. Empty until then. */
  readonly txs: Map<string, Tx> = new Map();

  /** Resolved by the sub-tree on success, rejected on cancel/failure. */
  private readonly blockProofs: PromiseWithResolvers<SubTreeResult['blockProofOutputs']> = promiseWithResolvers();

  private cancelled = false;
  private subTree?: CheckpointSubTreeOrchestrator;
  private completed = false;
  /** Pruned in the canonical chain but not yet reaped — sub-tree continues running. */
  private pruned = false;
  private readonly abortController = new AbortController();

  /** Tracks the eager gather+execute task so `cancel()` and `whenDone()` can await its unwind. */
  private readonly runPromise: Promise<void>;
  /** Tracks the cancel-driven teardown so `whenDone()` can await it. */
  private cancelPromise?: Promise<void>;

  /**
   * Copies the checkpoint description onto the instance, derives the content-addressed id,
   * and immediately starts the gather + sub-tree pipeline in the background.
   *
   * @param args - Inputs describing the checkpoint to prove.
   * @param deps - Collaborators (factories, providers, metrics, logger) used while proving.
   */
  constructor(
    args: CheckpointProverArgs,
    private readonly deps: CheckpointProverDeps,
  ) {
    this.checkpoint = args.checkpoint;
    this.epochNumber = args.epochNumber;
    this.slotNumber = args.checkpoint.header.slotNumber;
    this.attestations = args.attestations;
    this.previousBlockHeader = args.previousBlockHeader;
    this.l1ToL2Messages = args.l1ToL2Messages;
    this.previousArchiveSiblingPath = args.previousArchiveSiblingPath;
    this.id = CheckpointProver.idFor(args.checkpoint);
    // Mark blockProofs as observed so a cancel that lands before any consumer awaits
    // does not surface as an unhandled rejection.
    this.blockProofs.promise.catch(() => {});
    deps.log.info(`Created CheckpointProver ${this.id}`, {
      checkpointNumber: this.checkpoint.number,
      epochNumber: this.epochNumber,
      slotNumber: this.slotNumber,
      blockCount: this.checkpoint.blocks.length,
      l1ToL2MessageCount: this.l1ToL2Messages.length,
      archiveRoot: this.checkpoint.archive.root.toString(),
    });
    // Kick off the eager gather + sub-tree pipeline.
    this.runPromise = this.gatherAndExecute();
  }

  /**
   * Stable content-addressed identifier: `${checkpoint number}:${slot}:${archive root}`.
   * The archive root is the checkpoint's post-state, so it distinguishes any two
   * checkpoints that differ in history or content while collapsing identical re-adds.
   */
  public static idFor(checkpoint: Checkpoint): string {
    return `${checkpoint.number}:${checkpoint.header.slotNumber}:${checkpoint.archive.root.toString()}`;
  }

  /** True once `cancel()` has been called. */
  public isCancelled(): boolean {
    return this.cancelled;
  }

  /** True once block-level proving has been fully *enqueued* (sub-tree completion may still be pending). */
  public isCompleted(): boolean {
    return this.completed;
  }

  /** True while the prover is flagged as pruned via `markPruned()`. */
  public isPruned(): boolean {
    return this.pruned;
  }

  /**
   * Mark this prover as no longer present in the canonical chain. Sub-tree proving keeps
   * running so the work survives if the checkpoint is re-added. Idempotent.
   */
  public markPruned(): void {
    if (this.pruned) {
      return;
    }
    this.pruned = true;
    this.deps.log.info(`Marking CheckpointProver ${this.id} as pruned`, {
      checkpointNumber: this.checkpoint.number,
      slotNumber: this.slotNumber,
    });
  }

  /** Mark this prover as part of the canonical chain again after a re-add. Idempotent. */
  public markCanonical(): void {
    if (!this.pruned) {
      return;
    }
    this.pruned = false;
    this.deps.log.info(`Marking CheckpointProver ${this.id} as canonical`, {
      checkpointNumber: this.checkpoint.number,
      slotNumber: this.slotNumber,
    });
  }

  /** AbortSignal that fires on cancel — for callers that want to wire their own tasks. */
  public getAbortSignal(): AbortSignal {
    return this.abortController.signal;
  }

  /** Promise that resolves with the block-rollup proofs for this checkpoint (or rejects on cancel/failure). */
  public whenBlockProofsReady(): Promise<SubTreeResult['blockProofOutputs']> {
    return this.blockProofs.promise;
  }

  /** Resolves when all in-flight work for this prover has fully unwound. */
  public async whenDone(): Promise<void> {
    await this.runPromise.catch(() => {});
    if (this.cancelPromise) {
      await this.cancelPromise;
    }
  }

  /**
   * Background pipeline started by the constructor: gather txs, then execute the checkpoint.
   * Never throws — failures are logged and forwarded to the block-proof promise; failures
   * observed after a cancel are only logged at debug level.
   */
  private async gatherAndExecute(): Promise<void> {
    try {
      const txs = await this.gatherTxs();
      if (this.cancelled) {
        return;
      }
      await this.executeCheckpoint(txs);
    } catch (err) {
      if (this.cancelled) {
        this.deps.log.debug(`CheckpointProver ${this.id} cancelled during gather/execute`, {
          checkpointNumber: this.checkpoint.number,
        });
        return;
      }
      this.deps.log.error(`Error in CheckpointProver ${this.id}`, err, {
        checkpointNumber: this.checkpoint.number,
      });
      // No-op if executeCheckpoint already rejected with this error; covers gather failures.
      this.blockProofs.reject(err instanceof Error ? err : new Error(String(err)));
    }
  }

  /**
   * Fetches the txs for every block of the checkpoint, bounded by `txGatheringTimeoutMs`
   * from the injected clock.
   *
   * @returns Map from tx-hash string to tx.
   * @throws Error listing the missing hashes when the provider reports any tx as missing.
   */
  private async gatherTxs(): Promise<Map<string, Tx>> {
    const deadline = new Date(this.deps.dateProvider.now() + this.deps.txGatheringTimeoutMs);
    const txsByBlock = await Promise.all(
      this.checkpoint.blocks.map(block => this.deps.txProvider.getTxsForBlock(block, { deadline })),
    );
    const txs = txsByBlock.map(({ txs }) => txs).flat();
    const missingTxs = txsByBlock.map(({ missingTxs }) => missingTxs).flat();

    if (missingTxs.length > 0) {
      throw new Error(
        `Txs not found for checkpoint ${this.checkpoint.number}: ${missingTxs.map(hash => hash.toString()).join(', ')}`,
      );
    }
    return new Map(txs.map(tx => [tx.getTxHash().toString(), tx]));
  }

  /**
   * Creates the sub-tree orchestrator and feeds it every block of the checkpoint:
   * start block, re-execute its txs on a fresh fork, add the processed txs, complete block.
   * Checks the abort signal after each awaited step and returns early when it has fired.
   *
   * On any exit that did not reach completion, the sub-tree (if created) is torn down and
   * the block-proof promise is rejected. When the exit was caused by a thrown error, that
   * error is used as the rejection reason before teardown begins.
   *
   * @param txs - Gathered txs keyed by tx-hash string.
   */
  private async executeCheckpoint(txs: Map<string, Tx>): Promise<void> {
    const signal = this.abortController.signal;
    const checkpointTimer = new Timer();
    let subTreeStarted = false;

    try {
      for (const [hash, tx] of txs) {
        this.txs.set(hash, tx);
      }

      const { chainId, version } = this.checkpoint.blocks[0].header.globalVariables;
      const checkpointConstants = CheckpointConstantData.from({
        chainId,
        version,
        vkTreeRoot: getVKTreeRoot(),
        protocolContractsHash: protocolContractsHash,
        proverId: this.deps.proverId.toField(),
        slotNumber: this.checkpoint.header.slotNumber,
        coinbase: this.checkpoint.header.coinbase,
        feeRecipient: this.checkpoint.header.feeRecipient,
        gasFees: this.checkpoint.header.gasFees,
      });

      this.deps.log.info(`Starting processing checkpoint ${this.checkpoint.number}`, {
        checkpointNumber: this.checkpoint.number,
        checkpointHash: this.checkpoint.hash().toString(),
        blockCount: this.checkpoint.blocks.length,
      });

      this.subTree = await this.deps.proverFactory.createCheckpointSubTreeOrchestrator(
        this.deps.chonkCache,
        this.epochNumber,
        checkpointConstants,
        this.l1ToL2Messages,
        this.checkpoint.blocks.length,
        this.previousBlockHeader,
      );
      subTreeStarted = true;
      // Bridge the sub-tree's result onto blockProofs.
      void this.subTree.getSubTreeResult().then(
        result => {
          this.deps.log.info(`Sub-tree block proofs ready for checkpoint ${this.checkpoint.number}`, {
            checkpointNumber: this.checkpoint.number,
            blockProofCount: result.blockProofOutputs.length,
          });
          this.blockProofs.resolve(result.blockProofOutputs);
        },
        err => this.blockProofs.reject(err),
      );
      if (signal.aborted) {
        return;
      }

      const allTxs = this.checkpoint.blocks.flatMap(block =>
        block.body.txEffects.map(txEffect => txs.get(txEffect.txHash.toString())!),
      );
      const publicTxs = allTxs.filter(tx => tx?.data.forPublic);
      if (publicTxs.length > 0) {
        await this.subTree.startChonkVerifierCircuits(publicTxs);
        if (signal.aborted) {
          return;
        }
      }

      for (let blockIndex = 0; blockIndex < this.checkpoint.blocks.length; blockIndex++) {
        const blockTimer = new Timer();
        const block = this.checkpoint.blocks[blockIndex];
        const globalVariables = block.header.globalVariables;
        const blockTxs = this.getTxsForBlock(block, txs);

        await this.subTree.startNewBlock(block.number, globalVariables.timestamp, blockTxs.length);
        if (signal.aborted) {
          return;
        }

        // L1-to-L2 messages are only seeded into the fork for the first block of the checkpoint.
        const db = await this.createFork(
          BlockNumber(block.number - 1),
          blockIndex === 0 ? this.l1ToL2Messages : undefined,
        );
        try {
          if (signal.aborted) {
            return;
          }
          const config = PublicSimulatorConfig.from({
            proverId: this.deps.proverId.toField(),
            skipFeeEnforcement: false,
            collectDebugLogs: false,
            collectHints: true,
            collectPublicInputs: true,
            collectStatistics: false,
          });
          const publicProcessor = this.deps.publicProcessorFactory.create(db, globalVariables, config);
          const processed = await this.processTxs(publicProcessor, blockTxs);
          if (signal.aborted) {
            return;
          }
          await this.subTree.addTxs(processed);
        } finally {
          await db.close();
        }
        if (signal.aborted) {
          return;
        }

        await this.subTree.setBlockCompleted(block.number, block.header);
        this.deps.metrics.recordBlockProcessing(blockTimer.ms());
        if (signal.aborted) {
          return;
        }
      }

      this.completed = true;
      this.deps.metrics.recordCheckpointProcessing(checkpointTimer.ms());
      this.deps.log.info(
        `Finished enqueueing block-level proving for checkpoint ${this.checkpoint.number} in ${checkpointTimer.ms()}ms`,
        {
          checkpointNumber: this.checkpoint.number,
          blockCount: this.checkpoint.blocks.length,
          durationMs: checkpointTimer.ms(),
        },
      );
    } catch (err) {
      // Reject with the real cause *before* the finally block runs: promise settlement is
      // first-wins, so otherwise the generic "did not complete" error (or whatever the
      // sub-tree rejects with while stopping) would mask the actual failure for consumers.
      // No-op when a cancel has already rejected blockProofs.
      this.blockProofs.reject(err instanceof Error ? err : new Error(String(err)));
      throw err;
    } finally {
      if (!this.completed) {
        if (subTreeStarted) {
          await this.teardownSubTree();
        }
        // Fallback for early returns on abort; ignored if blockProofs is already settled.
        this.blockProofs.reject(new Error(`Checkpoint ${this.id} did not complete block processing`));
      }
    }
  }

  /**
   * Mark cancelled. Idempotent. Aborts in-flight work, rejects the block-proof promise,
   * and kicks off a background teardown of the sub-tree. The teardown promise is exposed
   * via `whenDone()`.
   *
   * `routine` distinguishes a post-finalize teardown (sub-tree already proven, fires
   * once at prover exit) from a real abort (reorg, prune, deadline). Behaviour is
   * identical either way; the flag only adjusts log verbosity.
   */
  public cancel({ routine = false }: { routine?: boolean } = {}): void {
    if (this.cancelled) {
      return;
    }
    this.cancelled = true;
    // A teardown of a completed prover is routine regardless of the caller's flag —
    // we logged the work as done already, so don't relabel it as a mid-flight cancel.
    if (routine || this.completed) {
      this.deps.log.verbose(`Tearing down CheckpointProver ${this.id}`, {
        checkpointNumber: this.checkpoint.number,
        wasCompleted: this.completed,
      });
    } else {
      this.deps.log.info(`Cancelling in-flight CheckpointProver ${this.id}`, {
        checkpointNumber: this.checkpoint.number,
        wasCompleted: this.completed,
      });
    }
    this.abortController.abort();
    this.blockProofs.reject(new Error(`Checkpoint ${this.id} cancelled`));
    this.cancelPromise = this.runCancel().catch(() => {});
  }

  /**
   * Cancel-driven teardown: cancels the sub-tree (if one exists yet), waits for the
   * gather/execute pipeline to unwind, then stops whatever sub-tree is still attached.
   */
  private async runCancel(): Promise<void> {
    if (this.subTree) {
      try {
        this.subTree.cancel();
      } catch (err) {
        this.deps.log.error('Error cancelling sub-tree', err);
      }
    }
    await this.runPromise.catch(() => {});
    if (this.subTree) {
      await this.teardownSubTree();
    }
  }

  /**
   * Detaches and stops the sub-tree. The field is cleared before awaiting `stop()` so a
   * second caller finds nothing to tear down. Stop errors are logged, not thrown.
   */
  private async teardownSubTree(): Promise<void> {
    const { subTree } = this;
    this.subTree = undefined;
    if (subTree) {
      this.deps.log.debug(`Tearing down sub-tree for checkpoint ${this.checkpoint.number}`, {
        checkpointNumber: this.checkpoint.number,
      });
      try {
        await subTree.stop();
      } catch (err) {
        this.deps.log.error('Error stopping sub-tree', err);
      }
    }
  }

  /** Returns the block's txs in the order of its tx effects, looked up by hash in `txs`. */
  private getTxsForBlock(block: L2Block, txs: Map<string, Tx>): Tx[] {
    return block.body.txEffects.map(txEffect => txs.get(txEffect.txHash.toString())!);
  }

  /**
   * Runs the public processor over the block's txs.
   *
   * @returns The processed txs, one per input tx.
   * @throws Error if any tx failed processing or if fewer txs were processed than supplied.
   */
  private async processTxs(publicProcessor: PublicProcessor, txs: Tx[]): Promise<ProcessedTx[]> {
    // NOTE(review): only the deadline is forwarded here; no abort signal is passed, so a
    // cancel presumably waits for the in-flight `process` call to return — confirm intended.
    const [processedTxs, failedTxs] = await publicProcessor.process(txs, { deadline: this.deps.deadline });

    if (failedTxs.length) {
      const failedTxHashes = await Promise.all(failedTxs.map(({ tx }) => tx.getTxHash()));
      throw new Error(
        `Txs failed processing: ${failedTxs
          .map(({ error }, index) => `${failedTxHashes[index]} (${error})`)
          .join(', ')}`,
      );
    }

    if (processedTxs.length !== txs.length) {
      throw new Error(`Failed to process all txs: processed ${processedTxs.length} out of ${txs.length}`);
    }

    return processedTxs;
  }

  /**
   * Forks the world state at `blockNumber` and, when `l1ToL2Messages` is provided, appends
   * them (zero-padded to `NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP`) to the L1-to-L2 message tree.
   *
   * Ownership of the fork passes to the caller only on success. If padding or appending
   * throws, the fork is closed here before the error is rethrown, because the caller's
   * `finally { db.close() }` is not yet in effect at that point.
   *
   * @returns The open fork; the caller must close it.
   * @throws Whatever `padArrayEnd` or `appendLeaves` throws (e.g. too many messages).
   */
  private async createFork(blockNumber: BlockNumber, l1ToL2Messages: Fr[] | undefined) {
    const db = await this.deps.dbProvider.fork(blockNumber);

    if (l1ToL2Messages !== undefined) {
      try {
        const l1ToL2MessagesPadded = padArrayEnd(
          l1ToL2Messages,
          Fr.ZERO,
          NUMBER_OF_L1_L2_MESSAGES_PER_ROLLUP,
          'Too many L1 to L2 messages',
        );
        await db.appendLeaves(MerkleTreeId.L1_TO_L2_MESSAGE_TREE, l1ToL2MessagesPadded);
      } catch (err) {
        // Best-effort close so the original failure is the one that propagates.
        try {
          await db.close();
        } catch (closeErr) {
          this.deps.log.error('Error closing fork after failed L1 to L2 message insertion', closeErr);
        }
        throw err;
      }
    }

    return db;
  }
}
'@aztec/stdlib/proofs'; -import { RootRollupPublicInputs } from '@aztec/stdlib/rollup'; -import { MerkleTreeId } from '@aztec/stdlib/trees'; -import type { ProcessedTx, Tx } from '@aztec/stdlib/tx'; -import { BlockHeader } from '@aztec/stdlib/tx'; -import { getTelemetryClient } from '@aztec/telemetry-client'; - -import { type MockProxy, mock } from 'jest-mock-extended'; - -import { ProverNodeJobMetrics } from '../metrics.js'; -import type { ProverNodePublisher } from '../prover-node-publisher.js'; -import type { EpochProvingJobData } from './epoch-proving-job-data.js'; -import { EpochProvingJob } from './epoch-proving-job.js'; - -describe('epoch-proving-job', () => { - const mockFork = () => { - const fork = mock(); - fork[Symbol.asyncDispose].mockImplementation(() => fork.close()); - return fork; - }; - - // Dependencies - let prover: MockProxy; - let publisher: MockProxy; - let l2BlockSource: MockProxy; - let worldState: MockProxy; - let publicProcessorFactory: MockProxy; - let metrics: ProverNodeJobMetrics; - - // Created by a dependency - let db: MockProxy; - let publicProcessor: MockProxy; - - // Objects - let publicInputs: RootRollupPublicInputs; - let proof: Proof; - let batchedBlobInputs: BatchedBlob; - let checkpoints: Checkpoint[]; - let txs: Tx[]; - let initialHeader: BlockHeader; - let epochNumber: number; - let attestations: CommitteeAttestation[]; - - // Constants - const NUM_CHECKPOINTS = 3; - const BLOCKS_PER_CHECKPOINT = 2; - const TXS_PER_BLOCK = 2; - const NUM_BLOCKS = NUM_CHECKPOINTS * BLOCKS_PER_CHECKPOINT; - const proverId = EthAddress.random(); - - // Subject factory - const createJob = (opts: { deadline?: Date; parallelBlockLimit?: number; skipSubmitProof?: boolean } = {}) => { - const txsMap = new Map(txs.map(tx => [tx.getTxHash().toString(), tx])); - - const data: EpochProvingJobData = { - checkpoints, - txs: txsMap, - epochNumber: EpochNumber(epochNumber), - l1ToL2Messages: fromEntries(checkpoints.map(c => [c.number, []])), - 
previousBlockHeader: initialHeader, - attestations, - }; - return new EpochProvingJob( - data, - worldState, - prover, - publicProcessorFactory, - publisher, - l2BlockSource, - metrics, - opts.deadline, - { parallelBlockLimit: opts.parallelBlockLimit ?? 32, skipSubmitProof: opts.skipSubmitProof }, - ); - }; - - beforeEach(async () => { - prover = mock(); - publisher = mock(); - l2BlockSource = mock(); - worldState = mock(); - publicProcessorFactory = mock(); - db = mockFork(); - publicProcessor = mock(); - metrics = new ProverNodeJobMetrics( - getTelemetryClient().getMeter('EpochProvingJob'), - getTelemetryClient().getTracer('EpochProvingJob'), - ); - - publicInputs = RootRollupPublicInputs.random(); - proof = Proof.empty(); - batchedBlobInputs = new BatchedBlob( - publicInputs.blobPublicInputs.blobCommitmentsHash, - publicInputs.blobPublicInputs.z, - publicInputs.blobPublicInputs.y, - publicInputs.blobPublicInputs.c, - publicInputs.blobPublicInputs.c.negate(), - ); - epochNumber = 1; - initialHeader = BlockHeader.empty(); - checkpoints = await timesParallel(NUM_CHECKPOINTS, i => - Checkpoint.random(CheckpointNumber(i + 1), { - numBlocks: BLOCKS_PER_CHECKPOINT, - startBlockNumber: i * BLOCKS_PER_CHECKPOINT + 1, - txsPerBlock: TXS_PER_BLOCK, - }), - ); - attestations = times(3, CommitteeAttestation.random); - - const txHashes = checkpoints.map(c => c.blocks.map(b => b.body.txEffects.map(tx => tx.txHash))).flat(2); - txs = txHashes.map(txHash => ({ txHash, getTxHash: () => txHash }) as Tx); - - l2BlockSource.getBlockData.mockResolvedValue({ header: initialHeader } as any); - l2BlockSource.getL1Constants.mockResolvedValue({ ethereumSlotDuration: 0.1 } as L1RollupConstants); - l2BlockSource.getBlocksData.mockResolvedValue( - checkpoints.map(c => c.blocks.map(b => ({ header: b.header }) as any)).flat(), - ); - l2BlockSource.getCheckpoints.mockResolvedValue([ - { checkpoint: checkpoints.at(-1)!, attestations } as PublishedCheckpoint, - ]); - 
publicProcessorFactory.create.mockReturnValue(publicProcessor); - db.getInitialHeader.mockReturnValue(initialHeader); - worldState.fork.mockResolvedValue(db); - prover.getProverId.mockReturnValue(proverId); - prover.startNewBlock.mockImplementation(() => sleep(200)); - prover.finalizeEpoch.mockResolvedValue({ publicInputs, proof, batchedBlobInputs }); - publisher.submitEpochProof.mockResolvedValue(true); - publicProcessor.process.mockImplementation(async txs => { - const txsArray = await toArray(txs); - const processedTxs = await Promise.all(txsArray.map(tx => mock({ hash: tx.getTxHash() }))); - return [processedTxs, [], txsArray, [], []]; - }); - }); - - it('works', async () => { - const job = createJob(); - await job.run(); - - expect(job.getState()).toEqual('completed'); - expect(db.close).toHaveBeenCalledTimes(NUM_BLOCKS); - expect(publicProcessor.process).toHaveBeenCalledTimes(NUM_BLOCKS); - expect(publicProcessorFactory.create).toHaveBeenCalledTimes(NUM_BLOCKS); - expect(publicProcessorFactory.create.mock.calls.map(call => /* config */ call[2])).toEqual( - new Array(NUM_BLOCKS).fill( - PublicSimulatorConfig.from({ - proverId: proverId.toField(), - collectHints: true, - collectPublicInputs: true, - }), - ), - ); - expect(publisher.submitEpochProof).toHaveBeenCalledWith( - expect.objectContaining({ epochNumber, proof, publicInputs, attestations: attestations.map(a => a.toViem()) }), - ); - }); - - it('sorts txs based on block body', async () => { - txs.reverse(); - - const job = createJob(); - await job.run(); - - expect(job.getState()).toEqual('completed'); - expect(publicProcessor.process).toHaveBeenCalledTimes(NUM_BLOCKS); - - const firstBlockProcessedTxs = publicProcessor.process.mock.calls[0][0] as Tx[]; - expect(firstBlockProcessedTxs.map(tx => tx.txHash.toString())).toEqual( - checkpoints[0].blocks[0].body.txEffects.map(tx => tx.txHash.toString()), - ); - }); - - it('fails if fails to process txs for a block', async () => { - 
publicProcessor.process.mockImplementation(async txs => { - const txsArray = await toArray(txs); - const errors = txsArray.map(tx => ({ error: new Error('Failed to process tx'), tx })); - return [[], errors, [], [], []]; - }); - - const job = createJob(); - await job.run(); - - expect(job.getState()).toEqual('failed'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('fails if does not process all txs for a block', async () => { - publicProcessor.process.mockImplementation(_txs => Promise.resolve([[], [], [], [], []])); - - const job = createJob(); - await job.run(); - - expect(job.getState()).toEqual('failed'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('waits for in-flight checkpoint processing to settle after a block processing failure', async () => { - const forkDbs = times(NUM_BLOCKS, () => mockFork()); - let nextFork = 0; - worldState.fork.mockImplementation(() => Promise.resolve(forkDbs[nextFork++])); - prover.startNewBlock.mockResolvedValue(undefined); - - let processCalls = 0; - let resolveSecondProcessStarted!: () => void; - const secondProcessStarted = new Promise(resolve => { - resolveSecondProcessStarted = resolve; - }); - let releaseSecondProcess!: () => void; - const secondProcessMayFinish = new Promise(resolve => { - releaseSecondProcess = resolve; - }); - - publicProcessorFactory.create.mockImplementation(() => { - const processor = mock(); - processor.process.mockImplementation(async txs => { - const txsArray = await toArray(txs); - processCalls++; - - if (processCalls === 1) { - await secondProcessStarted; - throw new Error('Failed to process tx'); - } - - if (processCalls === 2) { - resolveSecondProcessStarted(); - await secondProcessMayFinish; - } - - const processedTxs = await Promise.all(txsArray.map(tx => mock({ hash: tx.getTxHash() }))); - return [processedTxs, [], txsArray, [], []]; - }); - return processor; - }); - - const job = createJob({ parallelBlockLimit: 2 }); - const 
runPromise = job.run(); - - await secondProcessStarted; - const runResolvedBeforeSecondProcessFinished = await Promise.race([ - runPromise.then(() => true), - sleep(50).then(() => false), - ]); - - releaseSecondProcess(); - await runPromise; - - expect(runResolvedBeforeSecondProcessFinished).toBe(false); - expect(job.getState()).toEqual('failed'); - expect(forkDbs[1].close).toHaveBeenCalled(); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('times out if deadline is hit', async () => { - prover.startNewBlock.mockImplementation(() => sleep(200)); - const deadline = new Date(Date.now() + 100); - const job = createJob({ deadline }); - await job.run(); - - expect(job.getState()).toEqual('timed-out'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('halts if stopped externally', async () => { - const job = createJob(); - void job.run(); - await sleep(100); - await job.stop(); - - expect(job.getState()).toEqual('stopped'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('aborts public processing when stopped externally', async () => { - prover.startNewBlock.mockResolvedValue(undefined); - - let processStarted!: () => void; - const processStartedPromise = new Promise(resolve => { - processStarted = resolve; - }); - let abortSignal: AbortSignal | undefined; - - publicProcessor.process.mockImplementation(async (txs, opts) => { - const signal = opts?.signal; - if (!signal) { - throw new Error('Expected public processor abort signal'); - } - abortSignal = signal; - processStarted(); - await new Promise(resolve => signal.addEventListener('abort', () => resolve(), { once: true })); - - const txsArray = await toArray(txs); - const processedTxs = await Promise.all(txsArray.map(tx => mock({ hash: tx.getTxHash() }))); - return [processedTxs, [], txsArray, [], []]; - }); - - const job = createJob({ parallelBlockLimit: 1 }); - const runPromise = job.run(); - - await processStartedPromise; - await 
job.stop(); - await runPromise; - - expect(abortSignal?.aborted).toBe(true); - expect(job.getState()).toEqual('stopped'); - expect(prover.addTxs).not.toHaveBeenCalled(); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - }); - - it('halts if a new block for the epoch is found', async () => { - const newHeaders = times(NUM_BLOCKS + 1, i => BlockHeader.random({ blockNumber: BlockNumber(i + 1) })); - l2BlockSource.getBlocksData.mockResolvedValue(newHeaders.map(h => ({ header: h }) as any)); - - const job = createJob(); - await job.run(); - - expect(job.getState()).toEqual('reorg'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - expect(prover.cancel).toHaveBeenCalled(); - }); - - it('analyzes estimated fees and does not publish when skipSubmitProof is enabled', async () => { - publisher.analyzeEpochProofSubmission.mockResolvedValue(undefined); - - const job = createJob({ skipSubmitProof: true }); - await job.run(); - - expect(job.getState()).toEqual('completed'); - expect(prover.finalizeEpoch).toHaveBeenCalled(); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - expect(publisher.analyzeEpochProofSubmission).toHaveBeenCalledWith( - expect.objectContaining({ epochNumber, proof, publicInputs, attestations: attestations.map(a => a.toViem()) }), - ); - }); - - it('completes successfully even if fee analysis fails when skipSubmitProof is enabled', async () => { - publisher.analyzeEpochProofSubmission.mockRejectedValue(new Error('fee analysis failed')); - - const job = createJob({ skipSubmitProof: true }); - await job.run(); - - expect(job.getState()).toEqual('completed'); - expect(publisher.submitEpochProof).not.toHaveBeenCalled(); - expect(publisher.analyzeEpochProofSubmission).toHaveBeenCalled(); - }); - - it('inserts L1 to L2 messages into the message tree only for the first block of each checkpoint', async () => { - const l1ToL2Messages: Record = fromEntries( - checkpoints.map(c => [c.number, [Fr.random(), Fr.random()]]), - ); - - 
const txsMap = new Map(txs.map(tx => [tx.getTxHash().toString(), tx])); - const data: EpochProvingJobData = { - checkpoints, - txs: txsMap, - epochNumber: EpochNumber(epochNumber), - l1ToL2Messages, - previousBlockHeader: initialHeader, - attestations, - }; - - const job = new EpochProvingJob( - data, - worldState, - prover, - publicProcessorFactory, - publisher, - l2BlockSource, - metrics, - undefined, - { parallelBlockLimit: 32 }, - ); - - await job.run(); - - expect(job.getState()).toEqual('completed'); - - // appendLeaves should be called once per checkpoint (for the first block only), not once per block - const appendLeavesCalls = db.appendLeaves.mock.calls.filter(call => call[0] === MerkleTreeId.L1_TO_L2_MESSAGE_TREE); - expect(appendLeavesCalls).toHaveLength(NUM_CHECKPOINTS); - expect(appendLeavesCalls).not.toHaveLength(NUM_BLOCKS); - }); -}); diff --git a/yarn-project/prover-node/src/job/epoch-proving-job.ts b/yarn-project/prover-node/src/job/epoch-proving-job.ts deleted file mode 100644 index 88b8fad06dda..000000000000 --- a/yarn-project/prover-node/src/job/epoch-proving-job.ts +++ /dev/null @@ -1,531 +0,0 @@ -import { asyncPool } from '@aztec/foundation/async-pool'; -import { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { Fr } from '@aztec/foundation/curves/bn254'; -import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; -import { RunningPromise, promiseWithResolvers } from '@aztec/foundation/promise'; -import { Timer } from '@aztec/foundation/timer'; -import { AVM_MAX_CONCURRENT_SIMULATIONS } from '@aztec/native'; -import { getVKTreeRoot } from '@aztec/noir-protocol-circuits-types/vk-tree'; -import { protocolContractsHash } from '@aztec/protocol-contracts'; -import { buildFinalBlobChallenges } from '@aztec/prover-client/helpers'; -import type { PublicProcessor, PublicProcessorFactory } from '@aztec/simulator/server'; -import { PublicSimulatorConfig } from '@aztec/stdlib/avm'; -import type { 
L2Block, L2BlockSource } from '@aztec/stdlib/block'; -import type { Checkpoint } from '@aztec/stdlib/checkpoint'; -import { - type EpochProver, - type EpochProvingJobState, - EpochProvingJobTerminalState, - type ForkMerkleTreeOperations, -} from '@aztec/stdlib/interfaces/server'; -import { appendL1ToL2MessagesToTree } from '@aztec/stdlib/messaging'; -import { CheckpointConstantData } from '@aztec/stdlib/rollup'; -import type { ProcessedTx, Tx } from '@aztec/stdlib/tx'; -import { Attributes, type Traceable, type Tracer, trackSpan } from '@aztec/telemetry-client'; - -import * as crypto from 'node:crypto'; - -import type { ProverNodeJobMetrics } from '../metrics.js'; -import type { ProverNodePublisher } from '../prover-node-publisher.js'; -import { type EpochProvingJobData, validateEpochProvingJobData } from './epoch-proving-job-data.js'; - -export type EpochProvingJobOptions = { - parallelBlockLimit?: number; - skipEpochCheck?: boolean; - skipSubmitProof?: boolean; -}; - -/** - * Job that grabs a range of blocks from the unfinalized chain from L1, gets their txs given their hashes, - * re-executes their public calls, generates a rollup proof, and submits it to L1. This job will update the - * world state as part of public call execution via the public processor. 
- */ -export class EpochProvingJob implements Traceable { - private state: EpochProvingJobState = 'initialized'; - private log: Logger; - private uuid: string; - - private runPromise: Promise | undefined; - private abortController = new AbortController(); - private epochCheckPromise: RunningPromise | undefined; - private deadlineTimeoutHandler: NodeJS.Timeout | undefined; - - public readonly tracer: Tracer; - - constructor( - private data: EpochProvingJobData, - private dbProvider: Pick, - private prover: EpochProver, - private publicProcessorFactory: PublicProcessorFactory, - private publisher: Pick, - private l2BlockSource: L2BlockSource | undefined, - private metrics: ProverNodeJobMetrics, - private deadline: Date | undefined, - private config: EpochProvingJobOptions, - bindings?: LoggerBindings, - ) { - validateEpochProvingJobData(data); - this.uuid = crypto.randomUUID(); - this.log = createLogger('prover-node:epoch-proving-job', { - ...bindings, - instanceId: `epoch-${data.epochNumber}`, - }); - this.tracer = metrics.tracer; - } - - public getId(): string { - return this.uuid; - } - - public getState(): EpochProvingJobState { - return this.state; - } - - public getEpochNumber(): EpochNumber { - return this.data.epochNumber; - } - - public getDeadline(): Date | undefined { - return this.deadline; - } - - public getProvingData(): EpochProvingJobData { - return this.data; - } - - private get epochNumber() { - return this.data.epochNumber; - } - - private get checkpoints() { - return this.data.checkpoints; - } - - private get txs() { - return this.data.txs; - } - - private get attestations() { - return this.data.attestations; - } - - /** - * Proves the given epoch and submits the proof to L1. 
- */ - @trackSpan('EpochProvingJob.run', function () { - return { [Attributes.EPOCH_NUMBER]: this.data.epochNumber }; - }) - public async run() { - this.scheduleDeadlineStop(); - if (!this.config.skipEpochCheck) { - await this.scheduleEpochCheck(); - } - - const attestations = this.attestations.map(attestation => attestation.toViem()); - const epochNumber = this.epochNumber; - const epochSizeCheckpoints = this.checkpoints.length; - const epochSizeBlocks = this.checkpoints.reduce((accum, checkpoint) => accum + checkpoint.blocks.length, 0); - const epochSizeTxs = this.checkpoints.reduce( - (accum, checkpoint) => - accum + checkpoint.blocks.reduce((accumC, block) => accumC + block.body.txEffects.length, 0), - 0, - ); - const fromCheckpoint = this.checkpoints[0].number; - const toCheckpoint = this.checkpoints.at(-1)!.number; - const fromBlock = this.checkpoints[0].blocks[0].number; - const toBlock = this.checkpoints.at(-1)!.blocks.at(-1)!.number; - this.log.info(`Starting epoch ${epochNumber} proving job with checkpoints ${fromCheckpoint} to ${toCheckpoint}`, { - fromBlock, - toBlock, - epochSizeTxs, - epochNumber, - uuid: this.uuid, - }); - - this.progressState('processing'); - const timer = new Timer(); - const { promise, resolve } = promiseWithResolvers(); - this.runPromise = promise; - - try { - const blobTimer = new Timer(); - const blobFieldsPerCheckpoint = this.checkpoints.map(checkpoint => checkpoint.toBlobFields()); - const finalBlobBatchingChallenges = await buildFinalBlobChallenges(blobFieldsPerCheckpoint); - this.metrics.recordBlobProcessing(blobTimer.ms()); - - this.prover.startNewEpoch(epochNumber, epochSizeCheckpoints, finalBlobBatchingChallenges); - const chonkTimer = new Timer(); - await this.prover.startChonkVerifierCircuits(Array.from(this.txs.values())); - this.metrics.recordChonkVerifier(chonkTimer.ms()); - - // Everything in the epoch should have the same chainId and version. 
- const { chainId, version } = this.checkpoints[0].blocks[0].header.globalVariables; - - const previousBlockHeaders = this.gatherPreviousBlockHeaders(); - - const allCheckpointsTimer = new Timer(); - - const parallelism = this.config.parallelBlockLimit - ? this.config.parallelBlockLimit - : AVM_MAX_CONCURRENT_SIMULATIONS > 0 - ? AVM_MAX_CONCURRENT_SIMULATIONS - : this.checkpoints.length; - - await this.processCheckpoints(parallelism, async checkpoint => { - this.checkState(); - const checkpointTimer = new Timer(); - - const checkpointIndex = checkpoint.number - fromCheckpoint; - const checkpointConstants = CheckpointConstantData.from({ - chainId, - version, - vkTreeRoot: getVKTreeRoot(), - protocolContractsHash: protocolContractsHash, - proverId: this.prover.getProverId().toField(), - slotNumber: checkpoint.header.slotNumber, - coinbase: checkpoint.header.coinbase, - feeRecipient: checkpoint.header.feeRecipient, - gasFees: checkpoint.header.gasFees, - }); - const previousHeader = previousBlockHeaders[checkpointIndex]; - const l1ToL2Messages = this.getL1ToL2Messages(checkpoint); - - this.log.debug(`Starting processing checkpoint ${checkpoint.number}`, { - number: checkpoint.number, - checkpointHash: checkpoint.hash().toString(), - headerHash: checkpoint.header.hash().toString(), - numL1ToL2Messages: l1ToL2Messages.length, - previousBlockNumber: previousHeader.globalVariables.blockNumber, - uuid: this.uuid, - }); - - await this.prover.startNewCheckpoint( - checkpointIndex, - checkpointConstants, - l1ToL2Messages, - checkpoint.blocks.length, - previousHeader, - ); - - for (let blockIndex = 0; blockIndex < checkpoint.blocks.length; blockIndex++) { - const blockTimer = new Timer(); - const block = checkpoint.blocks[blockIndex]; - const globalVariables = block.header.globalVariables; - const txs = this.getTxs(block); - - this.log.verbose(`Starting processing block ${block.number}`, { - number: block.number, - blockHash: (await block.hash()).toString(), - lastArchive: 
block.header.lastArchive.root, - noteHashTreeRoot: block.header.state.partial.noteHashTree.root, - nullifierTreeRoot: block.header.state.partial.nullifierTree.root, - publicDataTreeRoot: block.header.state.partial.publicDataTree.root, - ...globalVariables, - numTxs: txs.length, - }); - - // Start block proving - await this.prover.startNewBlock(block.number, globalVariables.timestamp, txs.length); - - // Process public fns. L1 to L2 messages are only inserted for the first block of a checkpoint, - // as the fork for subsequent blocks already includes them from the previous block's synced state. - { - await using db = await this.createFork( - BlockNumber(block.number - 1), - blockIndex === 0 ? l1ToL2Messages : undefined, - ); - this.checkState(); - const config = PublicSimulatorConfig.from({ - proverId: this.prover.getProverId().toField(), - skipFeeEnforcement: false, - collectDebugLogs: false, - collectHints: true, - collectPublicInputs: true, - collectStatistics: false, - }); - const publicProcessor = this.publicProcessorFactory.create(db, globalVariables, config); - const processed = await this.processTxs(publicProcessor, txs); - this.checkState(); - await this.prover.addTxs(processed); - } - this.checkState(); - this.log.verbose(`Processed all ${txs.length} txs for block ${block.number}`, { - blockNumber: block.number, - blockHash: (await block.hash()).toString(), - uuid: this.uuid, - }); - - // Mark block as completed to pad it - const expectedBlockHeader = block.header; - await this.prover.setBlockCompleted(block.number, expectedBlockHeader); - this.metrics.recordBlockProcessing(blockTimer.ms()); - } - this.metrics.recordCheckpointProcessing(checkpointTimer.ms()); - }); - this.metrics.recordAllCheckpointsProcessing(allCheckpointsTimer.ms()); - - const executionTime = timer.ms(); - - this.progressState('awaiting-prover'); - const { publicInputs, proof, batchedBlobInputs } = await this.prover.finalizeEpoch(); - this.log.info(`Finalized proof for epoch 
${epochNumber}`, { epochNumber, uuid: this.uuid, duration: timer.ms() }); - - this.progressState('publishing-proof'); - - if (this.config.skipSubmitProof) { - this.log.info( - `Proof publishing is disabled. Analyzing estimated L1 fees for epoch ${epochNumber} (checkpoints ${fromCheckpoint} to ${toCheckpoint})`, - ); - try { - await this.publisher.analyzeEpochProofSubmission({ - fromCheckpoint, - toCheckpoint, - epochNumber, - publicInputs, - proof, - batchedBlobInputs, - attestations, - }); - } catch (err) { - this.log.warn(`Failed to analyze estimated L1 fees for epoch ${epochNumber}`, err); - } - this.state = 'completed'; - this.metrics.recordProvingJob(executionTime, timer.ms(), epochSizeCheckpoints, epochSizeBlocks, epochSizeTxs); - return; - } - - const success = await this.publisher.submitEpochProof({ - fromCheckpoint, - toCheckpoint, - epochNumber, - publicInputs, - proof, - batchedBlobInputs, - attestations, - }); - if (!success) { - throw new Error('Failed to submit epoch proof to L1'); - } - - this.log.info(`Submitted proof for epoch ${epochNumber} (checkpoints ${fromCheckpoint} to ${toCheckpoint})`, { - epochNumber, - uuid: this.uuid, - }); - this.state = 'completed'; - this.metrics.recordProvingJob(executionTime, timer.ms(), epochSizeCheckpoints, epochSizeBlocks, epochSizeTxs); - } catch (err: any) { - if (err && err.name === 'HaltExecutionError') { - this.log.warn(`Halted execution of epoch ${epochNumber} prover job`, { - uuid: this.uuid, - epochNumber, - details: err.message, - }); - return; - } - this.log.error(`Error running epoch ${epochNumber} prover job`, err, { uuid: this.uuid, epochNumber }); - if (this.state === 'processing' || this.state === 'awaiting-prover' || this.state === 'publishing-proof') { - this.state = 'failed'; - } - } finally { - clearTimeout(this.deadlineTimeoutHandler); - await this.epochCheckPromise?.stop(); - await this.prover.stop(); - resolve(); - } - } - - /** - * Create a new db fork for tx processing, optionally 
inserting L1 to L2 messages. - * L1 to L2 messages should only be inserted for the first block in a checkpoint, - * as subsequent blocks' synced state already includes them. - * REFACTOR: The prover already spawns a db fork of its own for each block, so we may be able to do away with just one fork. - */ - private async createFork(blockNumber: BlockNumber, l1ToL2Messages: Fr[] | undefined) { - this.log.verbose(`Creating fork at ${blockNumber}`, { blockNumber }); - // temporary stack to control fork lifetime - await using cleanup = new AsyncDisposableStack(); - const db = cleanup.use(await this.dbProvider.fork(blockNumber)); - - if (l1ToL2Messages !== undefined) { - this.log.verbose(`Inserting ${l1ToL2Messages.length} L1 to L2 messages in fork`, { - blockNumber, - l1ToL2Messages: l1ToL2Messages.map(m => m.toString()), - }); - await appendL1ToL2MessagesToTree(db, l1ToL2Messages); - } - - // everything run succesfully so we can release this stack and give control of the fork's lifetime to the caller - cleanup.move(); - return db; - } - - private async processCheckpoints( - parallelism: number, - processCheckpoint: (checkpoint: Checkpoint) => Promise, - ): Promise { - let hasError = false; - let firstError: unknown; - - await asyncPool(Math.max(parallelism, 1), this.checkpoints, async checkpoint => { - if (hasError || this.abortController.signal.aborted) { - return; - } - - try { - this.checkState(); - await processCheckpoint(checkpoint); - } catch (err) { - if (!hasError) { - hasError = true; - firstError = err; - this.failProcessing(); - } - } - }); - - if (hasError) { - throw firstError; - } - - if (this.abortController.signal.aborted) { - this.checkState(); - } - } - - private progressState(state: EpochProvingJobState) { - this.checkState(); - this.state = state; - } - - private checkState() { - if (this.state === 'timed-out' || this.state === 'stopped' || this.state === 'failed' || this.state === 'reorg') { - throw new HaltExecutionError(this.state); - } - } - - 
public async stop(state: EpochProvingJobTerminalState = 'stopped') { - this.state = state; - this.interruptProcessing(); - if (this.runPromise) { - await this.runPromise; - } - } - - private failProcessing() { - if (!EpochProvingJobTerminalState.includes(this.state)) { - this.state = 'failed'; - } - this.interruptProcessing(); - } - - private interruptProcessing() { - this.abortController.abort(); - this.prover.cancel(); - } - - private scheduleDeadlineStop() { - const deadline = this.deadline; - if (deadline) { - const timeout = deadline.getTime() - Date.now(); - if (timeout <= 0) { - throw new Error('Cannot start job with deadline in the past'); - } - - this.deadlineTimeoutHandler = setTimeout(() => { - if (EpochProvingJobTerminalState.includes(this.state)) { - return; - } - this.log.warn('Stopping job due to deadline hit', { uuid: this.uuid, epochNumber: this.epochNumber }); - this.stop('timed-out').catch(err => { - this.log.error('Error stopping job', err, { uuid: this.uuid, epochNumber: this.epochNumber }); - }); - }, timeout); - } - } - - /** - * Kicks off a running promise that queries the archiver for the set of L2 blocks of the current epoch. - * If those change, stops the proving job with a `rerun` state, so the node re-enqueues it. 
- */ - private async scheduleEpochCheck() { - const l2BlockSource = this.l2BlockSource; - if (!l2BlockSource) { - this.log.warn(`No L2 block source available, skipping epoch check`); - return; - } - - const intervalMs = Math.ceil((await l2BlockSource.getL1Constants()).ethereumSlotDuration / 2) * 1000; - this.epochCheckPromise = new RunningPromise( - async () => { - const blockHeaders = ( - await l2BlockSource.getBlocksData({ epoch: this.epochNumber, onlyCheckpointed: true }) - ).map(d => d.header); - const blockHashes = await Promise.all(blockHeaders.map(header => header.hash())); - const thisBlocks = this.checkpoints.flatMap(checkpoint => checkpoint.blocks); - const thisBlockHashes = await Promise.all(thisBlocks.map(block => block.hash())); - if ( - blockHeaders.length !== thisBlocks.length || - !blockHashes.every((block, i) => block.equals(thisBlockHashes[i])) - ) { - this.log.warn('Epoch blocks changed underfoot', { - uuid: this.uuid, - epochNumber: this.epochNumber, - oldBlockHashes: thisBlockHashes, - newBlockHashes: blockHashes, - }); - void this.stop('reorg'); - } - }, - this.log, - intervalMs, - ).start(); - this.log.verbose(`Scheduled epoch check for epoch ${this.epochNumber} every ${intervalMs}ms`); - } - - /* Returns the last block header in the previous checkpoint for all checkpoints in the epoch */ - private gatherPreviousBlockHeaders() { - const lastBlocks = this.checkpoints.map(checkpoint => checkpoint.blocks.at(-1)!); - return [this.data.previousBlockHeader, ...lastBlocks.map(block => block.header).slice(0, -1)]; - } - - private getTxs(block: L2Block): Tx[] { - return block.body.txEffects.map(txEffect => this.txs.get(txEffect.txHash.toString())!); - } - - private getL1ToL2Messages(checkpoint: Checkpoint) { - return this.data.l1ToL2Messages[checkpoint.number]; - } - - private async processTxs(publicProcessor: PublicProcessor, txs: Tx[]): Promise { - const { deadline } = this; - const [processedTxs, failedTxs] = await publicProcessor.process(txs, { - 
deadline, - signal: this.abortController.signal, - }); - this.checkState(); - - if (failedTxs.length) { - const failedTxHashes = await Promise.all(failedTxs.map(({ tx }) => tx.getTxHash())); - throw new Error( - `Txs failed processing: ${failedTxs - .map(({ error }, index) => `${failedTxHashes[index]} (${error})`) - .join(', ')}`, - ); - } - - if (processedTxs.length !== txs.length) { - throw new Error(`Failed to process all txs: processed ${processedTxs.length} out of ${txs.length}`); - } - - return processedTxs; - } -} - -class HaltExecutionError extends Error { - constructor(public readonly state: EpochProvingJobState) { - super(`Halted execution due to state ${state}`); - this.name = 'HaltExecutionError'; - } -} - -export { type EpochProvingJobState }; diff --git a/yarn-project/prover-node/src/job/epoch-session.test.ts b/yarn-project/prover-node/src/job/epoch-session.test.ts new file mode 100644 index 000000000000..fea0ebde5345 --- /dev/null +++ b/yarn-project/prover-node/src/job/epoch-session.test.ts @@ -0,0 +1,430 @@ +import { BatchedBlob } from '@aztec/blob-lib/types'; +import { ARCHIVE_HEIGHT } from '@aztec/constants'; +import { makeTuple } from '@aztec/foundation/array'; +import { BlockNumber, CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; +import { Fr } from '@aztec/foundation/curves/bn254'; +import { EthAddress } from '@aztec/foundation/eth-address'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { DateProvider } from '@aztec/foundation/timer'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import type { TopTreeOrchestrator } from '@aztec/prover-client/orchestrator'; +import { Checkpoint } from '@aztec/stdlib/checkpoint'; +import { Proof } from '@aztec/stdlib/proofs'; +import { RootRollupPublicInputs } from '@aztec/stdlib/rollup'; +import { BlockHeader } from '@aztec/stdlib/tx'; +import { getTelemetryClient } from '@aztec/telemetry-client'; + +import { mock } from 
'jest-mock-extended'; + +import { ProverNodeJobMetrics } from '../metrics.js'; +import type { ProofPublishingService, PublishCandidate, PublishOutcome } from '../proof-publishing-service.js'; +import { CheckpointProver } from './checkpoint-prover.js'; +import { EpochSession, type EpochSessionDeps, type EpochSessionHooks, type SessionSpec } from './epoch-session.js'; +import type { TopTreeProof } from './top-tree-job.js'; + +describe('EpochSession', () => { + let proverFactory: ReturnType>; + let publishingService: ReturnType>>; + let topTree: ReturnType>; + let metrics: ProverNodeJobMetrics; + let dateProvider: DateProvider; + let cp: Checkpoint; + let stubProver: CheckpointProver; + let synthProof: TopTreeProof; + /** Resolves on the next createTopTreeOrchestrator call — used to await TopTreeJob construction. */ + let topTreeConstructed: ReturnType>; + + beforeEach(async () => { + cp = await Checkpoint.random(CheckpointNumber(1), { numBlocks: 2 }); + stubProver = makeStubProver(cp); + + topTree = mock(); + proverFactory = mock(); + topTreeConstructed = promiseWithResolvers(); + // Signal as soon as the TopTreeJob constructor reaches createTopTreeOrchestrator — by + // the time the awaiter resumes, EpochSession.run has already assigned `this.topTreeJob` + // (the assignment is the next sync statement after `new TopTreeJob(...)`). 
+ proverFactory.createTopTreeOrchestrator.mockImplementation(() => { + topTreeConstructed.resolve(); + return topTree; + }); + publishingService = mock>(); + + const telemetry = getTelemetryClient(); + metrics = new ProverNodeJobMetrics(telemetry.getMeter('test'), telemetry.getTracer('test')); + dateProvider = new DateProvider(); + + synthProof = { + publicInputs: RootRollupPublicInputs.random(), + proof: Proof.empty(), + batchedBlobInputs: makeBlob(), + }; + }); + + // ---------------- construction ---------------- + + describe('construction', () => { + it('throws on an empty prover set', () => { + expect(() => new EpochSession(makeSpec(), [], makeDeps())).toThrow(/empty checkpoint set/); + }); + + it('initial state is "initialized" and not terminal', () => { + const session = makeSession(); + expect(session.getState()).toBe('initialized'); + expect(session.isTerminal()).toBe(false); + }); + + it('accessors return the spec values', () => { + const session = makeSession(); + expect(session.getEpochNumber()).toEqual(EpochNumber(5)); + expect(session.getKind()).toBe('full'); + expect(session.getId()).toMatch(/^[\da-f-]+$/i); + expect(session.getSpec()).toEqual(makeSpec()); + expect(session.getCheckpoints()).toEqual([stubProver]); + }); + + it('getStartBlockNumber / getEndBlockNumber span the contained checkpoint blocks', () => { + const session = makeSession(); + const blocks = cp.blocks; + expect(session.getStartBlockNumber()).toEqual(BlockNumber(blocks[0].number)); + expect(session.getEndBlockNumber()).toEqual(BlockNumber(blocks[blocks.length - 1].number)); + }); + + it('defensively copies the prover array so external mutation cannot corrupt it', () => { + const provers: CheckpointProver[] = [stubProver]; + const session = new EpochSession(makeSpec(), provers, makeDeps()); + provers.length = 0; + expect(session.getCheckpoints().length).toBe(1); + }); + }); + + // ---------------- happy path ---------------- + + describe('start (happy path)', () => { + it('runs to 
"completed" when the publishing service reports "published"', async () => { + publishingService.submit.mockResolvedValue('published'); + const session = makeSession(); + const state = await session.start(); + expect(state).toBe('completed'); + expect(session.getState()).toBe('completed'); + expect(session.isTerminal()).toBe(true); + await expect(session.whenDone()).resolves.toBe('completed'); + }); + + it('submits a candidate whose id, kind, range, and checkpoint bounds come from the session', async () => { + publishingService.submit.mockResolvedValue('published'); + const session = makeSession(); + await session.start(); + expect(publishingService.submit).toHaveBeenCalledTimes(1); + const candidate = publishingService.submit.mock.calls[0]![0] as PublishCandidate; + expect(candidate.id).toBe(session.getId()); + expect(candidate.kind).toBe('full'); + expect(candidate.epoch).toEqual(EpochNumber(5)); + expect(candidate.startBlock).toEqual(session.getStartBlockNumber()); + expect(candidate.endBlock).toEqual(session.getEndBlockNumber()); + expect(candidate.fromCheckpoint).toEqual(CheckpointNumber(1)); + expect(candidate.toCheckpoint).toEqual(CheckpointNumber(1)); + }); + }); + + // ---------------- outcome mapping ---------------- + + describe('publishing outcome → terminal state', () => { + it.each<[PublishOutcome, string]>([ + ['published', 'completed'], + ['superseded', 'superseded'], + ['expired', 'timed-out'], + ])('maps "%s" → "%s"', async (outcome, expected) => { + publishingService.submit.mockResolvedValue(outcome); + const session = makeSession(); + const state = await session.start(); + expect(state).toBe(expected); + }); + + it('"failed" outcome propagates as a thrown error → state "failed"', async () => { + publishingService.submit.mockResolvedValue('failed'); + const session = makeSession(); + const state = await session.start(); + expect(state).toBe('failed'); + }); + + it('"withdrawn" outcome with no prior cancel falls back to "cancelled"', async () => { 
+ publishingService.submit.mockResolvedValue('withdrawn'); + const session = makeSession(); + const state = await session.start(); + expect(state).toBe('cancelled'); + }); + }); + + // ---------------- cancellation ---------------- + + describe('cancel', () => { + it('flips state to "cancelled" and calls publishingService.withdraw with the session id', async () => { + const session = makeSession(); + await session.cancel(); + expect(session.getState()).toBe('cancelled'); + expect(session.isTerminal()).toBe(true); + expect(publishingService.withdraw).toHaveBeenCalledWith(session.getId()); + }); + + it('is idempotent — repeated calls do not re-withdraw or change state', async () => { + const session = makeSession(); + await session.cancel(); + await session.cancel(); + await session.cancel(); + expect(session.getState()).toBe('cancelled'); + expect(publishingService.withdraw).toHaveBeenCalledTimes(1); + }); + + it('cancel during top-tree prove unwinds cleanly and end state stays "cancelled"', async () => { + // Hold prove indefinitely; cancel must stop the session out from under it. The gate + // is left pending past assertion: it has a single handler (TopTreeJob.run's await), + // so it never surfaces as an unhandled rejection. Resolving/rejecting it after the + // cancel risks leaking an unhandled rejection into the next test. + const proveGate = promiseWithResolvers(); + const session = makeSession({ hooks: { topTreeProveOverride: () => proveGate.promise } }); + const startResult = session.start(); + // Explicit trigger: wait until the TopTreeJob has been constructed (and thus assigned + // to `this.topTreeJob`) before issuing the cancel. + await topTreeConstructed.promise; + await session.cancel(); + await expect(startResult).resolves.toBe('cancelled'); + // cancel() drops the top-tree job and cleanup awaits its teardown. 
+ expect(topTree.stop).toHaveBeenCalled(); + }); + + it('cancel after start has settled leaves the existing terminal state in place', async () => { + publishingService.submit.mockResolvedValue('published'); + const session = makeSession(); + await session.start(); + expect(session.getState()).toBe('completed'); + await session.cancel(); + expect(session.getState()).toBe('completed'); + // withdraw is NOT called once the session has already terminated. + expect(publishingService.withdraw).not.toHaveBeenCalled(); + }); + + it('a withdraw error from the publishing service does not stop cancel from finishing', async () => { + publishingService.withdraw.mockImplementation(() => { + throw new Error('publishing service crashed'); + }); + const session = makeSession(); + await expect(session.cancel()).resolves.toBeUndefined(); + expect(session.getState()).toBe('cancelled'); + }); + }); + + // ---------------- deadline ---------------- + + describe('deadline', () => { + it('fires while prove is in flight → state transitions to "timed-out"', async () => { + const proveGate = promiseWithResolvers(); + // Deadline far in the future — the test drives handleDeadline manually via + // triggerDeadline(), so the real setTimeout never fires within the test window. + const deadline = new Date(dateProvider.now() + 60_000); + const session = makeSession({ + deadline, + hooks: { topTreeProveOverride: () => proveGate.promise }, + }); + const startResult = session.start(); + await topTreeConstructed.promise; + // Drive the deadline path directly — handleDeadline returns a promise that resolves + // only after the 'cancelled' → 'timed-out' state flip has landed. 
+ await session.triggerDeadline(); + expect(session.getState()).toBe('timed-out'); + expect(publishingService.withdraw).toHaveBeenCalledWith(session.getId()); + // start()'s return value may still be 'cancelled' due to the race between the + // resolveCompletion call inside cancel and the state flip after it — the canonical + // observable for the deadline outcome is getState() above. + await startResult; + }); + + it('does not fire when the session completes before its deadline', async () => { + publishingService.submit.mockResolvedValue('published'); + const deadline = new Date(dateProvider.now() + 60_000); // far enough out + const session = makeSession({ deadline }); + const state = await session.start(); + expect(state).toBe('completed'); + // withdraw is never called on the happy path. + expect(publishingService.withdraw).not.toHaveBeenCalled(); + }); + }); + + // ---------------- checkpoint failure ---------------- + + describe('checkpoint that fails to prove', () => { + it('drives the session to "failed" when a checkpoint\'s blockProofs reject', async () => { + // Build a prover whose block-rollup proofs are guaranteed to reject — this mirrors + // the production path where CheckpointProver.executeCheckpoint catches an internal + // error and rejects its blockProofs promise. + const failingProver = makeStubProver(cp, { blockProofsError: new Error('block 7 proving failed') }); + const session = new EpochSession( + makeSpec(), + [failingProver], + makeDeps({ + // Override mirrors what the real topTree.prove(...) does: awaits each prover's + // blockProofs and propagates the rejection up. + hooks: { topTreeProveOverride: () => failingProver.whenBlockProofsReady().then(() => synthProof) }, + }), + ); + const state = await session.start(); + expect(state).toBe('failed'); + expect(session.isTerminal()).toBe(true); + // Failure happens before submission; the publishing service must never see the candidate. 
+ expect(publishingService.submit).not.toHaveBeenCalled(); + }); + + it('whenDone resolves to "failed" so callers observing the lifecycle agree with the return value', async () => { + const failingProver = makeStubProver(cp, { blockProofsError: new Error('boom') }); + const session = new EpochSession( + makeSpec(), + [failingProver], + makeDeps({ + hooks: { topTreeProveOverride: () => failingProver.whenBlockProofsReady().then(() => synthProof) }, + }), + ); + const startResult = session.start(); + await expect(session.whenDone()).resolves.toBe('failed'); + await expect(startResult).resolves.toBe('failed'); + }); + + it('a prove that rejects for any reason ends the session in "failed" without submitting', async () => { + // Belt-and-braces: any prove rejection (top-tree internal error, blob computation, + // etc.) follows the same path. The session swallows the error and reports 'failed'. + const session = makeSession({ + hooks: { topTreeProveOverride: () => Promise.reject(new Error('top-tree internal failure')) }, + }); + const state = await session.start(); + expect(state).toBe('failed'); + expect(publishingService.submit).not.toHaveBeenCalled(); + }); + }); + + // ---------------- hooks ---------------- + + describe('hooks', () => { + it('beforeTopTreeProve fires before the prove override and afterTopTreeProve after it', async () => { + const calls: string[] = []; + publishingService.submit.mockResolvedValue('published'); + const session = makeSession({ + hooks: { + beforeTopTreeProve: () => { + calls.push('before'); + }, + topTreeProveOverride: () => { + calls.push('prove'); + return Promise.resolve(synthProof); + }, + afterTopTreeProve: () => { + calls.push('after'); + }, + }, + }); + await session.start(); + expect(calls).toEqual(['before', 'prove', 'after']); + }); + }); + + // ---------------- helpers ---------------- + + /** Default session spec used by every test that doesn't override it. 
*/ + function makeSpec(): SessionSpec { + return { + kind: 'full', + epochNumber: EpochNumber(5), + fromSlot: cp.header.slotNumber, + toSlot: cp.header.slotNumber, + }; + } + + /** + * Default deps. Tests inject a `topTreeProveOverride` whenever they don't want the real + * (and missing) orchestrator to be called — by default the hook returns `synthProof` + * immediately so submit can be exercised. + */ + function makeDeps(opts: { hooks?: EpochSessionHooks; deadline?: Date } = {}): EpochSessionDeps { + const hooks: EpochSessionHooks = { + topTreeProveOverride: () => Promise.resolve(synthProof), + ...opts.hooks, + }; + return { + proverFactory, + proverId: EthAddress.ZERO, + publishingService, + metrics, + dateProvider, + deadline: opts.deadline, + config: {}, + hooks, + }; + } + + function makeSession(opts: { hooks?: EpochSessionHooks; deadline?: Date } = {}): TestEpochSession { + return new TestEpochSession(makeSpec(), [stubProver], makeDeps(opts)); + } +}); + +/** + * Subclass that exposes the protected `handleDeadline` so tests can drive the deadline + * path directly without waiting on the real `setTimeout` to fire. Awaiting the returned + * promise blocks until cancellation has propagated AND the 'cancelled' → 'timed-out' + * state flip has landed. + */ +class TestEpochSession extends EpochSession { + public triggerDeadline(): Promise { + return this.handleDeadline(); + } +} + +/** + * Minimal CheckpointProver-shaped stub: provides everything the TopTreeJob and EpochSession + * read off a prover, without standing up the actual eager gather/sub-tree pipeline. + * + * Pass `blockProofsError` to simulate a checkpoint that fails to prove — its + * `whenBlockProofsReady()` will reject with the supplied error, mirroring the production + * path where CheckpointProver.executeCheckpoint catches an internal failure and rejects + * its blockProofs promise. 
+ */ +function makeStubProver(checkpoint: Checkpoint, opts: { blockProofsError?: Error } = {}): CheckpointProver { + const id = CheckpointProver.idFor(checkpoint); + // By default whenBlockProofsReady never resolves in these tests; the prove override + // bypasses any path that would actually await it. + const blockProofs: Promise = opts.blockProofsError + ? Promise.reject(opts.blockProofsError) + : new Promise(() => {}); + // Suppress unhandled-rejection noise — tests that need the rejection observe it + // explicitly via the proveOverride hook. + blockProofs.catch(() => {}); + return { + id, + checkpoint, + epochNumber: EpochNumber(5), + slotNumber: checkpoint.header.slotNumber, + attestations: [], + previousBlockHeader: BlockHeader.empty(), + l1ToL2Messages: [], + previousArchiveSiblingPath: makeTuple(ARCHIVE_HEIGHT, () => Fr.ZERO), + txs: new Map(), + whenBlockProofsReady: () => blockProofs, + isCancelled: () => false, + isCompleted: () => false, + isPruned: () => false, + markPruned: () => {}, + markCanonical: () => {}, + cancel: () => {}, + whenDone: () => Promise.resolve(), + getAbortSignal: () => new AbortController().signal, + } as unknown as CheckpointProver; +} + +/** Builds a syntactically valid BatchedBlob — values are random but the shape is real. 
*/ +function makeBlob(): BatchedBlob { + const pi = RootRollupPublicInputs.random(); + return new BatchedBlob( + pi.blobPublicInputs.blobCommitmentsHash, + pi.blobPublicInputs.z, + pi.blobPublicInputs.y, + pi.blobPublicInputs.c, + pi.blobPublicInputs.c.negate(), + ); +} diff --git a/yarn-project/prover-node/src/job/epoch-session.ts b/yarn-project/prover-node/src/job/epoch-session.ts new file mode 100644 index 000000000000..933582e92fbb --- /dev/null +++ b/yarn-project/prover-node/src/job/epoch-session.ts @@ -0,0 +1,424 @@ +import { BlockNumber, type CheckpointNumber, type EpochNumber, type SlotNumber } from '@aztec/foundation/branded-types'; +import type { EthAddress } from '@aztec/foundation/eth-address'; +import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; +import { sleep } from '@aztec/foundation/sleep'; +import { type DateProvider, Timer } from '@aztec/foundation/timer'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import { TopTreeCancelledError } from '@aztec/prover-client/orchestrator'; +import { type EpochProvingJobState, EpochProvingJobTerminalState } from '@aztec/stdlib/interfaces/server'; +import { Attributes, type Traceable, type Tracer, trackSpan } from '@aztec/telemetry-client'; + +import * as crypto from 'node:crypto'; + +import type { ProverNodeJobMetrics } from '../metrics.js'; +import type { ProofPublishingService } from '../proof-publishing-service.js'; +import { CheckpointProver } from './checkpoint-prover.js'; +import { TopTreeJob, type TopTreeJobHooks, type TopTreeProof } from './top-tree-job.js'; + +export type { EpochProvingJobState }; + +/** Full vs partial — the only behavioural difference is at the L1 submission step. */ +export type SessionKind = 'full' | 'partial'; + +/** + * Identifies what a session proves: a contiguous slot range within an epoch. 
The + * concrete prover set the session holds is the *implementation* of the spec — frozen + * at construction time, derived from the canonical content for `[fromSlot, toSlot]`. + * + * Reconciliation in `ProverNode` is uniform across kinds: whenever the canonical + * content for the slot range changes, the session is cancelled and replaced with a + * fresh session that **preserves the slot range** but adopts the new checkpoints. + * + * Kind affects only the publishing decision (see `EpochSession`). + */ +export interface SessionSpec { + kind: SessionKind; + epochNumber: EpochNumber; + fromSlot: SlotNumber; + toSlot: SlotNumber; +} + +/** Stable string key for use in maps. */ +export function specKey(spec: SessionSpec): string { + return `${spec.kind}:${spec.epochNumber}:${spec.fromSlot}-${spec.toSlot}`; +} + +/** Hooks tests use to interpose around the top-tree prove without monkey-patching. */ +export type EpochSessionHooks = { + beforeTopTreeProve?: () => Promise | void; + afterTopTreeProve?: () => Promise | void; + topTreeProveOverride?: (defaultProve: () => Promise) => Promise; +}; + +export type EpochSessionOptions = { + /** + * If set, the session sleeps this many ms after `start()` (before the TopTreeJob is + * constructed). Lets late-arriving events (e.g. a prune) be processed before + * top-tree proving begins. + */ + finalizationDelayMs?: number; +}; + +/** Dependencies an `EpochSession` needs at construction. */ +export type EpochSessionDeps = { + proverFactory: EpochProverFactory; + proverId: EthAddress; + publishingService: Pick; + metrics: ProverNodeJobMetrics; + dateProvider: DateProvider; + /** Optional proving deadline. The session enters `timed-out` if exceeded. */ + deadline: Date | undefined; + config: EpochSessionOptions; + bindings?: LoggerBindings; + hooks?: EpochSessionHooks; +}; + +/** + * One attempt at proving and publishing a contiguous slot range. 
The `SessionSpec` and + * the prover set are both frozen at construction time; the session does not adapt to + * reorgs or extensions of canonical content. Instead, `SessionManager` owns the + * reconciliation loop and replaces invalidated sessions wholesale (cancel + construct + * a fresh session with the new prover set). + * + * Each session does three things in sequence: + * + * 1. Run a `TopTreeJob` over its frozen prover subset to produce the epoch proof. + * 2. Hand the proof to the shared `ProofPublishingService` as a `PublishCandidate`. + * 3. Translate the service's outcome into a terminal session state. + * + * Everything to do with submission — predecessor gating, same-epoch dedup, deadline + * enforcement, and the L1 transaction itself — is the publishing service's concern. + * The session is just the producer of one candidate and the observer of its outcome. + * + * Lifecycle (happy path): + * + * initialized → awaiting-checkpoints → completed + * + * Terminal states map the publishing outcome: `published` → `completed`, `superseded` → + * `superseded`, `failed` → `failed`, `expired` → `timed-out`, `withdrawn` → `cancelled`. + * Additionally, the session-level deadline fires `cancel('deadline')` and transitions + * to `timed-out` for the pre-submit window (top-tree proving) — the publishing service + * handles the post-submit window via the candidate's `deadline`. + * + * `cancel()` is idempotent. It marks the session terminal, calls + * `publishingService.withdraw(uuid)` to drop any queued candidate (an in-flight publish + * runs to natural completion; the session has already settled), and tears down the + * top-tree job if proving is still in progress. 
+ */ +export class EpochSession implements Traceable { + public readonly tracer: Tracer; + private readonly uuid: string; + private readonly log: Logger; + private state: EpochProvingJobState = 'initialized'; + private deadlineTimeoutHandler: NodeJS.Timeout | undefined; + + private topTreeJob: TopTreeJob | undefined; + /** Cancelled top-tree jobs whose teardown is still in flight. Awaited at session stop. */ + private readonly pendingTopTreeCleanups: TopTreeJob[] = []; + + private readonly completionPromise: Promise; + private resolveCompletion!: (state: EpochProvingJobState) => void; + + /** Stable reference; never mutated after construction. */ + private readonly checkpoints: readonly CheckpointProver[]; + + constructor( + private readonly spec: SessionSpec, + checkpoints: readonly CheckpointProver[], + private readonly deps: EpochSessionDeps, + ) { + if (checkpoints.length === 0) { + throw new Error(`Cannot construct EpochSession for ${specKey(spec)}: empty checkpoint set`); + } + this.checkpoints = [...checkpoints]; + this.uuid = crypto.randomUUID(); + this.log = createLogger('prover-node:epoch-session', { + ...deps.bindings, + instanceId: `session-${spec.kind}-${spec.epochNumber}-${spec.fromSlot}-${spec.toSlot}`, + }); + this.tracer = deps.metrics.tracer; + this.completionPromise = new Promise(resolve => { + this.resolveCompletion = resolve; + }); + this.scheduleDeadlineStop(); + this.log.info(`Created EpochSession ${this.uuid}`, { + uuid: this.uuid, + ...spec, + checkpointCount: this.checkpoints.length, + checkpointIds: this.checkpoints.map(c => c.id), + }); + } + + public getId(): string { + return this.uuid; + } + + public getSpec(): SessionSpec { + return this.spec; + } + + public getState(): EpochProvingJobState { + return this.state; + } + + public getEpochNumber(): EpochNumber { + return this.spec.epochNumber; + } + + public getKind(): SessionKind { + return this.spec.kind; + } + + public getDeadline(): Date | undefined { + return this.deps.deadline; + 
} + + public getCheckpoints(): readonly CheckpointProver[] { + return this.checkpoints; + } + + /** Resolves when the session reaches a terminal state. */ + public whenDone(): Promise { + return this.completionPromise; + } + + /** True if the session is in a terminal state. */ + public isTerminal(): boolean { + return EpochProvingJobTerminalState.includes(this.state); + } + + /** First block this session proves. */ + public getStartBlockNumber(): BlockNumber { + return BlockNumber(this.checkpoints[0].checkpoint.blocks[0].number); + } + + /** Last block this session proves. */ + public getEndBlockNumber(): BlockNumber { + const lastCheckpoint = this.checkpoints[this.checkpoints.length - 1]; + return BlockNumber(lastCheckpoint.checkpoint.blocks[lastCheckpoint.checkpoint.blocks.length - 1].number); + } + + /** + * Kicks off proving + submission. Fires and forgets — callers should await `whenDone()`. + * Returns a promise that resolves to the final state for callers that want to wait inline. + */ + @trackSpan('EpochSession.start', function () { + return { [Attributes.EPOCH_NUMBER]: this.spec.epochNumber }; + }) + public async start(): Promise { + try { + await this.run(); + } catch (err) { + this.log.error(`Error in EpochSession ${this.uuid}`, err, { + uuid: this.uuid, + ...this.spec, + }); + if (!this.isTerminal()) { + this.state = 'failed'; + } + } finally { + clearTimeout(this.deadlineTimeoutHandler); + await this.teardownTopTreeIfNeeded(); + this.resolveCompletion(this.state); + } + return this.state; + } + + /** + * Cancels the session. Idempotent. Withdraws any submitted candidate from the + * publishing service so the in-flight publisher (if any) is interrupted. 
+ */ + public async cancel(reason = 'cancelled'): Promise { + if (this.isTerminal()) { + return; + } + this.log.info(`Cancelling EpochSession ${this.uuid}: ${reason}`, { + uuid: this.uuid, + ...this.spec, + previousState: this.state, + reason, + }); + this.state = 'cancelled'; + try { + this.deps.publishingService.withdraw(this.uuid); + } catch (err) { + this.log.error(`Error withdrawing candidate from publishing service`, err); + } + if (this.topTreeJob && !this.topTreeJob.isCancelled()) { + const job = this.topTreeJob; + this.topTreeJob = undefined; + job.cancel(); + this.pendingTopTreeCleanups.push(job); + } + await this.teardownTopTreeIfNeeded(); + this.resolveCompletion(this.state); + } + + private async run(): Promise { + const timer = new Timer(); + + if (this.deps.config.finalizationDelayMs && this.deps.config.finalizationDelayMs > 0) { + this.log.warn(`Waiting ${this.deps.config.finalizationDelayMs}ms before starting top-tree proving`, { + uuid: this.uuid, + ...this.spec, + }); + await sleep(this.deps.config.finalizationDelayMs); + if (this.isTerminal()) { + return; + } + } + + // Stage 1 — top-tree proving. + const topTreeJob = new TopTreeJob(this.spec.epochNumber, this.checkpoints, { + proverFactory: this.deps.proverFactory, + metrics: this.deps.metrics, + log: this.log, + hooks: this.toTopTreeHooks(), + }); + this.topTreeJob = topTreeJob; + const { fromCheckpoint, toCheckpoint, count } = topTreeJob.getRange(); + + this.state = 'awaiting-checkpoints'; + let proof: TopTreeProof; + try { + proof = await topTreeJob.start(); + } catch (err) { + if (err instanceof TopTreeCancelledError) { + // Session cancel kicked off the underlying teardown; nothing more to do here. 
+ this.log.info(`Top-tree cancelled for EpochSession ${this.uuid}`, { uuid: this.uuid, ...this.spec }); + return; + } + throw err; + } + this.topTreeJob = undefined; + this.log.info(`Top-tree proof ready for EpochSession ${this.uuid}`, { + uuid: this.uuid, + ...this.spec, + fromCheckpoint, + toCheckpoint, + durationMs: timer.ms(), + }); + + // Stage 2 — hand the proof to the publishing service and wait for its verdict. + await this.submitProof(proof, fromCheckpoint, toCheckpoint, count, timer); + } + + private async submitProof( + proof: TopTreeProof, + fromCheckpoint: CheckpointNumber, + toCheckpoint: CheckpointNumber, + checkpointCount: number, + timer: Timer, + ): Promise { + // Attestations come from the highest-numbered registered checkpoint — that's the one + // whose attestations the L1 contract checks for the proven range. + const lastCheckpoint = this.checkpoints[this.checkpoints.length - 1]; + const attestations = lastCheckpoint.attestations.map(a => a.toViem()); + const epochSizeBlocks = this.checkpoints.reduce((acc, c) => acc + c.checkpoint.blocks.length, 0); + const epochSizeTxs = this.checkpoints.reduce( + (acc, c) => acc + c.checkpoint.blocks.reduce((bAcc, block) => bAcc + block.body.txEffects.length, 0), + 0, + ); + + const outcome = await this.deps.publishingService.submit({ + id: this.uuid, + epoch: this.spec.epochNumber, + kind: this.spec.kind, + startBlock: this.getStartBlockNumber(), + endBlock: this.getEndBlockNumber(), + deadline: this.deps.deadline, + fromCheckpoint, + toCheckpoint, + publicInputs: proof.publicInputs, + proof: proof.proof, + batchedBlobInputs: proof.batchedBlobInputs, + attestations, + }); + + if (this.isTerminal()) { + // cancel() already set the terminal state — don't clobber it. 
+ return; + } + + switch (outcome) { + case 'published': + this.log.info( + `Submitted proof for epoch ${this.spec.epochNumber} (checkpoints ${fromCheckpoint}..${toCheckpoint})`, + { uuid: this.uuid, ...this.spec }, + ); + this.state = 'completed'; + this.deps.metrics.recordProvingJob(timer.ms(), timer.ms(), checkpointCount, epochSizeBlocks, epochSizeTxs); + return; + case 'superseded': + this.log.info(`EpochSession ${this.uuid} superseded by a longer candidate`, { + uuid: this.uuid, + ...this.spec, + }); + this.state = 'superseded'; + return; + case 'withdrawn': + // cancel() ran but the terminal-state check above missed it. Defensive: treat as cancelled. + this.state = 'cancelled'; + return; + case 'expired': + this.log.warn(`EpochSession ${this.uuid} expired before publishing`, { uuid: this.uuid, ...this.spec }); + this.state = 'timed-out'; + return; + case 'failed': + throw new Error('Failed to submit epoch proof to L1'); + } + } + + private async teardownTopTreeIfNeeded(): Promise { + if (this.topTreeJob) { + const job = this.topTreeJob; + this.topTreeJob = undefined; + job.cancel(); + this.pendingTopTreeCleanups.push(job); + } + if (this.pendingTopTreeCleanups.length > 0) { + await Promise.allSettled(this.pendingTopTreeCleanups.map(j => j.whenDone())); + this.pendingTopTreeCleanups.length = 0; + } + } + + private scheduleDeadlineStop(): void { + const deadline = this.deps.deadline; + if (!deadline) { + return; + } + const timeout = Math.max(deadline.getTime() - this.deps.dateProvider.now(), 0); + this.deadlineTimeoutHandler = setTimeout(() => { + void this.handleDeadline(); + }, timeout); + } + + /** + * Returns a promise that resolves once cancellation has propagated and the state has + * been flipped from 'cancelled' to 'timed-out'. Protected so unit tests can drive the + * deadline path without waiting on the real `setTimeout` to fire. 
+ */ + protected async handleDeadline(): Promise { + if (this.isTerminal()) { + return; + } + this.log.warn(`EpochSession ${this.uuid} hit deadline`, { uuid: this.uuid, ...this.spec }); + await this.cancel('deadline'); + // After cancel, override state if it was the canonical timeout case so observers see 'timed-out'. + if (this.state === 'cancelled') { + this.state = 'timed-out'; + } + } + + private toTopTreeHooks(): TopTreeJobHooks | undefined { + const hooks = this.deps.hooks; + if (!hooks?.beforeTopTreeProve && !hooks?.afterTopTreeProve && !hooks?.topTreeProveOverride) { + return undefined; + } + return { + beforeProve: hooks.beforeTopTreeProve, + afterProve: hooks.afterTopTreeProve, + proveOverride: hooks.topTreeProveOverride, + }; + } +} diff --git a/yarn-project/prover-node/src/job/top-tree-job.ts b/yarn-project/prover-node/src/job/top-tree-job.ts new file mode 100644 index 000000000000..d7ddc67463f9 --- /dev/null +++ b/yarn-project/prover-node/src/job/top-tree-job.ts @@ -0,0 +1,227 @@ +import type { BatchedBlob } from '@aztec/blob-lib/types'; +import type { CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; +import type { Logger } from '@aztec/foundation/log'; +import { type PromiseWithResolvers, promiseWithResolvers } from '@aztec/foundation/promise'; +import { Timer } from '@aztec/foundation/timer'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import { buildFinalBlobChallenges } from '@aztec/prover-client/helpers'; +import { + type CheckpointTopTreeData, + TopTreeCancelledError, + type TopTreeOrchestrator, +} from '@aztec/prover-client/orchestrator'; +import type { Proof } from '@aztec/stdlib/proofs'; +import type { RootRollupPublicInputs } from '@aztec/stdlib/rollup'; + +import type { ProverNodeJobMetrics } from '../metrics.js'; +import type { CheckpointProver } from './checkpoint-prover.js'; + +/** Result of a successful top-tree run. 
*/ +export type TopTreeProof = { + publicInputs: RootRollupPublicInputs; + proof: Proof; + batchedBlobInputs: BatchedBlob; +}; + +/** + * Hooks for tests to interpose around the underlying `topTree.prove(...)` call without + * monkey-patching the orchestrator. + */ +export type TopTreeJobHooks = { + /** Called immediately before the top tree's `prove` runs. */ + beforeProve?: () => Promise | void; + /** Called after `prove` returns successfully (not on failure / cancellation). */ + afterProve?: () => Promise | void; + /** + * If set, called instead of running the underlying prove. Receives a thunk that + * runs the real call. Lets tests substitute a synthetic proof or delay/throw without + * re-implementing the rest of the finalize flow. + */ + proveOverride?: (defaultProve: () => Promise) => Promise; +}; + +/** + * Self-contained top-tree job. Constructed from a snapshot of `CheckpointProver`s; runs + * `topTree.prove(...)` against their pending `blockProofs` promises and exposes the + * final epoch proof via `result`. + * + */ +export class TopTreeJob { + /** Resolves with the final proof on success; rejects on cancellation or any prove error. */ + readonly result: PromiseWithResolvers = promiseWithResolvers(); + + /** Snapshot of checkpoint jobs the top tree is built from, in checkpoint-number order. */ + readonly snapshot: readonly CheckpointProver[]; + + private readonly topTree: TopTreeOrchestrator; + private readonly fromCheckpoint: CheckpointNumber; + private readonly toCheckpoint: CheckpointNumber; + private cancelled = false; + /** Tracks the cancel-driven background teardown so `whenDone()` can await it. 
*/ + private cancelPromise?: Promise; + private readonly executionTimer = new Timer(); + + constructor( + private readonly epochNumber: EpochNumber, + snapshot: readonly CheckpointProver[], + private readonly deps: { + proverFactory: EpochProverFactory; + metrics: ProverNodeJobMetrics; + log: Logger; + hooks?: TopTreeJobHooks; + }, + ) { + if (snapshot.length === 0) { + throw new Error(`Cannot construct TopTreeJob for epoch ${epochNumber}: empty snapshot`); + } + for (let i = 1; i < snapshot.length; i++) { + const prev = snapshot[i - 1].checkpoint.number; + const curr = snapshot[i].checkpoint.number; + if (curr !== prev + 1) { + throw new Error( + `Cannot construct TopTreeJob for epoch ${epochNumber}: checkpoint numbers must be contiguous, got gap between ${prev} and ${curr}`, + ); + } + } + this.snapshot = snapshot; + this.fromCheckpoint = snapshot[0].checkpoint.number; + this.toCheckpoint = snapshot[snapshot.length - 1].checkpoint.number; + this.topTree = deps.proverFactory.createTopTreeOrchestrator(); + deps.log.info( + `Created TopTreeJob for epoch ${epochNumber} covering checkpoints ${this.fromCheckpoint}..${this.toCheckpoint}`, + { + epochNumber, + fromCheckpoint: this.fromCheckpoint, + toCheckpoint: this.toCheckpoint, + checkpointCount: snapshot.length, + }, + ); + // Mark the result's rejection branch as observed so a cancellation before any + // consumer awaits does not surface as unhandled. + this.result.promise.catch(() => {}); + } + + /** Range covered by this attempt — useful for logging and L1 submission. */ + public getRange(): { fromCheckpoint: CheckpointNumber; toCheckpoint: CheckpointNumber; count: number } { + return { fromCheckpoint: this.fromCheckpoint, toCheckpoint: this.toCheckpoint, count: this.snapshot.length }; + } + + public isCancelled(): boolean { + return this.cancelled; + } + + /** Wall-time since construction — used by the owning job for metrics. 
*/ + public elapsedMs(): number { + return this.executionTimer.ms(); + } + + /** Kicks off the prove. Returns the result promise (also available as `result.promise`). */ + public start(): Promise { + void this.run(); + return this.result.promise; + } + + /** + * Cancels the in-flight prove. Idempotent. Rejects the result promise with + * `TopTreeCancelledError`, then kicks off the underlying orchestrator's teardown + * in the background so callers don't block on it. The teardown promise is exposed + * via `whenDone()` — the parent collects the cancelled job and awaits all + * pending top-tree teardowns at the end of the epoch. + */ + public cancel(): void { + if (this.cancelled) { + return; + } + this.cancelled = true; + this.deps.log.info( + `Cancelling TopTreeJob for epoch ${this.epochNumber} (checkpoints ${this.fromCheckpoint}..${this.toCheckpoint})`, + { + epochNumber: this.epochNumber, + fromCheckpoint: this.fromCheckpoint, + toCheckpoint: this.toCheckpoint, + elapsedMs: this.executionTimer.ms(), + }, + ); + this.result.reject(new TopTreeCancelledError()); + // Fire and forget: parent awaits the cancel-driven teardown via whenDone(); the + // chained .catch swallows rejections so the unawaited promise doesn't surface + // as an unhandled rejection. + this.cancelPromise = this.runCancel().catch(() => {}); + } + + /** Resolves once the cancel-driven teardown of the underlying orchestrator has unwound. 
*/ + public async whenDone(): Promise { + if (this.cancelPromise) { + await this.cancelPromise; + } + } + + private async runCancel(): Promise { + try { + this.topTree.cancel({ abortJobs: true }); + } catch (err) { + this.deps.log.error('Error cancelling top tree', err); + } + try { + await this.topTree.stop(); + } catch (err) { + this.deps.log.error('Error stopping top tree', err); + } + } + + private async run() { + try { + const blobTimer = new Timer(); + const blobFieldsPerCheckpoint = this.snapshot.map(j => j.checkpoint.toBlobFields()); + const finalBlobBatchingChallenges = await buildFinalBlobChallenges(blobFieldsPerCheckpoint); + this.deps.metrics.recordBlobProcessing(blobTimer.ms()); + this.deps.log.verbose( + `Built final blob batching challenges for epoch ${this.epochNumber} in ${blobTimer.ms()}ms`, + { + epochNumber: this.epochNumber, + checkpointCount: this.snapshot.length, + durationMs: blobTimer.ms(), + }, + ); + + const checkpointData: CheckpointTopTreeData[] = this.snapshot.map(j => ({ + blockProofs: j.whenBlockProofsReady(), + l2ToL1MsgsPerBlock: j.checkpoint.blocks.map(b => b.body.txEffects.map(tx => tx.l2ToL1Msgs)), + blobFields: j.checkpoint.toBlobFields(), + previousBlockHeader: j.previousBlockHeader, + previousArchiveSiblingPath: j.previousArchiveSiblingPath, + })); + + const defaultProve = (): Promise => + this.topTree.prove(this.epochNumber, this.snapshot.length, finalBlobBatchingChallenges, checkpointData); + + await this.deps.hooks?.beforeProve?.(); + const proveTimer = new Timer(); + this.deps.log.info( + `Starting top-tree prove for epoch ${this.epochNumber} (checkpoints ${this.fromCheckpoint}..${this.toCheckpoint})`, + { + epochNumber: this.epochNumber, + fromCheckpoint: this.fromCheckpoint, + toCheckpoint: this.toCheckpoint, + checkpointCount: this.snapshot.length, + }, + ); + const proof = await (this.deps.hooks?.proveOverride + ? 
this.deps.hooks.proveOverride(defaultProve) + : defaultProve()); + await this.deps.hooks?.afterProve?.(); + this.deps.log.info(`Top-tree prove succeeded for epoch ${this.epochNumber} in ${proveTimer.ms()}ms`, { + epochNumber: this.epochNumber, + fromCheckpoint: this.fromCheckpoint, + toCheckpoint: this.toCheckpoint, + durationMs: proveTimer.ms(), + totalElapsedMs: this.executionTimer.ms(), + }); + + this.result.resolve(proof); + } catch (err) { + // Cancel paths surface as TopTreeCancelledError; everything else propagates as-is. + this.result.reject(err); + } + } +} diff --git a/yarn-project/prover-node/src/metrics.ts b/yarn-project/prover-node/src/metrics.ts index d4a1df6f156e..8fa7bff0b700 100644 --- a/yarn-project/prover-node/src/metrics.ts +++ b/yarn-project/prover-node/src/metrics.ts @@ -18,6 +18,9 @@ import { import { formatEther, formatUnits } from 'viem'; +import type { CheckpointStore } from './checkpoint-store.js'; +import type { SessionManager } from './session-manager.js'; + export class ProverNodeJobMetrics { proverEpochExecutionDuration: Histogram; provingJobDuration: Histogram; @@ -26,10 +29,15 @@ export class ProverNodeJobMetrics { provingJobTransactions: Gauge; private blobProcessingDuration: Gauge; - private chonkVerifierDuration: Gauge; private blockProcessingDuration: Histogram; private checkpointProcessingDuration: Histogram; - private allCheckpointsProcessingDuration: Gauge; + + /** Observable gauges for live state. Registered via `observeState(...)` once the + * CheckpointStore and SessionManager are available. 
*/ + private activeCheckpoints: ObservableGauge | undefined; + private activeEpochSessions: ObservableGauge | undefined; + private stateObserver: ((observer: BatchObservableResult) => void) | undefined; + private stateObservedMetrics: ObservableGauge[] = []; constructor( private meter: Meter, @@ -43,12 +51,8 @@ export class ProverNodeJobMetrics { this.provingJobTransactions = this.meter.createGauge(Metrics.PROVER_NODE_JOB_TRANSACTIONS); this.blobProcessingDuration = this.meter.createGauge(Metrics.PROVER_NODE_BLOB_PROCESSING_LAST_DURATION); - this.chonkVerifierDuration = this.meter.createGauge(Metrics.PROVER_NODE_CHONK_VERIFIER_LAST_DURATION); this.blockProcessingDuration = this.meter.createHistogram(Metrics.PROVER_NODE_BLOCK_PROCESSING_DURATION); this.checkpointProcessingDuration = this.meter.createHistogram(Metrics.PROVER_NODE_CHECKPOINT_PROCESSING_DURATION); - this.allCheckpointsProcessingDuration = this.meter.createGauge( - Metrics.PROVER_NODE_ALL_CHECKPOINTS_PROCESSING_LAST_DURATION, - ); } public recordProvingJob( @@ -69,10 +73,6 @@ export class ProverNodeJobMetrics { this.blobProcessingDuration.record(Math.ceil(durationMs)); } - public recordChonkVerifier(durationMs: number) { - this.chonkVerifierDuration.record(Math.ceil(durationMs)); - } - public recordBlockProcessing(durationMs: number) { this.blockProcessingDuration.record(Math.ceil(durationMs)); } @@ -81,8 +81,47 @@ export class ProverNodeJobMetrics { this.checkpointProcessingDuration.record(Math.ceil(durationMs)); } - public recordAllCheckpointsProcessing(durationMs: number) { - this.allCheckpointsProcessingDuration.record(Math.ceil(durationMs)); + /** + * Registers observable gauges for the prover-node's live state: how many canonical + * checkpoint provers are in the store, and how many epoch sessions are live (broken + * down by kind). Idempotent — repeated calls re-arm with the latest references. + * + * Call this once the `SessionManager` has been constructed (i.e. inside `ProverNode.start()`). 
+ */ + public observeState(checkpointStore: CheckpointStore, sessionManager: SessionManager): void { + this.stopObservingState(); + this.activeCheckpoints = this.meter.createObservableGauge(Metrics.PROVER_NODE_ACTIVE_CHECKPOINTS); + this.activeEpochSessions = this.meter.createObservableGauge(Metrics.PROVER_NODE_ACTIVE_EPOCH_SESSIONS); + this.stateObserver = (observer: BatchObservableResult) => { + observer.observe(this.activeCheckpoints!, checkpointStore.listCanonical().length); + let full = 0; + let partial = 0; + for (const session of sessionManager.allSessions()) { + if (session.isTerminal()) { + continue; + } + if (session.getKind() === 'full') { + full++; + } else { + partial++; + } + } + observer.observe(this.activeEpochSessions!, full, { [Attributes.EPOCH_SESSION_KIND]: 'full' }); + observer.observe(this.activeEpochSessions!, partial, { [Attributes.EPOCH_SESSION_KIND]: 'partial' }); + }; + this.stateObservedMetrics = [this.activeCheckpoints, this.activeEpochSessions]; + this.meter.addBatchObservableCallback(this.stateObserver, this.stateObservedMetrics); + } + + /** Tears down the observable callback registered by `observeState`. Idempotent. 
*/ + public stopObservingState(): void { + if (this.stateObserver) { + this.meter.removeBatchObservableCallback(this.stateObserver, this.stateObservedMetrics); + this.stateObserver = undefined; + this.stateObservedMetrics = []; + this.activeCheckpoints = undefined; + this.activeEpochSessions = undefined; + } } } diff --git a/yarn-project/prover-node/src/proof-publishing-service.test.ts b/yarn-project/prover-node/src/proof-publishing-service.test.ts new file mode 100644 index 000000000000..03ee9518b1fd --- /dev/null +++ b/yarn-project/prover-node/src/proof-publishing-service.test.ts @@ -0,0 +1,431 @@ +import { BatchedBlob } from '@aztec/blob-lib/types'; +import { BlockNumber, CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { DateProvider } from '@aztec/foundation/timer'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import { Proof } from '@aztec/stdlib/proofs'; +import { RootRollupPublicInputs } from '@aztec/stdlib/rollup'; + +import { type MockProxy, mock } from 'jest-mock-extended'; + +import { + ProofPublishingService, + type ProofPublishingServiceDeps, + type PublishCandidate, + type PublisherFactoryLike, + type PublisherLike, +} from './proof-publishing-service.js'; + +describe('ProofPublishingService', () => { + let publisherFactory: MockProxy; + let publishers: MockProxy[]; + let l2BlockSource: MockProxy>; + let dateProvider: DateProvider; + let service: TestProofPublishingService; + + beforeEach(() => { + publishers = []; + publisherFactory = mock(); + publisherFactory.create.mockImplementation(() => { + const next = newPublisher(); + publishers.push(next); + return Promise.resolve(next as unknown as Awaited>); + }); + l2BlockSource = mock>(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber.ZERO); + dateProvider = new DateProvider(); + }); + + afterEach(async () => { + await service.stop(); + }); + + function startService(overrides?: 
Partial): void { + service = new TestProofPublishingService({ + publisherFactory, + l2BlockSource, + dateProvider, + config: { skipSubmitProof: false, ...overrides }, + }); + } + + /** + * Builds a gated publisher whose `submitEpochProof` resolves only when the test + * releases `gate`, and exposes a `submitCalled` trigger that fires the moment the + * publish enters the publisher (after `inFlight = { id }` has been set in + * `publishWinner`). Lets tests deterministically wait for "drain is in-flight on the + * publisher" without sleeping. + */ + function installGatedPublisher(): { + gate: ReturnType>; + submitCalled: ReturnType>; + } { + const gate = promiseWithResolvers(); + const submitCalled = promiseWithResolvers(); + publisherFactory.create.mockImplementationOnce(() => { + const p = newPublisher(); + p.submitEpochProof.mockImplementation(() => { + submitCalled.resolve(); + return gate.promise; + }); + publishers.push(p); + return Promise.resolve(p as unknown as Awaited>); + }); + return { gate, submitCalled }; + } + + function newPublisher(): MockProxy { + const p = mock(); + p.submitEpochProof.mockResolvedValue(true); + p.analyzeEpochProofSubmission.mockResolvedValue(undefined); + return p; + } + + /** Build a candidate with sensible defaults — caller overrides only what matters per test. */ + function makeCandidate(overrides: Partial = {}): PublishCandidate { + const startBlock = overrides.startBlock ?? BlockNumber(1); + const endBlock = overrides.endBlock ?? BlockNumber(8); + return { + id: overrides.id ?? `cand-${Math.random().toString(36).slice(2, 9)}`, + epoch: overrides.epoch ?? EpochNumber(1), + kind: overrides.kind ?? 'full', + startBlock, + endBlock, + deadline: overrides.deadline, + fromCheckpoint: overrides.fromCheckpoint ?? CheckpointNumber(1), + toCheckpoint: overrides.toCheckpoint ?? CheckpointNumber(1), + publicInputs: overrides.publicInputs ?? RootRollupPublicInputs.random(), + proof: overrides.proof ?? 
Proof.empty(), + batchedBlobInputs: overrides.batchedBlobInputs ?? makeBlob(), + attestations: overrides.attestations ?? [], + }; + } + + // ---------------- happy path ---------------- + + it('publishes a single eligible candidate', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); // predecessor proven (startBlock=1, so 0 is enough). + const candidate = makeCandidate({ startBlock: BlockNumber(1), endBlock: BlockNumber(8) }); + + const outcome = await service.submit(candidate); + + expect(outcome).toEqual('published'); + expect(publishers).toHaveLength(1); + expect(publishers[0].submitEpochProof).toHaveBeenCalledTimes(1); + }); + + it('waits for predecessor before publishing', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); // predecessor not proven (need >= 4). + const candidate = makeCandidate({ startBlock: BlockNumber(5), endBlock: BlockNumber(8) }); + + const outcomePromise = service.submit(candidate); + await service.drainSyncPoint(); // drain runs, picks no winner, returns + expect(publishers).toHaveLength(0); + + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(4)); // predecessor now proven. 
+ service.onChainProven(BlockNumber(4)); + + expect(await outcomePromise).toEqual('published'); + expect(publishers).toHaveLength(1); + }); + + // ---------------- dedup / supersession ---------------- + + it('supersedes a partial candidate fully covered by the proven tip', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(8)); + const candidate = makeCandidate({ + kind: 'partial', + startBlock: BlockNumber(1), + endBlock: BlockNumber(8), + }); + + const outcome = await service.submit(candidate); + + expect(outcome).toEqual('superseded'); + expect(publishers).toHaveLength(0); + }); + + it('still publishes a full candidate when the proven tip already covers its range', async () => { + // Multi-prover-node case: every prover-node submits its own full epoch proof; the L1 + // rollup records each (prover-id, epoch) tuple. The publishing service must not + // suppress a redundant full proof just because some other prover-node landed first. + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(8)); + const candidate = makeCandidate({ kind: 'full', startBlock: BlockNumber(1), endBlock: BlockNumber(8) }); + + const outcome = await service.submit(candidate); + + expect(outcome).toEqual('published'); + expect(publishers).toHaveLength(1); + expect(publishers[0].submitEpochProof).toHaveBeenCalledTimes(1); + }); + + it('publishes the longest candidate when several are eligible for the same epoch', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + const shortCandidate = makeCandidate({ + id: 'short', + epoch: EpochNumber(1), + startBlock: BlockNumber(1), + endBlock: BlockNumber(3), + }); + const longCandidate = makeCandidate({ + id: 'long', + epoch: EpochNumber(1), + startBlock: BlockNumber(1), + endBlock: BlockNumber(8), + toCheckpoint: CheckpointNumber(2), + }); + + // Submit both before drain runs so they're considered together. 
+ const shortOutcome = service.submit(shortCandidate); + const longOutcome = service.submit(longCandidate); + + expect(await shortOutcome).toEqual('superseded'); + expect(await longOutcome).toEqual('published'); + expect(publishers).toHaveLength(1); + expect(publishers[0].submitEpochProof).toHaveBeenCalledWith( + expect.objectContaining({ toCheckpoint: CheckpointNumber(2) }), + ); + }); + + // ---------------- withdraw ---------------- + + it('withdraws a queued candidate without calling the publisher', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + const candidate = makeCandidate({ startBlock: BlockNumber(5), endBlock: BlockNumber(8) }); // not eligible. + + const outcomePromise = service.submit(candidate); + await service.drainSyncPoint(); // drain runs, parks the candidate (not eligible) + service.withdraw(candidate.id); + + expect(await outcomePromise).toEqual('withdrawn'); + expect(publishers).toHaveLength(0); + }); + + it('lets an in-flight publish run to completion when withdraw is called on it', async () => { + // Once a publish starts, withdraw is a no-op for the in-flight candidate — the L1 + // submission runs naturally and the outcome reports whatever the publisher returned. + // The originating session is expected to have moved to a terminal state via cancel() + // and ignore the late outcome. + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + + const { gate, submitCalled } = installGatedPublisher(); + const candidate = makeCandidate(); + const outcomePromise = service.submit(candidate); + + await submitCalled.promise; // inFlight = { id } now set; drain is awaiting submitEpochProof + expect(publishers).toHaveLength(1); + + // Withdraw mid-publish: service does not touch the publisher, the publish keeps running. + service.withdraw(candidate.id); + + // Release the publish — outcome reports the publisher's natural return value. 
+ gate.resolve(true); + expect(await outcomePromise).toEqual('published'); + }); + + // ---------------- expiry ---------------- + + it('resolves as expired when the deadline elapses before publishing', async () => { + startService(); + // Predecessor not proven — candidate sits in the queue. + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + const candidate = makeCandidate({ + startBlock: BlockNumber(5), + endBlock: BlockNumber(8), + deadline: new Date(Date.now() + 20), + }); + + expect(await service.submit(candidate)).toEqual('expired'); + expect(publishers).toHaveLength(0); + }); + + it('expires a candidate whose deadline is already in the past at submit time', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + const candidate = makeCandidate({ + startBlock: BlockNumber(5), + endBlock: BlockNumber(8), + deadline: new Date(Date.now() - 1000), + }); + + expect(await service.submit(candidate)).toEqual('expired'); + expect(publishers).toHaveLength(0); + }); + + it('lets an in-flight publish complete past its deadline', async () => { + // Once a publish starts, the deadline timer becomes a no-op. The publish runs to + // completion and the outcome reports the publisher's result. + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + + const { gate, submitCalled } = installGatedPublisher(); + // Deadline far enough out that the real timer never fires within the test — we drive + // expiry manually via triggerExpiry below. + const candidate = makeCandidate({ deadline: new Date(Date.now() + 60_000) }); + const outcomePromise = service.submit(candidate); + + await submitCalled.promise; // inFlight is set; publish is awaiting the gate + expect(publishers).toHaveLength(1); + + // Drive the deadline path manually. inFlight matches the candidate id, so handleExpiry + // is a no-op — the publish keeps running. 
+ service.triggerExpiry(candidate.id); + gate.resolve(true); + expect(await outcomePromise).toEqual('published'); + }); + + // ---------------- failure surfaces ---------------- + + it('resolves as failed when submitEpochProof returns false', async () => { + startService(); + publisherFactory.create.mockImplementationOnce(() => { + const p = newPublisher(); + p.submitEpochProof.mockResolvedValue(false); + publishers.push(p); + return Promise.resolve(p as unknown as Awaited>); + }); + + const outcome = await service.submit(makeCandidate()); + expect(outcome).toEqual('failed'); + }); + + it('resolves as failed when submitEpochProof throws', async () => { + startService(); + publisherFactory.create.mockImplementationOnce(() => { + const p = newPublisher(); + p.submitEpochProof.mockRejectedValue(new Error('boom')); + publishers.push(p); + return Promise.resolve(p as unknown as Awaited>); + }); + + const outcome = await service.submit(makeCandidate()); + expect(outcome).toEqual('failed'); + }); + + it('retries the publish when publisherFactory.create transiently fails', async () => { + // Pool exhaustion is transient — we must not fail the proof, just back off and try + // again on a later drain. Once create() succeeds, the candidate publishes normally. + startService(); + let createCalls = 0; + publisherFactory.create.mockImplementation(() => { + createCalls++; + if (createCalls < 3) { + return Promise.reject(new Error('pool exhausted')); + } + const p = newPublisher(); + publishers.push(p); + return Promise.resolve(p as unknown as Awaited>); + }); + + const outcome = service.submit(makeCandidate()); + + // First drain attempts create() and fails; publishWinner schedules a setTimeout + // retry. We bypass the timer by driving the next drain via onChainProven, which + // shares the same scheduleDrain mechanism. This loses direct coverage of the + // 1000ms retry delay but exercises the retry *behaviour* deterministically. 
+ await service.drainSyncPoint(); + expect(publishers).toHaveLength(0); + expect(createCalls).toBe(1); + + service.onChainProven(BlockNumber(0)); // wake the drain again + await service.drainSyncPoint(); + expect(publishers).toHaveLength(0); + expect(createCalls).toBe(2); + + service.onChainProven(BlockNumber(0)); + expect(await outcome).toEqual('published'); + expect(publishers).toHaveLength(1); + expect(createCalls).toBe(3); + }); + + it('expires a candidate that keeps hitting publisher acquire failures past its deadline', async () => { + // Persistent acquire failure + a short deadline: the expiry timer wins. + startService(); + publisherFactory.create.mockRejectedValue(new Error('pool exhausted')); + + const candidate = makeCandidate({ deadline: new Date(Date.now() + 50) }); + expect(await service.submit(candidate)).toEqual('expired'); + expect(publishers).toHaveLength(0); + }); + + // ---------------- skipSubmitProof ---------------- + + it('routes to analyzeEpochProofSubmission when skipSubmitProof is true', async () => { + startService({ skipSubmitProof: true }); + const outcome = await service.submit(makeCandidate()); + + expect(outcome).toEqual('published'); + expect(publishers).toHaveLength(1); + expect(publishers[0].analyzeEpochProofSubmission).toHaveBeenCalledTimes(1); + expect(publishers[0].submitEpochProof).not.toHaveBeenCalled(); + }); + + // ---------------- serialisation ---------------- + + it('drains one publish at a time — no concurrent publishes', async () => { + startService(); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(0)); + + // Hand back publishers whose submitEpochProof completes only when we say. Each gated + // publisher exposes a submitCalled trigger that fires once the publish actually starts + // (inFlight = { id } has been set in publishWinner). 
+ const first = installGatedPublisher(); + const second = installGatedPublisher(); + + // Both candidates need their predecessor proven from the start so the drain doesn't park + // the second behind the first. The unit test ignores real block-number sequencing — + // what matters is that two eligible candidates publish serially, not in parallel. + const a = service.submit( + makeCandidate({ epoch: EpochNumber(1), startBlock: BlockNumber(1), endBlock: BlockNumber(2) }), + ); + const b = service.submit( + makeCandidate({ epoch: EpochNumber(2), startBlock: BlockNumber(1), endBlock: BlockNumber(4) }), + ); + + // Only the first publish should be in flight. + await first.submitCalled.promise; + expect(publishers).toHaveLength(1); + + // Release first; the second drain pass now runs and starts the second publish. + first.gate.resolve(true); + await a; + await second.submitCalled.promise; + expect(publishers).toHaveLength(2); + + second.gate.resolve(true); + expect(await b).toEqual('published'); + }); + + function makeBlob(): BatchedBlob { + const pi = RootRollupPublicInputs.random(); + return new BatchedBlob( + pi.blobPublicInputs.blobCommitmentsHash, + pi.blobPublicInputs.z, + pi.blobPublicInputs.y, + pi.blobPublicInputs.c, + pi.blobPublicInputs.c.negate(), + ); + } +}); + +/** + * Subclass that exposes the protected `drainQueue.syncPoint()` and `handleExpiry` for + * test triggers. Lets tests wait for the drain to settle and drive deadline expiry + * without relying on real setTimeouts. 
+ */ +class TestProofPublishingService extends ProofPublishingService { + public drainSyncPoint(): Promise { + return this.drainQueue.syncPoint(); + } + + public triggerExpiry(candidateId: string): void { + this.handleExpiry(candidateId); + } +} diff --git a/yarn-project/prover-node/src/proof-publishing-service.ts b/yarn-project/prover-node/src/proof-publishing-service.ts new file mode 100644 index 000000000000..2cfebb2e3381 --- /dev/null +++ b/yarn-project/prover-node/src/proof-publishing-service.ts @@ -0,0 +1,424 @@ +import type { BatchedBlob } from '@aztec/blob-lib'; +import type { ViemCommitteeAttestation } from '@aztec/ethereum/contracts'; +import { BlockNumber, type CheckpointNumber, type EpochNumber } from '@aztec/foundation/branded-types'; +import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { SerialQueue } from '@aztec/foundation/queue'; +import type { DateProvider } from '@aztec/foundation/timer'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import type { Proof } from '@aztec/stdlib/proofs'; +import type { RootRollupPublicInputs } from '@aztec/stdlib/rollup'; + +import type { ProverNodePublisher } from './prover-node-publisher.js'; +import type { ProverPublisherFactory } from './prover-publisher-factory.js'; + +/** A single proof candidate offered to the publishing service by an `EpochSession`. */ +export type PublishCandidate = { + /** Stable id; matches the originating session so `withdraw` can target this entry. */ + id: string; + epoch: EpochNumber; + /** + * Full vs partial. A `partial` candidate is an early-finish optimisation: if the chain's + * proven tip catches up to or past its `endBlock` before it publishes, it's superseded — + * publishing would be wasted L1 gas. 
A `full` candidate covers the entire epoch and is + * useful to publish even after some other prover-node has already submitted (the rollup + * contract records the submission per prover-id), so it is never auto-superseded by the + * proven tip alone. + */ + kind: 'full' | 'partial'; + /** First L2 block in the candidate's range. */ + startBlock: BlockNumber; + /** Last L2 block in the candidate's range. */ + endBlock: BlockNumber; + /** + * Wall-clock time after which the candidate is no longer worth publishing — typically + * the L1 proof-submission window deadline. If the candidate is still queued at this + * time it resolves as `'expired'`. If it's already in flight, the publish runs to + * completion (the L1 tx may still mine; the deadline only governs whether the service + * will start a publish). `undefined` disables the per-candidate timer. + */ + deadline: Date | undefined; + /** Everything `ProverNodePublisher.submitEpochProof` needs. */ + fromCheckpoint: CheckpointNumber; + toCheckpoint: CheckpointNumber; + publicInputs: RootRollupPublicInputs; + proof: Proof; + batchedBlobInputs: BatchedBlob; + attestations: ViemCommitteeAttestation[]; +}; + +/** Terminal outcome for a candidate. The promise from `submit()` resolves with one of these. */ +export type PublishOutcome = 'published' | 'superseded' | 'failed' | 'withdrawn' | 'expired'; + +/** Subset of `ProverPublisherFactory` the service uses — single async `create()` call. */ +export type PublisherFactoryLike = Pick; + +/** Subset of `ProverNodePublisher` the service drives — one publish per fresh publisher. */ +export type PublisherLike = Pick; + +/** Config for the publishing service. */ +export type ProofPublishingServiceConfig = { + /** When true, submitting a candidate runs `analyzeEpochProofSubmission` instead of publishing. 
*/ + skipSubmitProof: boolean; +}; + +export type ProofPublishingServiceDeps = { + publisherFactory: PublisherFactoryLike; + l2BlockSource: Pick; + dateProvider: DateProvider; + config: ProofPublishingServiceConfig; + bindings?: LoggerBindings; +}; + +/** Per-epoch bucket: live candidates, their pending-outcome resolvers, and expiry timers. */ +type EpochBucket = { + candidates: Map; + resolvers: Map void>; + expiryTimers: Map; +}; + +/** + * Backoff after a transient `publisherFactory.create()` failure. The candidate stays + * in the queue and the drain is re-scheduled after this delay; if the failure persists + * the candidate's own `deadline` timer caps the total wait. + */ +const PUBLISHER_ACQUIRE_RETRY_DELAY_MS = 1_000; + +/** + * Central owner of L1 proof submission. Sessions offer their proofs here as + * `PublishCandidate`s; the service serialises one publish at a time, picks the + * longest candidate per epoch as the winner, and resolves the rest as + * `'superseded'` without spending L1 gas. + * + * Construction-time invariants: + * - Every publish runs against a freshly-created `ProverNodePublisher` from the factory. + * - Only one publish is ever in flight (`SerialQueue` drain) — no defensive locks. + * - Once an L1 publish starts, it runs to completion. `withdraw` is a queue-only + * operation: it removes a candidate that has not yet started publishing. An in-flight + * candidate is left alone and its outcome (`'published'` / `'failed'`) is reported as + * usual — the originating session has already moved to a terminal state via `cancel()` + * and ignores the late outcome. + * + * Eligibility for publication is decided against the proven block number read inside + * the drain (so the value is consistent with the publish that runs on the same drain + * pass): a candidate is eligible when its predecessor block is proven and (for partial + * candidates) the candidate's range extends past the proven tip. 
`onChainProven` is a + * wake-up signal; it does not pass state into the drain. + */ +export class ProofPublishingService { + private readonly log: Logger; + private readonly epochs: Map = new Map(); + /** + * One drain task at a time. Submits, withdrawals, chain-proven advances, and prunes + * all schedule a `drain` here, so the eligibility re-check and the L1 publish never + * interleave. + * + * Protected so unit tests can `await drainQueue.syncPoint()` to wait for pending + * drain work to settle deterministically (no sleeps). + */ + protected readonly drainQueue = new SerialQueue(); + /** Tracks the candidate currently being published. Set while drain is awaiting the L1 publish. */ + private inFlight: { id: string } | undefined; + private stopped = false; + + constructor(private readonly deps: ProofPublishingServiceDeps) { + this.log = createLogger('prover-node:proof-publishing-service', deps.bindings); + this.drainQueue.start(); + } + + /** + * Offers a proof candidate to the service. The returned promise resolves once the + * service settles the candidate's fate: `'published'` if it wins and L1 accepts it, + * `'superseded'` if a longer candidate for the same epoch wins, `'failed'` if the + * L1 submission errored, `'withdrawn'` if the originating session cancelled, + * `'expired'` if the candidate's `deadline` elapsed before publishing started. 
+ */ + public submit(candidate: PublishCandidate): Promise { + if (this.stopped) { + return Promise.resolve('withdrawn'); + } + const { promise, resolve } = promiseWithResolvers(); + let bucket = this.epochs.get(candidate.epoch); + if (!bucket) { + bucket = { candidates: new Map(), resolvers: new Map(), expiryTimers: new Map() }; + this.epochs.set(candidate.epoch, bucket); + } + bucket.candidates.set(candidate.id, candidate); + bucket.resolvers.set(candidate.id, resolve); + this.scheduleExpiry(bucket, candidate); + this.log.info(`Candidate proof ${candidate.id} submitted for publishing`, { + candidateId: candidate.id, + epoch: candidate.epoch, + startBlock: candidate.startBlock, + endBlock: candidate.endBlock, + deadline: candidate.deadline?.toISOString(), + }); + this.scheduleDrain(); + return promise; + } + + /** + * Pulls a queued candidate from the bucket and resolves its promise as `'withdrawn'`. + * If the candidate is already being published, the publish runs to completion and the + * outcome reports whatever L1 returned — callers that cancelled mid-publish must rely + * on their own terminal-state check to ignore the late outcome. No-op if the candidate + * is unknown. + */ + public withdraw(candidateId: string): void { + if (this.inFlight?.id === candidateId) { + this.log.debug(`Withdraw for in-flight candidate ${candidateId} ignored; publish will run to completion`, { + candidateId, + }); + return; + } + for (const bucket of this.epochs.values()) { + if (bucket.candidates.has(candidateId)) { + this.log.info(`Candidate ${candidateId} withdrawn`, { candidateId }); + this.resolveCandidate(bucket, candidateId, 'withdrawn'); + this.scheduleDrain(); + return; + } + } + } + + /** + * Signals that the L1 proven tip has advanced and the queue should be re-evaluated. 
+ * The drain reads the proven block number from `l2BlockSource` itself rather than + * relying on the value passed here — that way the eligibility check uses a value read + * inside the serial drain, not one captured by a concurrent caller of `onChainProven`. + */ + public onChainProven(_provenBlock: BlockNumber): void { + this.scheduleDrain(); + } + + /** + * Stops accepting new submissions, waits for any in-flight publish to settle, and + * resolves remaining queued candidates as `'withdrawn'`. + */ + public async stop(): Promise { + this.stopped = true; + await this.drainQueue.end(); + // Anything still parked in a bucket never ran through drain — resolve it as withdrawn so + // callers awaiting `submit()` aren't left hanging. + for (const bucket of Array.from(this.epochs.values())) { + for (const id of Array.from(bucket.candidates.keys())) { + this.resolveCandidate(bucket, id, 'withdrawn'); + } + } + this.epochs.clear(); + } + + // ---------------- drain ---------------- + + private scheduleDrain(): void { + if (this.stopped) { + return; + } + void this.drainQueue + .put(() => this.drain()) + .catch(err => { + this.log.error(`Drain task threw`, err); + }); + } + + private async drain(): Promise { + if (this.stopped) { + return; + } + // Read the proven block number afresh inside the serial drain so the eligibility + // check is consistent with the publish that follows it on the same drain pass. + const proven = await this.readProvenBlockNumber(); + + // Process epochs in ascending order: the proven tip advances monotonically, so the lower + // epoch is the natural next eligible candidate. 
+ const orderedEpochs = Array.from(this.epochs.keys()).sort((a, b) => Number(a) - Number(b)); + for (const epoch of orderedEpochs) { + const bucket = this.epochs.get(epoch)!; + const eligible = this.pickEpochWinner(bucket, proven); + if (!eligible) { + continue; + } + await this.publishWinner(epoch, eligible.winner, bucket); + } + + // Drop empty buckets + for (const [key, bucket] of Array.from(this.epochs.entries())) { + if (bucket.candidates.size === 0) { + this.epochs.delete(key); + } + } + } + + /** + * Picks the winning candidate for a given epoch. Partial candidates whose `endBlock` is + * already proven on-chain resolve `'superseded'`. + * Full candidates are never auto-superseded by the proven tip — multiple prover-nodes + * legitimately submit redundant full epoch proofs (one per prover-id) and L1 records each. + * Among the remaining candidates with their predecessor proven, the one with the highest + * `endBlock` wins; the others resolve `'superseded'`. + */ + private pickEpochWinner(bucket: EpochBucket, proven: BlockNumber): { winner: PublishCandidate } | undefined { + const now = this.deps.dateProvider.now(); + // Resolve any candidate whose deadline has already passed. + for (const candidate of Array.from(bucket.candidates.values())) { + if (candidate.deadline && candidate.deadline.getTime() <= now) { + this.resolveCandidate(bucket, candidate.id, 'expired'); + } + } + // Drop partial candidates the proven chain has already caught up to. + for (const candidate of Array.from(bucket.candidates.values())) { + if (candidate.kind === 'partial' && candidate.endBlock <= proven) { + this.resolveCandidate(bucket, candidate.id, 'superseded'); + } + } + + const remaining = Array.from(bucket.candidates.values()).filter(c => c.startBlock - 1 <= proven); + if (remaining.length === 0) { + return undefined; + } + const winner = remaining.reduce((best, c) => (c.endBlock > best.endBlock ? c : best)); + // Every other same-epoch candidate is superseded by the winner. 
+ for (const candidate of remaining) { + if (candidate.id !== winner.id) { + this.resolveCandidate(bucket, candidate.id, 'superseded'); + } + } + return { winner }; + } + + private async publishWinner(epoch: EpochNumber, winner: PublishCandidate, bucket: EpochBucket): Promise { + let publisher: PublisherLike; + try { + publisher = await this.deps.publisherFactory.create(); + } catch (err) { + // Treat this as transient: the publisher pool may be temporarily exhausted + // (every signer busy, funding tx in flight, etc.). Leave the candidate queued and + // schedule another drain after a short backoff. If the failure persists past the + // candidate's deadline the expiry timer will resolve it as `'expired'`. + this.log.warn(`Failed to acquire publisher for candidate ${winner.id}; retrying`, { + candidateId: winner.id, + epoch: winner.epoch, + retryDelayMs: PUBLISHER_ACQUIRE_RETRY_DELAY_MS, + err, + }); + setTimeout(() => this.scheduleDrain(), PUBLISHER_ACQUIRE_RETRY_DELAY_MS); + return; + } + + this.inFlight = { id: winner.id }; + this.log.info(`Publishing candidate ${winner.id}`, { + candidateId: winner.id, + epoch: winner.epoch, + startBlock: winner.startBlock, + endBlock: winner.endBlock, + fromCheckpoint: winner.fromCheckpoint, + toCheckpoint: winner.toCheckpoint, + }); + + const outcome = await this.runPublish(winner, publisher); + this.inFlight = undefined; + this.resolveCandidate(bucket, winner.id, outcome); + + if (bucket.candidates.size === 0) { + this.epochs.delete(epoch); + } + } + + private async runPublish(candidate: PublishCandidate, publisher: PublisherLike): Promise { + const submitArgs = { + epochNumber: candidate.epoch, + fromCheckpoint: candidate.fromCheckpoint, + toCheckpoint: candidate.toCheckpoint, + publicInputs: candidate.publicInputs, + proof: candidate.proof, + batchedBlobInputs: candidate.batchedBlobInputs, + attestations: candidate.attestations, + // Stop the L1 tx retrying past the candidate's submission-window deadline. 
+ deadline: candidate.deadline, + }; + + if (this.deps.config.skipSubmitProof) { + try { + await publisher.analyzeEpochProofSubmission(submitArgs); + return 'published'; + } catch (err) { + this.log.warn(`Failed to analyze estimated L1 fees for candidate ${candidate.id}`, { + err, + candidateId: candidate.id, + epoch: candidate.epoch, + }); + // Analyze-mode failures are recorded but the session shouldn't enter `failed` — + // the operator opted out of submission. Match the previous EpochSession behaviour. + return 'published'; + } + } + + try { + const success = await publisher.submitEpochProof(submitArgs); + return success ? 'published' : 'failed'; + } catch (err) { + this.log.error(`Error publishing candidate ${candidate.id}`, err, { + candidateId: candidate.id, + epoch: candidate.epoch, + }); + return 'failed'; + } + } + + private resolveCandidate(bucket: EpochBucket, id: string, outcome: PublishOutcome): void { + const resolve = bucket.resolvers.get(id); + const timer = bucket.expiryTimers.get(id); + if (timer) { + clearTimeout(timer); + bucket.expiryTimers.delete(id); + } + bucket.candidates.delete(id); + bucket.resolvers.delete(id); + if (resolve) { + this.log.info(`Candidate ${id} resolved as ${outcome}`, { candidateId: id, outcome }); + resolve(outcome); + } + } + + /** + * Arms a per-candidate expiry timer if the candidate carries a deadline. When the timer + * fires, the candidate resolves as `'expired'` — unless it is already in flight, in + * which case the publish runs to completion (the timer becomes a no-op). The timer is + * cleared by `resolveCandidate` whenever the candidate settles for any other reason. 
+ */ + private scheduleExpiry(bucket: EpochBucket, candidate: PublishCandidate): void { + if (!candidate.deadline) { + return; + } + const delay = Math.max(candidate.deadline.getTime() - this.deps.dateProvider.now(), 0); + const timer = setTimeout(() => this.handleExpiry(candidate.id), delay); + bucket.expiryTimers.set(candidate.id, timer); + } + + /** + * Protected so unit tests can drive the deadline path without waiting on the real + * `setTimeout` to fire. Production code calls this only via the per-candidate timer + * armed in `scheduleExpiry`. + */ + protected handleExpiry(candidateId: string): void { + if (this.inFlight?.id === candidateId) { + this.log.debug(`Expiry for in-flight candidate ${candidateId} ignored; publish will run to completion`, { + candidateId, + }); + return; + } + for (const bucket of this.epochs.values()) { + if (bucket.candidates.has(candidateId)) { + this.log.info(`Candidate ${candidateId} expired before publishing`, { candidateId }); + this.resolveCandidate(bucket, candidateId, 'expired'); + this.scheduleDrain(); + return; + } + } + } + + private async readProvenBlockNumber(): Promise { + const proven = await this.deps.l2BlockSource.getBlockNumber({ tag: 'proven' }); + return BlockNumber(proven ?? 
0); + } +} diff --git a/yarn-project/prover-node/src/prover-node-publisher.test.ts b/yarn-project/prover-node/src/prover-node-publisher.test.ts index 31d035b37d9e..f7a22a777829 100644 --- a/yarn-project/prover-node/src/prover-node-publisher.test.ts +++ b/yarn-project/prover-node/src/prover-node-publisher.test.ts @@ -188,82 +188,6 @@ describe('prover-node-publisher', () => { }, ); - it('waits until the proven checkpoint reaches the checkpoint before the proof start', async () => { - const checkpoints = Array.from({ length: 100 }, () => RootRollupPublicInputs.random()); - const fromCheckpoint = CheckpointNumber(33); - const toCheckpoint = CheckpointNumber(64); - - rollup.getTips - .mockResolvedValueOnce({ - pending: CheckpointNumber(65), - proven: CheckpointNumber(31), - }) - .mockResolvedValueOnce({ - pending: CheckpointNumber(65), - proven: CheckpointNumber(32), - }) - .mockResolvedValue({ - pending: CheckpointNumber(65), - proven: CheckpointNumber(32), - }); - rollup.getRollupConstants.mockResolvedValue({ - l1StartBlock: 0n, - l1GenesisTime: BigInt(Math.floor(Date.now() / 1000)), - slotDuration: 1, - epochDuration: 1, - proofSubmissionEpochs: 100, - targetCommitteeSize: 48, - rollupManaLimit: Number.MAX_SAFE_INTEGER, - }); - - rollup.getCheckpoint.mockImplementation((checkpointNumber: CheckpointNumber) => - Promise.resolve({ - archive: checkpoints[checkpointNumber - 1].endArchiveRoot, - attestationsHash: Buffer32.ZERO, - payloadDigest: Buffer32.ZERO, - headerHash: Buffer32.ZERO, - blobCommitmentsHash: Buffer32.ZERO, - outHash: '0x', - slotNumber: SlotNumber(0), - feeHeader: { - excessMana: 0n, - manaUsed: 0n, - ethPerFeeAsset: 0n, - congestionCost: 0n, - proverCost: 0n, - }, - }), - ); - - const ourPublicInputs = RootRollupPublicInputs.random(); - ourPublicInputs.previousArchiveRoot = checkpoints[fromCheckpoint - 2].endArchiveRoot; - ourPublicInputs.endArchiveRoot = checkpoints[toCheckpoint - 1].endArchiveRoot; - - const ourBatchedBlob = new BatchedBlob( - 
ourPublicInputs.blobPublicInputs.blobCommitmentsHash, - ourPublicInputs.blobPublicInputs.z, - ourPublicInputs.blobPublicInputs.y, - ourPublicInputs.blobPublicInputs.c, - ourPublicInputs.blobPublicInputs.c.negate(), - ); - - rollup.getEpochProofPublicInputs.mockResolvedValue(ourPublicInputs.toFields()); - - await publisher.submitEpochProof({ - epochNumber: EpochNumber(2), - fromCheckpoint, - toCheckpoint, - publicInputs: ourPublicInputs, - proof: Proof.empty(), - batchedBlobInputs: ourBatchedBlob, - attestations: [], - }); - - expect(rollup.getRollupConstants).toHaveBeenCalled(); - expect(rollup.getTips).toHaveBeenCalledTimes(3); - expect(l1Utils.sendAndMonitorTransaction).toHaveBeenCalled(); - }); - it('analyzeEpochProofSubmission validates, estimates, and does not send tx', async () => { const fromCheckpoint = 33; const toCheckpoint = 64; diff --git a/yarn-project/prover-node/src/prover-node-publisher.ts b/yarn-project/prover-node/src/prover-node-publisher.ts index 5c7bfeab2460..67d9ff1d3ace 100644 --- a/yarn-project/prover-node/src/prover-node-publisher.ts +++ b/yarn-project/prover-node/src/prover-node-publisher.ts @@ -8,13 +8,11 @@ import { areArraysEqual } from '@aztec/foundation/collection'; import { Fr } from '@aztec/foundation/curves/bn254'; import { EthAddress } from '@aztec/foundation/eth-address'; import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; -import { retryUntil } from '@aztec/foundation/retry'; import type { Tuple } from '@aztec/foundation/serialize'; import { Timer } from '@aztec/foundation/timer'; import { RollupAbi } from '@aztec/l1-artifacts'; import type { PublisherConfig, TxSenderConfig } from '@aztec/sequencer-client'; import { CommitteeAttestation, CommitteeAttestationsAndSigners } from '@aztec/stdlib/block'; -import { getProofSubmissionDeadlineTimestamp } from '@aztec/stdlib/epoch-helpers'; import type { Proof } from '@aztec/stdlib/proofs'; import type { FeeRecipient, RootRollupPublicInputs } from 
'@aztec/stdlib/rollup'; import type { L1PublishProofStats } from '@aztec/stdlib/stats'; @@ -38,7 +36,6 @@ export type L1SubmitEpochProofArgs = { }; export class ProverNodePublisher { - private interrupted = false; private metrics: ProverNodePublisherMetrics; protected log: Logger; @@ -69,23 +66,6 @@ export class ProverNodePublisher { return this.rollupContract; } - /** - * Calling `interrupt` will cause any in progress call to `publishRollup` to return `false` asap. - * Be warned, the call may return false even if the tx subsequently gets successfully mined. - * In practice this shouldn't matter, as we'll only ever be calling `interrupt` when we know it's going to fail. - * A call to `restart` is required before you can continue publishing. - */ - public interrupt() { - this.interrupted = true; - this.l1TxUtils.interrupt(); - } - - /** Restarts the publisher after calling `interrupt`. */ - public restart() { - this.interrupted = false; - this.l1TxUtils.restart(); - } - public getSenderAddress() { return this.l1TxUtils.getSenderAddress(); } @@ -98,107 +78,53 @@ export class ProverNodePublisher { proof: Proof; batchedBlobInputs: BatchedBlob; attestations: ViemCommitteeAttestation[]; + /** Wall-clock deadline (proof-submission window end) past which the L1 tx should stop retrying. 
*/ + deadline?: Date; }): Promise { const { epochNumber, fromCheckpoint, toCheckpoint } = args; const ctx = { epochNumber, fromCheckpoint, toCheckpoint }; - if (!this.interrupted) { - if (!(await this.waitUntilStartBuildsOnProven(args))) { - this.log.verbose('Checkpoint data syncing interrupted', ctx); - return false; - } - - const timer = new Timer(); - // Validate epoch proof range and hashes are correct before submitting - await this.validateEpochProofSubmission(args); - - const txReceipt = await this.sendSubmitEpochProofTx(args); - if (!txReceipt) { - this.log.error(`Failed to mine submitEpochProof tx`, undefined, ctx); - return false; - } - - try { - this.metrics.recordSenderBalance( - await this.l1TxUtils.getSenderBalance(), - this.l1TxUtils.getSenderAddress().toString(), - ); - } catch (err) { - this.log.warn(`Failed to record the ETH balance of the prover node: ${err}`); - } - - // Tx was mined successfully - if (txReceipt.status === 'success') { - const tx = await this.l1TxUtils.getTransactionStats(txReceipt.transactionHash); - const stats: L1PublishProofStats = { - gasPrice: txReceipt.effectiveGasPrice, - gasUsed: txReceipt.gasUsed, - transactionHash: txReceipt.transactionHash, - calldataGas: tx!.calldataGas, - calldataSize: tx!.calldataSize, - sender: tx!.sender, - blobDataGas: 0n, - blobGasUsed: 0n, - eventName: 'proof-published-to-l1', - }; - this.log.info(`Published epoch proof to L1 rollup contract`, { ...stats, ...ctx }); - this.metrics.recordSubmitProof(timer.ms(), stats); - return true; - } + const timer = new Timer(); + // Validate epoch proof range and hashes are correct before submitting + await this.validateEpochProofSubmission(args); - this.metrics.recordFailedTx(); - this.log.error(`Rollup submitEpochProof tx reverted ${txReceipt.transactionHash}`, undefined, ctx); + const txReceipt = await this.sendSubmitEpochProofTx(args); + if (!txReceipt) { + this.log.error(`Failed to mine submitEpochProof tx`, undefined, ctx); + return false; } - 
this.log.verbose('Checkpoint data syncing interrupted', ctx); - return false; - } + try { + this.metrics.recordSenderBalance( + await this.l1TxUtils.getSenderBalance(), + this.l1TxUtils.getSenderAddress().toString(), + ); + } catch (err) { + this.log.warn(`Failed to record the ETH balance of the prover node: ${err}`); + } - private async waitUntilStartBuildsOnProven(args: { epochNumber: EpochNumber; fromCheckpoint: CheckpointNumber }) { - const { epochNumber, fromCheckpoint } = args; - const provenCheckpoint = await this.getProvenCheckpoint(); - if (this.isStartBuildingOnProven(fromCheckpoint, provenCheckpoint)) { + // Tx was mined successfully + if (txReceipt.status === 'success') { + const tx = await this.l1TxUtils.getTransactionStats(txReceipt.transactionHash); + const stats: L1PublishProofStats = { + gasPrice: txReceipt.effectiveGasPrice, + gasUsed: txReceipt.gasUsed, + transactionHash: txReceipt.transactionHash, + calldataGas: tx!.calldataGas, + calldataSize: tx!.calldataSize, + sender: tx!.sender, + blobDataGas: 0n, + blobGasUsed: 0n, + eventName: 'proof-published-to-l1', + }; + this.log.info(`Published epoch proof to L1 rollup contract`, { ...stats, ...ctx }); + this.metrics.recordSubmitProof(timer.ms(), stats); return true; } - const timeout = await this.getSecondsUntilProofSubmissionWindowEnd(epochNumber); - this.log.info(`Waiting for proven checkpoint to reach proof start`, { - epochNumber, - fromCheckpoint, - provenCheckpoint, - timeout, - }); - - await retryUntil( - async () => { - if (this.interrupted) { - return true; - } - - const proven = await this.getProvenCheckpoint(); - this.log.verbose(`Proven checkpoint is at ${proven} (waiting for ${fromCheckpoint - 1})`, { epochNumber }); - return this.isStartBuildingOnProven(fromCheckpoint, proven) ? 
true : undefined; - }, - `proven checkpoint to reach ${fromCheckpoint - 1}`, - timeout, - 4, - ); - - return !this.interrupted; - } - - private async getProvenCheckpoint() { - return (await this.rollupContract.getTips()).proven; - } - - private isStartBuildingOnProven(fromCheckpoint: CheckpointNumber, provenCheckpoint: CheckpointNumber) { - return fromCheckpoint - 1 <= provenCheckpoint; - } - - private async getSecondsUntilProofSubmissionWindowEnd(epochNumber: EpochNumber) { - const deadline = getProofSubmissionDeadlineTimestamp(epochNumber, await this.rollupContract.getRollupConstants()); - const now = BigInt(Math.floor(Date.now() / 1000)); - return Math.max(Number(deadline - now), 0.001); + this.metrics.recordFailedTx(); + this.log.error(`Rollup submitEpochProof tx reverted ${txReceipt.transactionHash}`, undefined, ctx); + return false; } private async validateEpochProofSubmission(args: { @@ -339,6 +265,7 @@ export class ProverNodePublisher { private async sendSubmitEpochProofTx(args: { fromCheckpoint: CheckpointNumber; toCheckpoint: CheckpointNumber; + deadline?: Date; publicInputs: RootRollupPublicInputs; proof: Proof; batchedBlobInputs: BatchedBlob; @@ -357,7 +284,10 @@ export class ProverNodePublisher { args: txArgs, }); try { - const { receipt } = await this.l1TxUtils.sendAndMonitorTransaction({ to: this.rollupContract.address, data }); + const { receipt } = await this.l1TxUtils.sendAndMonitorTransaction( + { to: this.rollupContract.address, data }, + { txTimeoutAt: args.deadline }, + ); if (receipt.status !== 'success') { const errorMsg = await this.l1TxUtils.tryGetErrorFromRevertedTx( data, diff --git a/yarn-project/prover-node/src/prover-node.test.ts b/yarn-project/prover-node/src/prover-node.test.ts index dbed4994fac7..d3abd72b5cee 100644 --- a/yarn-project/prover-node/src/prover-node.test.ts +++ b/yarn-project/prover-node/src/prover-node.test.ts @@ -1,344 +1,646 @@ -import { RollupContract } from '@aztec/ethereum/contracts'; -import { BlockNumber, 
CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { timesParallel } from '@aztec/foundation/collection'; +import type { RollupContract } from '@aztec/ethereum/contracts'; +import { BlockNumber, CheckpointNumber, EpochNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { Fr } from '@aztec/foundation/curves/bn254'; import { EthAddress } from '@aztec/foundation/eth-address'; -import { promiseWithResolvers } from '@aztec/foundation/promise'; -import { retryUntil } from '@aztec/foundation/retry'; -import { sleep } from '@aztec/foundation/sleep'; -import type { P2PClient, TxProvider } from '@aztec/p2p'; -import type { PublicProcessorFactory } from '@aztec/simulator/server'; -import { - CommitteeAttestation, - GENESIS_BLOCK_HEADER_HASH, - GENESIS_CHECKPOINT_HEADER_HASH, - type L2BlockSource, -} from '@aztec/stdlib/block'; -import { Checkpoint, type CheckpointData, type PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import { L2Block, type L2BlockSource, type L2BlockStreamEvent, type L2Tips } from '@aztec/stdlib/block'; +import type { Checkpoint, PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; import type { ContractDataSource } from '@aztec/stdlib/contract'; import { EmptyL1RollupConstants } from '@aztec/stdlib/epoch-helpers'; -import { - type EpochProverManager, - type EpochProvingJobState, - type MerkleTreeWriteOperations, - WorldStateRunningState, - type WorldStateSynchronizer, -} from '@aztec/stdlib/interfaces/server'; +import type { EpochProverManager, ITxProvider, WorldStateSynchronizer } from '@aztec/stdlib/interfaces/server'; import type { L1ToL2MessageSource } from '@aztec/stdlib/messaging'; -import { BlockHeader, type Tx, TxHash } from '@aztec/stdlib/tx'; import { L1Metrics } from '@aztec/telemetry-client'; -import { type MockProxy, mock } from 'jest-mock-extended'; +import { jest } from '@jest/globals'; +import { mock } from 
'jest-mock-extended'; -import type { SpecificProverNodeConfig } from './config.js'; -import type { EpochProvingJobData } from './job/epoch-proving-job-data.js'; -import type { EpochProvingJob } from './job/epoch-proving-job.js'; -import { EpochMonitor } from './monitors/epoch-monitor.js'; +import type { ProofPublishingService } from './proof-publishing-service.js'; import type { ProverNodePublisher } from './prover-node-publisher.js'; import { ProverNode } from './prover-node.js'; -import { ProverPublisherFactory } from './prover-publisher-factory.js'; - -describe('prover-node', () => { - // Prover node dependencies - let prover: MockProxy; - let publisher: MockProxy; - let l2BlockSource: MockProxy; - let l1ToL2MessageSource: MockProxy; - let contractDataSource: MockProxy; - let worldState: MockProxy; - let p2p: MockProxy; - let txProvider: MockProxy; - let epochMonitor: MockProxy; - let config: SpecificProverNodeConfig; - let rollupContract: MockProxy; - let publisherFactory: MockProxy; - let l1Metrics: MockProxy; - - // L1 genesis time - let l1GenesisTime: number; - - // Subject under test +import type { ProverPublisherFactory } from './prover-publisher-factory.js'; +import { SessionManager } from './session-manager.js'; + +describe('ProverNode', () => { let proverNode: TestProverNode; - // Checkpoints returned by the archiver - let checkpoints: Checkpoint[]; - let publishedCheckpoints: PublishedCheckpoint[]; - let checkpointData: CheckpointData[]; - let lastPublishedCheckpoint: PublishedCheckpoint; - let previousBlockHeader: BlockHeader; + let prover: ReturnType>; + let publisherFactory: ReturnType>; + let publisher: ReturnType>; + let l2BlockSource: ReturnType>; + let l1ToL2MessageSource: ReturnType>; + let contractDataSource: ReturnType>; + let worldState: ReturnType>; + let txProvider: ReturnType>; + let rollupContract: ReturnType>; + let l1Metrics: ReturnType>; + let sessionManager: ReturnType>; + let publishingService: ReturnType>; + + // epochDuration=1 ⇒ 
slot N lives in epoch N. proofSubmissionEpochs=1 ⇒ deadline for + // epoch E is the start of epoch E+2, so epoch E expires once latestEpoch >= E+2. + const l1Constants = { ...EmptyL1RollupConstants, epochDuration: 1, proofSubmissionEpochs: 1 }; - // Address of the publisher - let address: EthAddress; + beforeEach(() => { + prover = mock(); + publisherFactory = mock(); + publisher = mock(); + l2BlockSource = mock(); + l1ToL2MessageSource = mock(); + contractDataSource = mock(); + worldState = mock(); + txProvider = mock(); + rollupContract = mock(); + l1Metrics = mock(); + sessionManager = mock(); + publishingService = mock(); - // List of all jobs ever created by the test prover node and their dependencies - let jobs: { job: MockProxy; epochNumber: EpochNumber }[]; + prover.getProverId.mockReturnValue(EthAddress.ZERO); + l2BlockSource.getGenesisBlockHash.mockReturnValue('0x00' as any); + l2BlockSource.getL1Constants.mockResolvedValue(l1Constants); + l2BlockSource.getL2Tips.mockResolvedValue({} as L2Tips); + publisherFactory.create.mockResolvedValue(publisher); - const createProverNode = () => - new TestProverNode( + proverNode = new TestProverNode( prover, publisherFactory, l2BlockSource, l1ToL2MessageSource, contractDataSource, worldState, - p2p, - epochMonitor, + { getTxProvider: () => txProvider }, rollupContract, l1Metrics, - config, + {}, ); + // Inject the session manager and publishing service without going through start() — + // start() wires the publisher + block stream + ticker, none of which these unit tests + // exercise. 
+ proverNode.setSessionManager(sessionManager); + proverNode.setPublishingService(publishingService); + }); - beforeEach(async () => { - prover = mock({ - getProverId: () => EthAddress.random(), - }); - publisher = mock(); - l2BlockSource = mock(); - l1ToL2MessageSource = mock(); - contractDataSource = mock(); - worldState = mock(); - epochMonitor = mock(); - txProvider = mock(); + // ---------------- event dispatch ---------------- - rollupContract = mock(); - publisherFactory = mock(); - publisherFactory.create.mockResolvedValue(publisher); + it('dispatches chain-checkpointed to handleCheckpointEvent', async () => { + setupNotFullyProven(); + const checkpoint = makeCheckpoint(1, 1, 1); + const event: L2BlockStreamEvent = { + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(checkpoint), + block: { number: BlockNumber(1), hash: '0x01' }, + }; - l1Metrics = mock(); + await proverNode.handleBlockStreamEvent(event); - p2p = mock(); - p2p.getTxProvider.mockReturnValue(txProvider); - - config = { - proverNodeMaxPendingJobs: 3, - proverNodePollingIntervalMs: 10, - proverNodeMaxParallelBlocksPerEpoch: 32, - txGatheringIntervalMs: 100, - txGatheringBatchSize: 10, - txGatheringMaxParallelRequestsPerNode: 5, - proverNodeFailedEpochStore: undefined, - txGatheringTimeoutMs: 1000, - proverNodeEpochProvingDelayMs: undefined, - proverNodeDisableProofPublish: false, - }; + expect(proverNode.getCheckpointStore().listAll().length).toBe(1); + expect(sessionManager.onCheckpointAdded).toHaveBeenCalledWith(EpochNumber(1)); + }); - // World state returns a new mock db every time it is asked to fork - worldState.fork.mockImplementation(() => Promise.resolve(mock())); - worldState.status.mockResolvedValue({ - state: WorldStateRunningState.RUNNING, - syncSummary: { - latestBlockNumber: BlockNumber(1), - latestBlockHash: '', - finalizedBlockNumber: BlockNumber.ZERO, - oldestHistoricBlockNumber: BlockNumber.ZERO, - treesAreSynched: true, - }, + it('dispatches chain-pruned 
through markPrunedAfter and notifies the session manager only when affected', async () => { + // No registered checkpoints — nothing to prune. + await proverNode.handleBlockStreamEvent({ + type: 'chain-pruned', + checkpoint: { number: CheckpointNumber(0), hash: '0x00' }, + block: { number: BlockNumber(0), hash: '0x00' }, + }); + expect(sessionManager.onPrune).not.toHaveBeenCalled(); + + // Register a checkpoint, then prune. + setupNotFullyProven(); + await proverNode.handleBlockStreamEvent({ + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(2, 2, 2)), + block: { number: BlockNumber(2), hash: '0x02' }, }); - // Publisher returns its sender address - address = EthAddress.random(); - publisher.getSenderAddress.mockReturnValue(address); + await proverNode.handleBlockStreamEvent({ + type: 'chain-pruned', + checkpoint: { number: CheckpointNumber(1), hash: '0x01' }, + block: { number: BlockNumber(1), hash: '0x01' }, + }); + expect(sessionManager.onPrune).toHaveBeenCalledWith([EpochNumber(2)]); + }); - // We create 3 fake checkpoints with 1 block and 1 tx effect each - const startBlockNumber = 20; - checkpoints = await timesParallel( - 3, - async i => - await Checkpoint.random(CheckpointNumber(i + 1), { numBlocks: 1, startBlockNumber: startBlockNumber + i }), - ); - previousBlockHeader = BlockHeader.random({ blockNumber: BlockNumber(startBlockNumber - 1) }); - lastPublishedCheckpoint = { - checkpoint: checkpoints.at(-1)!, - attestations: [CommitteeAttestation.random()], - } as PublishedCheckpoint; - - publishedCheckpoints = [ - ...checkpoints.slice(0, -1).map(cp => ({ checkpoint: cp, attestations: [] }) as unknown as PublishedCheckpoint), - lastPublishedCheckpoint, - ]; - - l1GenesisTime = Math.floor(Date.now() / 1000) - 3600; - checkpointData = checkpoints.map(checkpoint => ({ checkpointNumber: checkpoint.number }) as CheckpointData); - - l2BlockSource.getL1Constants.mockResolvedValue({ ...EmptyL1RollupConstants, l1GenesisTime: 
BigInt(l1GenesisTime) }); - l2BlockSource.getCheckpoints.mockResolvedValue(publishedCheckpoints); - l2BlockSource.getCheckpointsData.mockResolvedValue(checkpointData); - const latestBlockNumber = BlockNumber.fromCheckpointNumber(checkpoints.at(-1)!.number); - const latestHash = checkpoints.at(-1)!.hash().toString(); - const genesisTipId = { - block: { number: BlockNumber.ZERO, hash: GENESIS_BLOCK_HEADER_HASH.toString() }, - checkpoint: { number: CheckpointNumber.ZERO, hash: GENESIS_CHECKPOINT_HEADER_HASH.toString() }, - }; - l2BlockSource.getL2Tips.mockResolvedValue({ - proposed: { number: latestBlockNumber, hash: latestHash }, - checkpointed: { - block: { number: latestBlockNumber, hash: latestHash }, - checkpoint: { number: checkpoints.at(-1)!.number, hash: latestHash }, - }, - proposedCheckpoint: { - block: { number: latestBlockNumber, hash: latestHash }, - checkpoint: { number: checkpoints.at(-1)!.number, hash: latestHash }, - }, - proven: genesisTipId, - finalized: genesisTipId, + it('dispatches chain-proven to publishingService.onChainProven', async () => { + await proverNode.handleBlockStreamEvent({ + type: 'chain-proven', + block: { number: BlockNumber(7), hash: '0x07' }, }); - l2BlockSource.getBlockData.mockImplementation(query => - Promise.resolve( - 'number' in query && query.number === checkpoints[0].blocks[0].number - 1 - ? ({ header: previousBlockHeader } as any) - : undefined, - ), - ); + expect(publishingService.onChainProven).toHaveBeenCalledWith(BlockNumber(7)); + }); - // L1 to L2 message source returns no messages - l1ToL2MessageSource.getL1ToL2Messages.mockResolvedValue([]); + it('expires elapsed epochs on every block-stream event: releases chonk cache, reaps store', async () => { + // Latest synced L2 slot = 4 ⇒ latestEpoch = 4 ⇒ epochs 0..2 are past their submission + // window (deadline = E+2 with proofSubmissionEpochs=1). 
+ l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(4)); - // Tx provider plays along and returns a tx whenever requested - txProvider.getTxsForBlock.mockImplementation(block => - Promise.resolve({ txs: block.body.txEffects.map(tx => makeTx(tx.txHash)), missingTxs: [] }), - ); + const expiredBlock = await L2Block.random(BlockNumber(1), { txsPerBlock: 1 }); + l2BlockSource.getCheckpointsData.mockResolvedValue([{ startBlock: BlockNumber(1), blockCount: 1 } as any]); + l2BlockSource.getBlocks.mockResolvedValue([expiredBlock]); - jobs = []; - }); + const txHash = expiredBlock.body.txEffects[0].txHash.toString(); + const cache = proverNode.getChonkCache(); + void cache.getOrCompute(txHash, () => Promise.resolve({} as any)); + expect(cache.get(txHash)).toBeDefined(); - const makeTx = (txHash: TxHash): Tx => ({ getTxHash: () => txHash, txHash }) as Tx; + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + // Any block-stream event is enough to trigger the expiry sweep. + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); - afterEach(async () => { - await proverNode.stop(); + expect(cache.get(txHash)).toBeUndefined(); + // Three expired epochs ⇒ reapExpired called once per epoch. 
+ expect(reapSpy.mock.calls.map(([e]) => Number(e))).toEqual([0, 1, 2]); }); - beforeEach(() => { - proverNode = createProverNode(); + it('checkExpiry advances the high-water mark — does not re-reap already-expired epochs', async () => { + l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(4)); + l2BlockSource.getCheckpointsData.mockResolvedValue([]); + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); + expect(reapSpy.mock.calls.length).toBe(3); + reapSpy.mockClear(); + + // Same latest slot ⇒ nothing new should expire. + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); + expect(reapSpy).not.toHaveBeenCalled(); }); - it('starts a proof on a finished epoch', async () => { - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - expect(jobs[0].epochNumber).toEqual(EpochNumber.fromBigInt(10n)); - expect(jobs[0].job.getDeadline()).toEqual(new Date((l1GenesisTime + 10 + 2) * 1000)); - expect(proverNode.totalJobCount).toEqual(1); + it('checkExpiry no-ops when archiver has no synced slot yet', async () => { + l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(undefined); + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); + expect(reapSpy).not.toHaveBeenCalled(); }); - it('requests a publisher for each epoch', async () => { - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - expect(publisherFactory.create).toHaveBeenCalledTimes(1); + it('propagates a checkpoint registration failure and leaves the tips store unadvanced (A-1041)', async () => { + setupNotFullyProven(); + // Registration fails: worldState.syncImmediate (inside 
collectRegisterData) rejects. The + // failure propagates rather than being swallowed, so the checkpoint is never registered and + // the tips stay put for the L2BlockStream to retry. + worldState.syncImmediate.mockRejectedValue(new Error('boom')); + + const event: L2BlockStreamEvent = { + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(1, 1, 1)), + block: { number: BlockNumber(1), hash: '0x01' }, + }; + + await expect(proverNode.handleBlockStreamEvent(event)).rejects.toThrow('boom'); + + // Tips left unadvanced; nothing was registered and the session manager wasn't notified. + expect(await proverNode.getTipsStore().getL2BlockHash(1)).toBeUndefined(); + expect(proverNode.getCheckpointStore().listAll()).toHaveLength(0); + expect(sessionManager.onCheckpointAdded).not.toHaveBeenCalled(); }); - it('does not start a proof if there are no checkpoints in the epoch', async () => { - l2BlockSource.getCheckpoints.mockResolvedValue([]); - l2BlockSource.getCheckpointsData.mockResolvedValue([]); - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - expect(proverNode.totalJobCount).toEqual(0); + it('leaves the tips store unadvanced when a handler propagates an error (A-1041)', async () => { + setupNotFullyProven(); + // Registration succeeds, but the expiry sweep throws — a failure that propagates before the + // tips-store update, so the error surfaces to the L2BlockStream and the tips stay put. + l2BlockSource.getSyncedL2SlotNumber.mockRejectedValue(new Error('archiver down')); + + const event: L2BlockStreamEvent = { + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(1, 1, 1)), + block: { number: BlockNumber(1), hash: '0x01' }, + }; + + await expect(proverNode.handleBlockStreamEvent(event)).rejects.toThrow('archiver down'); + + // Tips left unadvanced so the L2BlockStream re-emits this event on its next poll. 
+ expect(await proverNode.getTipsStore().getL2BlockHash(1)).toBeUndefined(); }); - it('gathers txs via the p2p client tx provider', async () => { - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - // The prover node must route tx gathering through the shared p2p client's tx provider - expect(p2p.getTxProvider).toHaveBeenCalled(); - // One call per block across all checkpoints in the epoch - const totalBlocks = checkpoints.flatMap(c => c.blocks).length; - expect(txProvider.getTxsForBlock).toHaveBeenCalledTimes(totalBlocks); + // ---------------- handleCheckpointEvent gating ---------------- + + it('skips registration when the epoch is already fully proven on L1', async () => { + // Proven block sits at the last block of epoch 1 (epochDuration=1, slot=1). Block 2 must be + // absent so isProvenBlockLastOfItsEpoch falls through to isEpochComplete and reports the + // proven tip as the epoch's last block. + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(1)); + l2BlockSource.getBlockData.mockImplementation((query: any) => + Promise.resolve(Number(query.number) === 1 ? 
({ header: { getSlot: () => SlotNumber(1) } } as any) : undefined), + ); + l2BlockSource.isEpochComplete.mockResolvedValue(true); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(1, 1, 1)), + block: { number: BlockNumber(1), hash: '0x01' }, + }); + + expect(proverNode.getCheckpointStore().listAll().length).toBe(0); + expect(sessionManager.onCheckpointAdded).not.toHaveBeenCalled(); }); - it('does not start a proof if there is a tx missing from coordinator', async () => { - txProvider.getTxsForBlock.mockResolvedValue({ missingTxs: [TxHash.random()], txs: [] }); - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - expect(proverNode.totalJobCount).toEqual(0); + it('content-addresses the prover by the checkpoint archive root', async () => { + setupNotFullyProven(); + const archiveRoot = Fr.random(); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(1, 1, 2, archiveRoot)), + block: { number: BlockNumber(2), hash: '0x02' }, + }); + + const prover = proverNode.getCheckpointStore().listAll()[0]; + expect(prover.id).toContain(archiveRoot.toString()); }); - it('does not prove the same epoch twice', async () => { - const firstJob = promiseWithResolvers(); - proverNode.nextJobRun = () => firstJob.promise; - proverNode.nextJobState = 'processing'; - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); + // ---------------- forwarders ---------------- - firstJob.resolve(); - expect(proverNode.totalJobCount).toEqual(1); + it('startProof forwards to the session manager and returns the job id', async () => { + sessionManager.startProof.mockResolvedValue('job-5'); + await expect(proverNode.startProof(EpochNumber(5))).resolves.toBe('job-5'); + expect(sessionManager.startProof).toHaveBeenCalledWith(EpochNumber(5)); }); 
- it('does not start duplicate proofs from concurrent RPC calls', async () => { - await Promise.all([ - proverNode.startProof(EpochNumber.fromBigInt(10n)), - proverNode.startProof(EpochNumber.fromBigInt(10n)), + it('getJobs forwards to the session manager', async () => { + sessionManager.getJobs.mockReturnValue([ + { uuid: 'a', status: 'awaiting-checkpoints', epochNumber: EpochNumber(3) }, ]); + const jobs = await proverNode.getJobs(); + expect(jobs).toEqual([{ uuid: 'a', status: 'awaiting-checkpoints', epochNumber: EpochNumber(3) }]); + }); - expect(proverNode.totalJobCount).toEqual(1); + it('startProof throws when the session manager has not been constructed yet', async () => { + proverNode.clearSessionManager(); + await expect(proverNode.startProof(EpochNumber(5))).rejects.toThrow(/not started/); }); - it('does not start duplicate proofs from concurrent monitor and RPC calls', async () => { - await Promise.all([ - proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)), - proverNode.startProof(EpochNumber.fromBigInt(10n)), - ]); + it('getJobs returns an empty array when the session manager has not been constructed', async () => { + proverNode.clearSessionManager(); + await expect(proverNode.getJobs()).resolves.toEqual([]); + }); + + // ---------------- handleBlockStreamEvent: blocks-added is a no-op + still triggers expiry ---------------- - expect(proverNode.totalJobCount).toEqual(1); + it("'blocks-added' invokes no event handler but still runs the expiry sweep", async () => { + // latestSlot=4 ⇒ epochs 0..2 expire. + l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(4)); + l2BlockSource.getCheckpointsData.mockResolvedValue([]); + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + // Use a real (random) L2Block so the tips-store handler doesn't choke on an empty array. 
+ const block = await L2Block.random(BlockNumber(1)); + await proverNode.handleBlockStreamEvent({ type: 'blocks-added', blocks: [block] }); + + // No checkpoint, prune, or proven handler should have fired. + expect(sessionManager.onCheckpointAdded).not.toHaveBeenCalled(); + expect(sessionManager.onPrune).not.toHaveBeenCalled(); + expect(publishingService.onChainProven).not.toHaveBeenCalled(); + // But the expiry sweep ran. + expect(reapSpy.mock.calls.map(([e]) => Number(e))).toEqual([0, 1, 2]); }); - it('starts a full proof when an active job only covers a partial epoch', async () => { - const partialJob = promiseWithResolvers(); - l2BlockSource.getCheckpoints - .mockResolvedValueOnce(publishedCheckpoints.slice(0, 2)) - .mockResolvedValue(publishedCheckpoints); - proverNode.nextJobRun = () => partialJob.promise; - proverNode.nextJobState = 'processing'; - - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - const handledWhilePartialJobActive = await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - - expect(handledWhilePartialJobActive).toBe(true); - expect(proverNode.totalJobCount).toEqual(2); - expect(jobs[0].job.getProvingData().checkpoints.map(checkpoint => checkpoint.number)).toEqual( - checkpoints.slice(0, 2).map(checkpoint => checkpoint.number), - ); - expect(jobs[1].job.getProvingData().checkpoints.map(checkpoint => checkpoint.number)).toEqual( - checkpoints.map(checkpoint => checkpoint.number), - ); + // ---------------- checkEpochExpiry: latestEpoch < offset is a no-op ---------------- + + it('checkEpochExpiry no-ops when latestEpoch is below the submission-window offset', async () => { + // proofSubmissionEpochs=1 ⇒ offset=2. latestSlot=1 ⇒ latestEpoch=1 < 2. 
+ l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(1)); + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); + + expect(reapSpy).not.toHaveBeenCalled(); + // High-water mark stays untouched. + expect(proverNode.getLastExpiredEpoch()).toBeUndefined(); + }); + + // ---------------- expireEpoch swallows getCheckpointsData errors ---------------- - partialJob.resolve(); + it('expireEpoch still reaps the store when getCheckpointsData throws', async () => { + // Three epochs would expire (latestSlot=4 ⇒ epochs 0..2). getCheckpointsData throws for + // every call, but reapExpired must still be invoked for each epoch and the high-water + // mark must still advance. + l2BlockSource.getSyncedL2SlotNumber.mockResolvedValue(SlotNumber(4)); + l2BlockSource.getCheckpointsData.mockRejectedValue(new Error('archiver unavailable')); + const reapSpy = jest.spyOn(proverNode.getCheckpointStore(), 'reapExpired'); + + await proverNode.handleBlockStreamEvent({ + type: 'chain-finalized', + block: { number: BlockNumber(1), hash: '0x01' }, + }); + + expect(reapSpy.mock.calls.map(([e]) => Number(e))).toEqual([0, 1, 2]); + expect(proverNode.getLastExpiredEpoch()).toEqual(EpochNumber(2)); }); - it('restarts a proof on a reorg', async () => { - proverNode.nextJobState = 'reorg'; - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - await retryUntil(() => proverNode.totalJobCount === 2, 'job retried', 5); - expect(proverNode.totalJobCount).toEqual(2); + // ---------------- handlePruneEvent dedupes affected epochs ---------------- + + it('handlePruneEvent dedupes affected epochs when multiple provers share one epoch', async () => { + // Suite default is epochDuration=1 (one slot per epoch ⇒ at most one prover per epoch). 
+ // To exercise dedup we need an epoch that holds multiple slots — override l1Constants + // for this test to epochDuration=2 so slots 6 and 7 both live in epoch 3. + const l1ConstantsTwo = { ...EmptyL1RollupConstants, epochDuration: 2, proofSubmissionEpochs: 1 }; + l2BlockSource.getL1Constants.mockResolvedValue(l1ConstantsTwo); + setupRegistrationSuccess(); + + // Register two checkpoints at slots 6 and 7 (both in epoch 3). + await proverNode.handleBlockStreamEvent({ + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(1, 6, 6)), + block: { number: BlockNumber(6), hash: '0x06' }, + }); + await proverNode.handleBlockStreamEvent({ + type: 'chain-checkpointed', + checkpoint: makePublishedCheckpoint(makeCheckpoint(2, 7, 7)), + block: { number: BlockNumber(7), hash: '0x07' }, + }); + expect(proverNode.getCheckpointStore().listAll().length).toBe(2); + + // Pruning above checkpoint 0 marks both as pruned — onPrune must receive [EpochNumber(3)], + // not [3, 3]. + sessionManager.onPrune.mockClear(); + await proverNode.handleBlockStreamEvent({ + type: 'chain-pruned', + checkpoint: { number: CheckpointNumber(0), hash: '0x00' }, + block: { number: BlockNumber(0), hash: '0x00' }, + }); + expect(sessionManager.onPrune).toHaveBeenCalledTimes(1); + expect(sessionManager.onPrune).toHaveBeenCalledWith([EpochNumber(3)]); }); - it('does not restart a proof on an error', async () => { - proverNode.nextJobState = 'failed'; - await proverNode.handleEpochReadyToProve(EpochNumber.fromBigInt(10n)); - await sleep(1000); - expect(proverNode.totalJobCount).toEqual(1); + // ---------------- isEpochFullyProven branches ---------------- + + describe('isEpochFullyProven', () => { + it('returns false when no block is proven yet', async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(undefined); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(3), l1Constants)).resolves.toBe(false); + }); + + it('returns false when the proven block has no header 
in the archiver', async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockResolvedValue(undefined); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(3), l1Constants)).resolves.toBe(false); + }); + + it("returns true for any epoch strictly below the proven tip's epoch", async () => { + // Proven block at slot 5 ⇒ provenEpoch = 5 (epochDuration=1). + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockResolvedValue({ header: { getSlot: () => SlotNumber(5) } } as any); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(3), l1Constants)).resolves.toBe(true); + }); + + it("returns false for any epoch strictly above the proven tip's epoch", async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockResolvedValue({ header: { getSlot: () => SlotNumber(5) } } as any); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(7), l1Constants)).resolves.toBe(false); + }); + + it('returns true on the equality case when the proven block is the last of its epoch (next block is in a later epoch)', async () => { + // provenEpoch=2, next block in epoch 3 ⇒ last of epoch. 
+ const l1ConstantsTwo = { ...EmptyL1RollupConstants, epochDuration: 2 }; + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); // epoch 2 + } + if (q.number === 6) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(6) } } as any); // epoch 3 + } + return Promise.resolve(undefined); + }); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(2), l1ConstantsTwo)).resolves.toBe(true); + }); + + it('returns false on the equality case when the proven block is mid-epoch (next block is in the same epoch)', async () => { + const l1ConstantsTwo = { ...EmptyL1RollupConstants, epochDuration: 2 }; + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(4) } } as any); // epoch 2 + } + if (q.number === 6) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); // also epoch 2 + } + return Promise.resolve(undefined); + }); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(2), l1ConstantsTwo)).resolves.toBe(false); + }); + + it('falls back to isEpochComplete when there is no next-block header', async () => { + // No next-block header ⇒ isProvenBlockLastOfItsEpoch defers to isEpochComplete. 
+ l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); + } + return Promise.resolve(undefined); // no next block yet + }); + l2BlockSource.isEpochComplete.mockResolvedValueOnce(true); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(5), l1Constants)).resolves.toBe(true); + + l2BlockSource.isEpochComplete.mockResolvedValueOnce(false); + await expect(proverNode.callIsEpochFullyProven(EpochNumber(5), l1Constants)).resolves.toBe(false); + }); }); - class TestProverNode extends ProverNode { - public totalJobCount = 0; - public nextJobState: EpochProvingJobState = 'completed'; - public nextJobRun: () => Promise = () => Promise.resolve(); - - protected override doCreateEpochProvingJob( - data: EpochProvingJobData, - deadline: Date | undefined, - _publicProcessorFactory: PublicProcessorFactory, - ): EpochProvingJob { - const state = this.nextJobState; - this.nextJobState = 'completed'; - const run = this.nextJobRun; - this.nextJobRun = () => Promise.resolve(); - const job = mock({ - run, - getState: () => state, - getEpochNumber: () => data.epochNumber, - getDeadline: () => deadline, - getProvingData: () => data, + // ---------------- computeStartupState branches ---------------- + + describe('computeStartupState', () => { + it('returns starting block 1 and no fully-proven epoch when nothing is proven', async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(undefined); + await expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(1), + lastFullyProvenEpoch: undefined, + }); + }); + + it('returns provenBlock+1 and no fully-proven epoch when the proven block has no archiver header', async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockResolvedValue(undefined); + await 
expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(6), + lastFullyProvenEpoch: undefined, + }); + }); + + it('returns provenBlock+1 and provenEpoch when the proven block is the last of its epoch', async () => { + // epochDuration=1: slot 5 ⇒ epoch 5; next slot 6 ⇒ epoch 6 > 5 ⇒ last of epoch. + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); + } + if (q.number === 6) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(6) } } as any); + } + return Promise.resolve(undefined); + }); + await expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(6), + lastFullyProvenEpoch: EpochNumber(5), + }); + }); + + it('returns provenBlock+1 and provenEpoch via the isEpochComplete fallback when there is no next-block header', async () => { + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); + } + return Promise.resolve(undefined); + }); + l2BlockSource.isEpochComplete.mockResolvedValue(true); + await expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(6), + lastFullyProvenEpoch: EpochNumber(5), + }); + }); + + it("returns the partially-proven epoch's first block and provenEpoch-1 when proven is mid-epoch", async () => { + // epochDuration=2: slot 4 ⇒ epoch 2; next slot 5 ⇒ same epoch ⇒ mid-epoch.
+ const l1ConstantsTwo = { ...EmptyL1RollupConstants, epochDuration: 2, proofSubmissionEpochs: 1 }; + l2BlockSource.getL1Constants.mockResolvedValue(l1ConstantsTwo); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(5)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 5) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(4) } } as any); // epoch 2 + } + if (q.number === 6) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(5) } } as any); // epoch 2 + } + return Promise.resolve(undefined); + }); + l2BlockSource.getCheckpointsData.mockResolvedValue([{ startBlock: BlockNumber(3) } as any]); + + await expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(3), + lastFullyProvenEpoch: EpochNumber(1), + }); + }); + + it('returns lastFullyProvenEpoch=undefined when proven is mid-epoch within epoch 0', async () => { + // The provenEpoch=0 edge case: there is no "previous" epoch to claim as fully proven. 
+ const l1ConstantsTwo = { ...EmptyL1RollupConstants, epochDuration: 2, proofSubmissionEpochs: 1 }; + l2BlockSource.getL1Constants.mockResolvedValue(l1ConstantsTwo); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(1)); + l2BlockSource.getBlockData.mockImplementation((q: any) => { + if (q.number === 1) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(0) } } as any); // epoch 0 + } + if (q.number === 2) { + return Promise.resolve({ header: { getSlot: () => SlotNumber(1) } } as any); // still epoch 0 + } + return Promise.resolve(undefined); }); - job.getId.mockReturnValue(jobs.length.toString()); - jobs.push({ epochNumber: data.epochNumber, job }); - this.totalJobCount++; - return job; - } - - public override triggerMonitors() { - return super.triggerMonitors(); - } + l2BlockSource.getCheckpointsData.mockResolvedValue([{ startBlock: BlockNumber(1) } as any]); + + await expect(proverNode.callComputeStartupState()).resolves.toEqual({ + startingBlock: BlockNumber(1), + lastFullyProvenEpoch: undefined, + }); + }); + }); + + // ---------------- helpers ---------------- + + /** Bypass `isEpochFullyProven` so checkpoint events register normally. */ + function setupNotFullyProven() { + l2BlockSource.getBlockNumber.mockResolvedValue(undefined); + setupRegistrationSuccess(); + // With no proven block number (undefined above), isEpochFullyProven bails out as "not proven"; + // getBlockData only needs to supply a lastArchive.root for collectRegisterData. + l2BlockSource.getBlockData.mockResolvedValue({ + header: { lastArchive: { root: Fr.ZERO } }, + } as any); + } + + /** + * Sets up everything `collectRegisterData` needs (world-state sync, L1→L2 message source, + * archive sibling-path snapshot). Tests that want to override `getBlockNumber` / + * `getBlockData` to drive `isEpochFullyProven` should call this and then set the + * proven-tip mocks themselves.
+ */ + function setupRegistrationSuccess() { + worldState.syncImmediate.mockResolvedValue(undefined as any); + l1ToL2MessageSource.getL1ToL2Messages.mockResolvedValue([]); + l2BlockSource.getBlockData.mockResolvedValue({ + header: { lastArchive: { root: Fr.ZERO } }, + } as any); + worldState.getSnapshot.mockReturnValue({ + getTreeInfo: () => Promise.resolve({ size: 1n }), + getSiblingPath: () => Promise.resolve({ toFields: () => [] }), + } as any); + } + + function makeCheckpoint( + checkpointNumber: number, + slot: number, + blockNumber: number, + archiveRoot: Fr = Fr.random(), + ): Checkpoint { + return { + number: CheckpointNumber(checkpointNumber), + header: { slotNumber: SlotNumber(slot) }, + archive: { root: archiveRoot }, + blocks: [{ number: blockNumber, header: { hash: () => Promise.resolve('0x01') } }], + } as unknown as Checkpoint; + } + + function makePublishedCheckpoint(checkpoint: Checkpoint): PublishedCheckpoint { + return { checkpoint, attestations: [] } as unknown as PublishedCheckpoint; } }); + +/** ProverNode subclass that exposes hooks for injecting a mocked SessionManager + reads. */ +class TestProverNode extends ProverNode { + public setSessionManager(sm: SessionManager): void { + this.sessionManager = sm; + } + + public clearSessionManager(): void { + this.sessionManager = undefined; + } + + public setPublishingService(svc: ProofPublishingService): void { + this.publishingService = svc; + } + + public getTipsStore() { + // tipsStore is private; reach in for the A-1041 assertion. 
+ + return (this as any).tipsStore; + } + + // ---------------- direct access for unit tests ---------------- + + public callComputeStartupState() { + return this.computeStartupState(); + } + + public callIsEpochFullyProven(epoch: EpochNumber, l1Constants: { epochDuration: number }) { + return this.isEpochFullyProven(epoch, l1Constants as any); + } + + public callIsProvenBlockLastOfItsEpoch( + provenBlock: BlockNumber, + provenEpoch: EpochNumber, + l1Constants: { epochDuration: number }, + ) { + return this.isProvenBlockLastOfItsEpoch(provenBlock, provenEpoch, l1Constants as any); + } + + public getLastExpiredEpoch(): EpochNumber | undefined { + return this.lastExpiredEpoch; + } +} diff --git a/yarn-project/prover-node/src/prover-node.ts b/yarn-project/prover-node/src/prover-node.ts index 89c26eef75bd..22c28e8c5bd4 100644 --- a/yarn-project/prover-node/src/prover-node.ts +++ b/yarn-project/prover-node/src/prover-node.ts @@ -2,19 +2,29 @@ import type { Archiver } from '@aztec/archiver'; import type { RollupContract } from '@aztec/ethereum/contracts'; import type { Delayer } from '@aztec/ethereum/l1-tx-utils'; import { BlockNumber, CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import { assertRequired, compact, pick, sum } from '@aztec/foundation/collection'; -import type { Fr } from '@aztec/foundation/curves/bn254'; +import { assertRequired, compact, pick } from '@aztec/foundation/collection'; import { memoize } from '@aztec/foundation/decorators'; import { createLogger } from '@aztec/foundation/log'; -import { DateProvider } from '@aztec/foundation/timer'; +import { DateProvider, executeTimeout } from '@aztec/foundation/timer'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import { getLastSiblingPath } from '@aztec/prover-client/helpers'; +import { ChonkCache } from '@aztec/prover-client/orchestrator'; import { PublicProcessorFactory } from '@aztec/simulator/server'; -import type { L2BlockSource } from 
'@aztec/stdlib/block'; -import type { Checkpoint } from '@aztec/stdlib/checkpoint'; +import { + type L2BlockSource, + L2BlockStream, + type L2BlockStreamEvent, + type L2BlockStreamEventHandler, + L2TipsMemoryStore, +} from '@aztec/stdlib/block'; +import type { Checkpoint, PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; import type { ChainConfig } from '@aztec/stdlib/config'; import type { ContractDataSource } from '@aztec/stdlib/contract'; -import { getProofSubmissionDeadlineTimestamp } from '@aztec/stdlib/epoch-helpers'; +import { type L1RollupConstants, getEpochAtSlot, getProofSubmissionDeadlineEpoch } from '@aztec/stdlib/epoch-helpers'; import { type EpochProverManager, + type EpochProvingJobState, + EpochProvingJobTerminalState, type ITxProvider, type ProverNodeApi, type Service, @@ -24,56 +34,79 @@ import { } from '@aztec/stdlib/interfaces/server'; import type { DataStoreConfig } from '@aztec/stdlib/kv-store'; import type { L1ToL2MessageSource } from '@aztec/stdlib/messaging'; -import type { Tx } from '@aztec/stdlib/tx'; +import { MerkleTreeId } from '@aztec/stdlib/trees'; import { - Attributes, L1Metrics, type TelemetryClient, type Traceable, type Tracer, getTelemetryClient, - trackSpan, } from '@aztec/telemetry-client'; import { uploadEpochProofFailure } from './actions/upload-epoch-proof-failure.js'; +import { CheckpointStore, type RegisterCheckpointData } from './checkpoint-store.js'; import type { SpecificProverNodeConfig } from './config.js'; -import type { EpochProvingJobData } from './job/epoch-proving-job-data.js'; -import { EpochProvingJob, type EpochProvingJobOptions, type EpochProvingJobState } from './job/epoch-proving-job.js'; +import type { EpochSession, EpochSessionHooks } from './job/epoch-session.js'; import { ProverNodeJobMetrics, ProverNodeRewardsMetrics } from './metrics.js'; -import type { EpochMonitor, EpochMonitorHandler } from './monitors/epoch-monitor.js'; -import type { ProverNodePublisher } from './prover-node-publisher.js'; 
+import { ProofPublishingService } from './proof-publishing-service.js'; import type { ProverPublisherFactory } from './prover-publisher-factory.js'; +import { SessionManager } from './session-manager.js'; type ProverNodeOptions = SpecificProverNodeConfig & Partial; type DataStoreOptions = Pick & Pick; /** - * An Aztec Prover Node is a standalone process that monitors the unfinalized chain on L1 for unproven epochs, - * fetches their txs from the p2p network or external nodes, re-executes their public functions, creates a rollup - * proof for the epoch, and submits it to L1. + * Grace period for the proof-publishing service to settle during shutdown. The service waits for + * any in-flight L1 proof-submission tx to finish; that tx can take a long time to mine, so we cap + * the wait rather than letting `stop()` hang indefinitely. */ -export class ProverNode implements EpochMonitorHandler, ProverNodeApi, Traceable { +const PUBLISHING_SERVICE_STOP_TIMEOUT_MS = 30_000; + +/** + * An Aztec Prover Node is a standalone process that monitors the chain for new checkpoints, + * starts proving them optimistically as they arrive, and submits epoch proofs to L1 once + * complete. + * + * The class is intentionally thin: it owns the long-lived collections (`CheckpointStore`, + * `ChonkCache`, `SessionManager`) and the L2BlockStream. The periodic tick that picks up + * newly-complete epochs belongs to the `SessionManager` and is only started from `start()` here. + * Every session lifecycle decision is delegated to the `SessionManager`. Each chain event is + * translated here into a single method call on it.
+ */ +export class ProverNode implements L2BlockStreamEventHandler, ProverNodeApi, Traceable { private log = createLogger('prover-node'); - private jobs: Map = new Map(); - private config: ProverNodeOptions; - private jobMetrics: ProverNodeJobMetrics; - private rewardsMetrics: ProverNodeRewardsMetrics; - private startingProofEpochs: Set = new Set(); + protected readonly checkpointStore: CheckpointStore; + protected readonly chonkCache: ChonkCache; + protected sessionManager: SessionManager | undefined; + + private readonly config: ProverNodeOptions; + private readonly jobMetrics: ProverNodeJobMetrics; + private readonly rewardsMetrics: ProverNodeRewardsMetrics; + + /** In-memory store for the L2BlockStream's local data provider. */ + private tipsStore: L2TipsMemoryStore; + /** Block stream for checkpoint and reorg detection. */ + private blockStream: L2BlockStream | undefined; + /** + * Highest epoch whose proof-submission window has passed. Monotonic high-water mark. + * Seeded from the last fully-proven epoch at start(); advanced on every block-stream + * event by comparing the archiver's latest synced L2 slot against each epoch's + * submission deadline. Protected so tests can verify the start() seeding. 
+ */ + protected lastExpiredEpoch: EpochNumber | undefined; public readonly tracer: Tracer; - protected publisher: ProverNodePublisher | undefined; + protected publishingService: ProofPublishingService | undefined; constructor( - protected readonly prover: EpochProverManager, + protected readonly prover: EpochProverManager & EpochProverFactory, protected readonly publisherFactory: ProverPublisherFactory, protected readonly l2BlockSource: L2BlockSource & Partial, protected readonly l1ToL2MessageSource: L1ToL2MessageSource, protected readonly contractDataSource: ContractDataSource, protected readonly worldState: WorldStateSynchronizer, protected readonly p2pClient: { getTxProvider(): ITxProvider } & Partial, - protected readonly epochsMonitor: EpochMonitor, protected readonly rollupContract: RollupContract, protected readonly l1Metrics: L1Metrics, config: Partial = {}, @@ -100,8 +133,33 @@ export class ProverNode implements EpochMonitorHandler, ProverNodeApi, Traceable this.tracer = telemetryClient.getTracer('ProverNode'); this.jobMetrics = new ProverNodeJobMetrics(meter, telemetryClient.getTracer('EpochProvingJob')); - this.rewardsMetrics = new ProverNodeRewardsMetrics(meter, this.prover.getProverId(), rollupContract); + + this.tipsStore = new L2TipsMemoryStore(this.l2BlockSource.getGenesisBlockHash()); + + this.chonkCache = new ChonkCache(this.log.getBindings()); + this.checkpointStore = new CheckpointStore( + this.l2BlockSource, + { + proverFactory: this.prover, + chonkCache: this.chonkCache, + publicProcessorFactory: new PublicProcessorFactory( + this.contractDataSource, + this.dateProvider, + this.telemetryClient, + this.log.getBindings(), + ), + dbProvider: this.worldState, + txProvider: this.p2pClient.getTxProvider(), + dateProvider: this.dateProvider, + proverId: this.prover.getProverId(), + metrics: this.jobMetrics, + txGatheringTimeoutMs: this.config.txGatheringTimeoutMs, + deadline: undefined, + }, + { slotWatcherPollIntervalMs: 
this.config.proverNodePollingIntervalMs }, + this.log.getBindings(), + ); } public getProverId() { @@ -112,60 +170,32 @@ export class ProverNode implements EpochMonitorHandler, ProverNodeApi, Traceable return this.p2pClient; } - /** Returns the shared tx delayer for prover L1 txs, if enabled. Test-only. */ + /** Test-only: the shared L1 tx delayer, if enabled. */ public getDelayer(): Delayer | undefined { return this.delayer; } - /** - * Handles an epoch being completed by starting a proof for it if there are no active jobs for it. - * @param epochNumber - The epoch number that was just completed. - * @returns false if there is an error, true otherwise - */ - async handleEpochReadyToProve(epochNumber: EpochNumber): Promise { - try { - this.log.debug(`Running jobs as ${epochNumber} is ready to prove`, { - jobs: Array.from(this.jobs.values()).map(job => `${job.getEpochNumber()}:${job.getId()}`), - }); - return await this.startProofIfNeeded(epochNumber); - } catch (err) { - if (err instanceof EmptyEpochError) { - this.log.info(`Not starting proof for ${epochNumber} since no blocks were found`); - } else { - this.log.error(`Error handling epoch completed`, err); - } - return false; - } + /** Observability summary for the ProverNodeApi. */ + public getJobs(): Promise<{ uuid: string; status: EpochProvingJobState; epochNumber: EpochNumber }[]> { + return Promise.resolve(this.sessionManager?.getJobs() ?? []); } - /** - * Starts the prover node so it periodically checks for unproven epochs in the unfinalized chain from L1 and - * starts proving jobs for them. - */ - async start() { - this.epochsMonitor.start(this); - await this.publisherFactory.start(); - this.publisher = await this.publisherFactory.create(); - await this.rewardsMetrics.start(); - this.l1Metrics.start(); - this.log.info(`Started Prover Node with prover id ${this.prover.getProverId().toString()}`, this.config); + /** Tests inspect this when validating reconcile behaviour. 
*/ + public getCheckpointStore(): CheckpointStore { + return this.checkpointStore; } - /** - * Stops the prover node and all its dependencies. - * Resources not owned by this node (shared with the parent aztec-node) are skipped. - */ - async stop() { - this.log.info('Stopping ProverNode'); - await this.epochsMonitor.stop(); - this.publisher?.interrupt(); - await Promise.all(Array.from(this.jobs.values()).map(job => job.stop())); - await this.prover.stop(); - await tryStop(this.publisherFactory); - this.rewardsMetrics.stop(); - this.l1Metrics.stop(); - await this.telemetryClient.stop(); - this.log.info('Stopped ProverNode'); + /** Tests inspect this to verify chonk-cache release semantics. */ + public getChonkCache(): ChonkCache { + return this.chonkCache; + } + + /** Tests inspect this when looking up live sessions. */ + public getSessionManager(): SessionManager { + if (!this.sessionManager) { + throw new Error('SessionManager not yet constructed — start() must be called first.'); + } + return this.sessionManager; } /** Returns world state status. */ @@ -179,264 +209,391 @@ export class ProverNode implements EpochMonitorHandler, ProverNodeApi, Traceable return this.l2BlockSource.getL2Tips(); } - /** - * Starts a proving process and returns immediately. - */ - public async startProof(epochNumber: EpochNumber) { - await this.startProofIfNeeded(epochNumber, { skipEpochCheck: true }); + /** Returns the underlying prover instance. 
*/ + public getProver() { + return this.prover; } - private async startProofIfNeeded(epochNumber: EpochNumber, opts: EpochProvingJobOptions = {}): Promise { - if (this.startingProofEpochs.has(epochNumber)) { - this.log.warn(`Not starting proof for ${epochNumber} since a proof is already being started for the epoch`, { - epochNumber, - }); - return false; + // ---------------- L2BlockStream handler ---------------- + + public async handleBlockStreamEvent(event: L2BlockStreamEvent): Promise { + switch (event.type) { + case 'chain-checkpointed': + await this.handleCheckpointEvent(event.checkpoint); + break; + case 'chain-pruned': + await this.handlePruneEvent(event.checkpoint); + break; + case 'chain-proven': + this.publishingService?.onChainProven(BlockNumber(event.block.number)); + break; + case 'chain-finalized': + case 'blocks-added': + break; } + // Expiry is driven by the archiver's latest synced L2 slot + await this.checkEpochExpiry(); + // Advance the local tips store only after the proving-side handling has succeeded. Any + // failure above propagates to the L2BlockStream (which logs and stops this poll pass) and + // skips this update, so the event is re-emitted on the next poll rather than skipped (A-1041). + await this.tipsStore.handleBlockStreamEvent(event); + } - this.startingProofEpochs.add(epochNumber); + /** Register a new checkpoint with the store and notify the session manager. 
*/ + private async handleCheckpointEvent(publishedCheckpoint: PublishedCheckpoint) { + const checkpoint = publishedCheckpoint.checkpoint; + const slotNumber = checkpoint.header.slotNumber; + const l1Constants = await this.getL1Constants(); + const epochNumber = getEpochAtSlot(slotNumber, l1Constants); - try { - const activeJobs = await this.activeJobsCoverEpoch(epochNumber); - if (activeJobs.length > 0) { - this.log.warn(`Not starting proof for ${epochNumber} since an active job already covers the epoch`, { - epochNumber, - activeJobs, - }); - return true; - } + if (await this.isEpochFullyProven(epochNumber, l1Constants)) { + this.log.debug(`Skipping checkpoint ${checkpoint.number} for already-proven epoch ${epochNumber}`); + return; + } - await this.startProofInternal(epochNumber, opts); - return true; - } finally { - this.startingProofEpochs.delete(epochNumber); + if (await this.isEpochPastProofSubmissionWindow(epochNumber, l1Constants)) { + this.log.debug( + `Skipping checkpoint ${checkpoint.number} for epoch ${epochNumber} past its proof-submission window`, + ); + return; } - } - private async startProofInternal(epochNumber: EpochNumber, opts: EpochProvingJobOptions = {}) { - const job = await this.createProvingJob(epochNumber, opts); - void this.runJob(job); + this.log.info(`New checkpoint ${checkpoint.number} for epoch ${epochNumber}`, { + checkpointNumber: checkpoint.number, + epochNumber, + slotNumber, + }); + + const registerData = await this.collectRegisterData(checkpoint, publishedCheckpoint.attestations); + await this.checkpointStore.addOrUpdate(checkpoint, registerData); + await this.sessionManager?.onCheckpointAdded(epochNumber); } - private async runJob(job: EpochProvingJob) { - const epochNumber = job.getEpochNumber(); - const ctx = { id: job.getId(), epochNumber, state: undefined as EpochProvingJobState | undefined }; + /** + * Gathers register-time data for a checkpoint: previous block header, L1-to-L2 messages, + * and the archive sibling path. 
+ */ + private async collectRegisterData( + checkpoint: Checkpoint, + attestations: PublishedCheckpoint['attestations'], + ): Promise { + const previousBlockNumber = BlockNumber(checkpoint.blocks[0].number - 1); + const previousBlockHeader = await this.gatherPreviousBlockHeader(previousBlockNumber); + const l1ToL2Messages = await this.l1ToL2MessageSource.getL1ToL2Messages(checkpoint.number); + const lastBlock = checkpoint.blocks.at(-1)!; + const lastBlockHash = await lastBlock.header.hash(); + await this.worldState.syncImmediate(lastBlock.number, lastBlockHash); + const previousArchiveSiblingPath = await getLastSiblingPath( + MerkleTreeId.ARCHIVE, + this.worldState.getSnapshot(previousBlockNumber), + ); + return { + attestations, + previousBlockHeader, + l1ToL2Messages, + previousArchiveSiblingPath, + }; + } - try { - await job.run(); - const state = job.getState(); - ctx.state = state; - - if (state === 'reorg') { - this.log.warn(`Running new job for epoch ${epochNumber} due to reorg`, ctx); - await this.createProvingJob(epochNumber); - } else if (state === 'failed') { - this.log.error(`Job for ${epochNumber} exited with state ${state}`, ctx); - await this.tryUploadEpochFailure(job); - } else { - this.log.verbose(`Job for ${epochNumber} exited with state ${state}`, ctx); - } - } catch (err) { - this.log.error(`Error proving epoch ${epochNumber}`, err, ctx); - } finally { - this.jobs.delete(job.getId()); + /** Mark every prover above the prune threshold as pruned and notify the session manager. 
*/ + private async handlePruneEvent(prunedCheckpoint: { number: CheckpointNumber; hash: string }) { + this.log.warn(`Chain pruned to checkpoint ${prunedCheckpoint.number}`, { prunedCheckpoint }); + const affected = this.checkpointStore.markPrunedAfter(prunedCheckpoint.number); + if (affected.length === 0) { + return; } + const l1Constants = await this.getL1Constants(); + const affectedEpochs = Array.from( + new Set(affected.map(p => Number(getEpochAtSlot(p.slotNumber, l1Constants)))), + ).map(n => EpochNumber(n)); + // The session manager cancels every affected session, which in turn calls + // publishingService.withdraw(uuid) for each candidate; no separate notification to the + // publishing service is needed. + await this.sessionManager?.onPrune(affectedEpochs); } - protected async tryUploadEpochFailure(job: EpochProvingJob) { - if (this.config.proverNodeFailedEpochStore) { - return await uploadEpochProofFailure( - this.config.proverNodeFailedEpochStore, - job.getId(), - job.getProvingData(), - this.l2BlockSource as Archiver, - this.worldState, - assertRequired(pick(this.config, 'l1ChainId', 'rollupVersion', 'dataDirectory')), - this.log, - ); + /** + * Returns true once the chain has advanced past the given epoch's proof-submission window. + * Used to ignore checkpoints whose epoch can no longer be proven in time — chiefly while the + * archiver replays old blocks after a restart. Compares the archiver's latest synced L2 slot + * against the epoch's submission-deadline epoch; conservatively returns false if the slot can't + * be read yet. 
+ */ + private async isEpochPastProofSubmissionWindow( + epochNumber: EpochNumber, + l1Constants: L1RollupConstants, + ): Promise { + const latestSlot = await this.l2BlockSource.getSyncedL2SlotNumber(); + if (latestSlot === undefined) { + return false; } + const latestEpoch = getEpochAtSlot(latestSlot, l1Constants); + return latestEpoch >= getProofSubmissionDeadlineEpoch(epochNumber, l1Constants); } /** - * Returns the prover instance. + * Compares the archiver's latest synced L2 slot against `lastExpiredEpoch` and, for each + * newly-expired epoch, releases the chonk-cache entries for its blocks and reaps any + * CheckpointProvers in the store. An epoch E is expired once the chain reaches the start + * of epoch `E + proofSubmissionEpochs + 1`. Silently no-ops if nothing has expired since + * the last check or the archiver's slot can't be read. */ - public getProver() { - return this.prover; + private async checkEpochExpiry(): Promise { + const latestSlot = await this.l2BlockSource.getSyncedL2SlotNumber(); + if (latestSlot === undefined) { + return; + } + const l1Constants = await this.getL1Constants(); + const latestEpoch = getEpochAtSlot(latestSlot, l1Constants); + const offset = l1Constants.proofSubmissionEpochs + 1; + if (latestEpoch < offset) { + return; + } + const newlyExpiredUpTo = EpochNumber(latestEpoch - offset); + const from = this.lastExpiredEpoch === undefined ? EpochNumber(0) : EpochNumber(this.lastExpiredEpoch + 1); + if (newlyExpiredUpTo < from) { + return; + } + for (let e = from; e <= newlyExpiredUpTo; e = EpochNumber(e + 1)) { + await this.expireEpoch(e); + } + this.lastExpiredEpoch = newlyExpiredUpTo; } /** - * Returns an array of jobs being processed. + * Releases chonk-cache entries for every block in the supplied epoch (best-effort) and + * reaps every CheckpointProver in the store whose epoch number matches. 
*/ - public getJobs(): Promise<{ uuid: string; status: EpochProvingJobState; epochNumber: EpochNumber }[]> { - return Promise.resolve(this.getJobsInternal()); + private async expireEpoch(epoch: EpochNumber): Promise { + try { + const blocks = await this.l2BlockSource.getBlocks({ epoch, onlyCheckpointed: true }); + if (blocks.length > 0) { + this.chonkCache.releaseForBlocks(blocks); + } + } catch (err) { + this.log.warn(`Could not release chonk-cache entries for expired epoch ${epoch}`, err); + } + this.checkpointStore.reapExpired(epoch); } - private getJobsInternal(): { uuid: string; status: EpochProvingJobState; epochNumber: EpochNumber }[] { - return Array.from(this.jobs.entries()).map(([uuid, job]) => ({ - uuid, - status: job.getState(), - epochNumber: job.getEpochNumber(), - })); - } + // ---------------- public API ---------------- - private async activeJobsCoverEpoch(epochNumber: EpochNumber): Promise { - const checkpoints = await this.l2BlockSource.getCheckpointsData({ epoch: epochNumber }); - if (checkpoints.length === 0) { - return []; + /** + * Schedules proving for the given epoch and returns the job id without waiting for completion. 
+ */ + public async startProof(epochNumber: EpochNumber): Promise { + if (!this.sessionManager) { + throw new Error('ProverNode not started'); } + return await this.sessionManager.startProof(epochNumber); + } - const firstCheckpoint = checkpoints.at(0)!.checkpointNumber; - const latestCheckpoint = checkpoints.at(-1)!.checkpointNumber; + // ---------------- Service lifecycle ---------------- - const jobs: string[] = []; - for (const job of this.jobs.values()) { - if (job.getEpochNumber() !== epochNumber) { - continue; - } - - const jobCheckpoints = job.getProvingData().checkpoints; - const checkpointOverlap = - jobCheckpoints.at(0)!.number <= firstCheckpoint && jobCheckpoints.at(-1)!.number >= latestCheckpoint; + async start() { + await this.checkpointStore.start(); - if (checkpointOverlap && !['failed', 'stopped', 'timed-out'].includes(job.getState())) { - jobs.push(job.getId()); - } - } + await this.publisherFactory.start(); + this.publishingService = new ProofPublishingService({ + publisherFactory: this.publisherFactory, + l2BlockSource: this.l2BlockSource, + dateProvider: this.dateProvider, + config: { skipSubmitProof: !!this.config.proverNodeDisableProofPublish }, + bindings: this.log.getBindings(), + }); + this.sessionManager = this.createSessionManager(this.publishingService); + // SessionManager owns its own periodic tick; start it here so it begins picking up + // epochs that become complete by time (no fresh checkpoint event) and advances once + // the previous epoch is proven on L1. + this.sessionManager.start(); + // Now that the store + manager exist, arm the live-state observable gauges. 
+ this.jobMetrics.observeState(this.checkpointStore, this.sessionManager); + + const { startingBlock, lastFullyProvenEpoch } = await this.computeStartupState(); + this.lastExpiredEpoch = lastFullyProvenEpoch; + this.blockStream = new L2BlockStream(this.l2BlockSource, this.tipsStore, this, this.log, { + pollIntervalMS: this.config.proverNodePollingIntervalMs, + startingBlock, + }); + this.blockStream.start(); - return jobs; + await this.rewardsMetrics.start(); + this.l1Metrics.start(); + this.log.info(`Started Prover Node with prover id ${this.prover.getProverId().toString()}`, this.config); } - private checkMaximumPendingJobs() { - const { proverNodeMaxPendingJobs: maxPendingJobs } = this.config; - if (maxPendingJobs > 0 && this.jobs.size >= maxPendingJobs) { - throw new Error(`Maximum pending proving jobs ${maxPendingJobs} reached. Cannot create new job.`); + async stop() { + this.log.info('Stopping ProverNode'); + this.jobMetrics.stopObservingState(); + await this.blockStream?.stop(); + if (this.sessionManager) { + await this.sessionManager.stop(); + } + if (this.publishingService) { + // Bound the wait: the publishing service blocks until any in-flight L1 proof-submission tx + // settles, which can outlast a reasonable shutdown window. On timeout we log and move on — + // the tx may still mine, but shutdown must not hang on it. 
+ const publishingService = this.publishingService; + await executeTimeout( + () => publishingService.stop(), + PUBLISHING_SERVICE_STOP_TIMEOUT_MS, + 'prover-node publishing-service stop', + ).catch(err => this.log.warn(`Timed out stopping proof publishing service`, err)); } + await this.checkpointStore.stop(); + this.chonkCache.stop(); + await this.prover.stop(); + await tryStop(this.publisherFactory); + this.rewardsMetrics.stop(); + this.l1Metrics.stop(); + await this.telemetryClient.stop(); + this.log.info('Stopped ProverNode'); } - @trackSpan('ProverNode.createProvingJob', epochNumber => ({ [Attributes.EPOCH_NUMBER]: epochNumber })) - private async createProvingJob(epochNumber: EpochNumber, opts: { skipEpochCheck?: boolean } = {}) { - this.checkMaximumPendingJobs(); - - this.publisher = await this.publisherFactory.create(); - - // Gather all data for this epoch - const epochData = await this.gatherEpochData(epochNumber); - const fromCheckpoint = epochData.checkpoints[0].number; - const toCheckpoint = epochData.checkpoints.at(-1)!.number; - const fromBlock = epochData.checkpoints[0].blocks[0].number; - const lastBlock = epochData.checkpoints.at(-1)!.blocks.at(-1)!; - const toBlock = lastBlock.number; - this.log.verbose( - `Creating proving job for epoch ${epochNumber} for checkpoint range ${fromCheckpoint} to ${toCheckpoint} and block range ${fromBlock} to ${toBlock}`, - ); + /** + * Constructs the session manager. Extracted so subclasses (test harness) can swap + * the implementation. Wired to `tryUploadSessionFailure` so failed sessions get + * their proving data uploaded. 
+ */ + protected createSessionManager(publishingService: ProofPublishingService): SessionManager { + return new SessionManager({ + checkpointStore: this.checkpointStore, + l2BlockSource: this.l2BlockSource, + proverFactory: this.prover, + proverId: this.prover.getProverId(), + publishingService, + metrics: this.jobMetrics, + dateProvider: this.dateProvider, + config: { + maxPendingJobs: this.config.proverNodeMaxPendingJobs, + tickIntervalMs: this.config.proverNodePollingIntervalMs, + finalizationDelayMs: this.config.proverNodeEpochProvingDelayMs, + }, + onSessionFailed: async session => { + await this.tryUploadSessionFailure(session); + }, + bindings: this.log.getBindings(), + }); + } - // Fast forward world state to right before the target block and get a fork - const lastBlockHash = await lastBlock.header.hash(); - await this.worldState.syncImmediate(toBlock, lastBlockHash); + /** + * Installs session hooks for the e2e harness to interpose around top-tree proving + * (gate, override, or observe it) without monkey-patching the orchestrator factory. + * Applies to every session constructed after this call. + */ + public setSessionHooks(hooks: EpochSessionHooks): void { + if (!this.sessionManager) { + throw new Error('ProverNode not started; call start() before setting session hooks.'); + } + this.sessionManager.setSessionHooks(hooks); + } - // Create a processor factory - const publicProcessorFactory = new PublicProcessorFactory( - this.contractDataSource, - this.dateProvider, - this.telemetryClient, - this.log.getBindings(), + /** Uploads failure snapshots when sessions exit with `failed`. Exposed as a method so tests can spy on it. 
*/ + public async tryUploadSessionFailure(session: EpochSession): Promise { + if (!this.config.proverNodeFailedEpochStore) { + return undefined; + } + const data = SessionManager.buildSessionProvingData(session); + return await uploadEpochProofFailure( + this.config.proverNodeFailedEpochStore, + session.getId(), + data, + this.l2BlockSource as Archiver, + this.worldState, + assertRequired(pick(this.config, 'l1ChainId', 'rollupVersion', 'dataDirectory')), + this.log, ); - - // Set deadline for this job to run. It will abort if it takes too long. - const deadlineTs = getProofSubmissionDeadlineTimestamp(epochNumber, await this.getL1Constants()); - const deadline = new Date(Number(deadlineTs) * 1000); - const job = this.doCreateEpochProvingJob(epochData, deadline, publicProcessorFactory, this.publisher, opts); - this.jobs.set(job.getId(), job); - return job; } + // ---------------- helpers ---------------- + @memoize - private getL1Constants() { + private getL1Constants(): Promise { return this.l2BlockSource.getL1Constants(); } - @trackSpan('ProverNode.gatherEpochData', epochNumber => ({ [Attributes.EPOCH_NUMBER]: epochNumber })) - private async gatherEpochData(epochNumber: EpochNumber): Promise { - const publishedCheckpoints = await this.l2BlockSource.getCheckpoints({ epoch: epochNumber }); - if (publishedCheckpoints.length === 0) { - throw new EmptyEpochError(epochNumber); + /** + * Returns true if every block in the given epoch is proven on L1. An epoch is only + * fully proven when its *last* block is proven. Protected for direct unit-test access. + */ + protected async isEpochFullyProven( + epochNumber: EpochNumber, + l1Constants: Pick, + ): Promise { + const provenBlockNumber = await this.l2BlockSource.getBlockNumber({ tag: 'proven' }); + if (!provenBlockNumber || provenBlockNumber <= 0) { + return false; } - const checkpoints = publishedCheckpoints.map(p => p.checkpoint); - const attestations = publishedCheckpoints.at(-1)?.attestations ?? 
[]; - const txArray = await this.gatherTxs(epochNumber, checkpoints); - const txs = new Map(txArray.map(tx => [tx.getTxHash().toString(), tx])); - const l1ToL2Messages = await this.gatherMessages(epochNumber, checkpoints); - const [firstBlock] = checkpoints[0].blocks; - const previousBlockHeader = await this.gatherPreviousBlockHeader(epochNumber, firstBlock.number - 1); - - return { checkpoints, txs, l1ToL2Messages, epochNumber, previousBlockHeader, attestations }; + const provenHeader = (await this.l2BlockSource.getBlockData({ number: BlockNumber(provenBlockNumber) }))?.header; + if (!provenHeader) { + return false; + } + const provenEpoch = getEpochAtSlot(provenHeader.getSlot(), l1Constants); + if (epochNumber < provenEpoch) { + return true; + } + if (epochNumber > provenEpoch) { + return false; + } + return this.isProvenBlockLastOfItsEpoch(BlockNumber(provenBlockNumber), provenEpoch, l1Constants); } - private async gatherTxs(epochNumber: EpochNumber, checkpoints: Checkpoint[]) { - const deadline = new Date(this.dateProvider.now() + this.config.txGatheringTimeoutMs); - const txProvider = this.p2pClient.getTxProvider(); - const blocks = checkpoints.flatMap(checkpoint => checkpoint.blocks); - const txsByBlock = await Promise.all(blocks.map(block => txProvider.getTxsForBlock(block, { deadline }))); - const txs = txsByBlock.map(({ txs }) => txs).flat(); - const missingTxs = txsByBlock.map(({ missingTxs }) => missingTxs).flat(); - - if (missingTxs.length === 0) { - this.log.verbose(`Gathered all ${txs.length} txs for epoch ${epochNumber}`, { epochNumber }); - return txs; + /** Protected for direct unit-test access. 
*/ + protected async isProvenBlockLastOfItsEpoch( + provenBlockNumber: BlockNumber, + provenEpoch: EpochNumber, + l1Constants: Pick, + ): Promise { + const nextHeader = (await this.l2BlockSource.getBlockData({ number: BlockNumber(provenBlockNumber + 1) }))?.header; + if (nextHeader) { + return getEpochAtSlot(nextHeader.getSlot(), l1Constants) > provenEpoch; } - - throw new Error(`Txs not found for epoch ${epochNumber}: ${missingTxs.map(hash => hash.toString()).join(', ')}`); + return this.l2BlockSource.isEpochComplete(provenEpoch); } - private async gatherMessages(epochNumber: EpochNumber, checkpoints: Checkpoint[]) { - const messages = await Promise.all(checkpoints.map(c => this.l1ToL2MessageSource.getL1ToL2Messages(c.number))); - const messageCount = sum(messages.map(m => m.length)); - this.log.verbose(`Gathered all ${messageCount} messages for epoch ${epochNumber}`, { epochNumber }); - const messagesByCheckpoint: Record = {}; - for (let i = 0; i < checkpoints.length; i++) { - messagesByCheckpoint[checkpoints[i].number] = messages[i]; + /** + * Resolves the L2BlockStream's starting block and the last fully-proven epoch in one + * pass. The starting block is the first block of the next unproven epoch (or the start + * of the partially-proven epoch if the proven tip falls mid-epoch). The fully-proven + * epoch is `provenEpoch` when the proven tip is the last block of its epoch, otherwise + * `provenEpoch - 1`, or `undefined` if no block is proven yet. 
+ */ + protected async computeStartupState(): Promise<{ + startingBlock: BlockNumber; + lastFullyProvenEpoch: EpochNumber | undefined; + }> { + const provenBlockNumber = await this.l2BlockSource.getBlockNumber({ tag: 'proven' }); + if (!provenBlockNumber || provenBlockNumber <= 0) { + return { startingBlock: BlockNumber(1), lastFullyProvenEpoch: undefined }; + } + const l1Constants = await this.getL1Constants(); + const provenHeader = (await this.l2BlockSource.getBlockData({ number: BlockNumber(provenBlockNumber) }))?.header; + if (!provenHeader) { + return { startingBlock: BlockNumber(provenBlockNumber + 1), lastFullyProvenEpoch: undefined }; } - return messagesByCheckpoint; + const provenEpoch = getEpochAtSlot(provenHeader.getSlot(), l1Constants); + if (await this.isProvenBlockLastOfItsEpoch(BlockNumber(provenBlockNumber), provenEpoch, l1Constants)) { + return { startingBlock: BlockNumber(provenBlockNumber + 1), lastFullyProvenEpoch: provenEpoch }; + } + const epochCheckpoints = await this.l2BlockSource.getCheckpointsData({ epoch: provenEpoch }); + const firstBlockOfEpoch = + epochCheckpoints.length > 0 ? epochCheckpoints[0].startBlock : BlockNumber(provenBlockNumber); + this.log.info( + `Starting L2BlockStream at block ${firstBlockOfEpoch} (start of partially-proven epoch ${provenEpoch})`, + { provenBlockNumber, provenEpoch, firstBlockOfEpoch }, + ); + const lastFullyProvenEpoch = provenEpoch > 0 ? 
EpochNumber(provenEpoch - 1) : undefined; + return { startingBlock: firstBlockOfEpoch, lastFullyProvenEpoch }; } - private async gatherPreviousBlockHeader(epochNumber: EpochNumber, previousBlockNumber: number) { + private async gatherPreviousBlockHeader(previousBlockNumber: number) { const data = await this.l2BlockSource.getBlockData({ number: BlockNumber(previousBlockNumber) }); if (!data?.header) { - throw new Error(`Previous block header ${previousBlockNumber} not found for proving epoch ${epochNumber}`); + throw new Error(`Previous block header ${previousBlockNumber} not found`); } - - this.log.verbose(`Gathered previous block header ${data.header.getBlockNumber()} for epoch ${epochNumber}`); return data.header; } - /** Extracted for testing purposes. */ - protected doCreateEpochProvingJob( - data: EpochProvingJobData, - deadline: Date | undefined, - publicProcessorFactory: PublicProcessorFactory, - publisher: ProverNodePublisher, - opts: { skipEpochCheck?: boolean } = {}, - ) { - const { proverNodeMaxParallelBlocksPerEpoch: parallelBlockLimit, proverNodeDisableProofPublish } = this.config; - return new EpochProvingJob( - data, - this.worldState, - this.prover.createEpochProver(), - publicProcessorFactory, - publisher, - this.l2BlockSource, - this.jobMetrics, - deadline, - { parallelBlockLimit, skipSubmitProof: proverNodeDisableProofPublish, ...opts }, - this.log.getBindings(), - ); - } - - /** Extracted for testing purposes. */ - protected async triggerMonitors() { - await this.epochsMonitor.work(); - } - private validateConfig() { if ( this.config.proverNodeFailedEpochStore && @@ -453,9 +610,5 @@ export class ProverNode implements EpochMonitorHandler, ProverNodeApi, Traceable } } -class EmptyEpochError extends Error { - constructor(epochNumber: EpochNumber) { - super(`No blocks found for epoch ${epochNumber}`); - this.name = 'EmptyEpochError'; - } -} +// Re-export so handlers can compare states externally. 
+export { EpochProvingJobTerminalState }; diff --git a/yarn-project/prover-node/src/session-manager.test.ts b/yarn-project/prover-node/src/session-manager.test.ts new file mode 100644 index 000000000000..c907cf8b3151 --- /dev/null +++ b/yarn-project/prover-node/src/session-manager.test.ts @@ -0,0 +1,851 @@ +import { BlockNumber, EpochNumber, SlotNumber } from '@aztec/foundation/branded-types'; +import { EthAddress } from '@aztec/foundation/eth-address'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; +import { DateProvider } from '@aztec/foundation/timer'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import { EmptyL1RollupConstants } from '@aztec/stdlib/epoch-helpers'; +import type { EpochProvingJobState } from '@aztec/stdlib/interfaces/server'; + +import { mock } from 'jest-mock-extended'; + +import type { CheckpointStore } from './checkpoint-store.js'; +import { CheckpointProver } from './job/checkpoint-prover.js'; +import { EpochSession, type SessionSpec } from './job/epoch-session.js'; +import { ProverNodeJobMetrics } from './metrics.js'; +import type { ProofPublishingService } from './proof-publishing-service.js'; +import { SessionManager, type SessionManagerDeps } from './session-manager.js'; + +describe('SessionManager', () => { + // Two-slot epochs let a single epoch hold canonical checkpoints at distinct slots. + // Epoch N covers slots [N*2, N*2+1]: epoch 3 → [6, 7], epoch 4 → [8, 9], epoch 7 → [14, 15]. + const l1Constants = { ...EmptyL1RollupConstants, epochDuration: 2 }; + + let store: ReturnType>; + let l2BlockSource: ReturnType< + typeof mock< + Pick + > + >; + let publishingService: ReturnType>; + let metrics: ProverNodeJobMetrics; + + /** Mirror of fullSessions/partialSessions whose entries are stubs we control. */ + let stubs: StubSession[]; + /** Resolves whenever the manager constructs a stub session. 
*/ + let onConstruct: ((stub: StubSession) => void) | undefined; + + let manager: TestSessionManager; + + beforeEach(() => { + store = mock(); + l2BlockSource = + mock< + Pick + >(); + publishingService = mock(); + metrics = new ProverNodeJobMetrics( + // Minimal Meter stub: every meter.create* returns an object with a no-op record. + { createHistogram: noopMetric, createGauge: noopMetric, createCounter: noopMetric } as any, + { startActiveSpan: (_n: string, fn: any) => fn({ end: () => {} }) } as any, + ); + l2BlockSource.getL1Constants.mockResolvedValue(l1Constants); + l2BlockSource.isEpochComplete.mockResolvedValue(false); + l2BlockSource.getCheckpoints.mockResolvedValue([]); + store.listCanonicalInSlotRange.mockReturnValue([]); + store.listCanonicalForEpoch.mockResolvedValue([]); + + stubs = []; + onConstruct = undefined; + + manager = new TestSessionManager( + { + checkpointStore: store, + l2BlockSource, + proverFactory: {} as any, + proverId: EthAddress.ZERO, + publishingService, + metrics, + dateProvider: new DateProvider(), + config: { maxPendingJobs: 0, tickIntervalMs: 60_000, finalizationDelayMs: undefined }, + }, + (spec, provers) => { + const stub = makeStubSession(spec, provers); + stubs.push(stub); + onConstruct?.(stub); + return stub as unknown as EpochSession; + }, + ); + }); + + afterEach(async () => { + // Resolve any stub session that's still waiting so manager.stop() can drain. 
+ for (const stub of stubs) { + stub.terminate('cancelled'); + } + await manager.stop(); + }); + + // ---------------- read views ---------------- + + it('getJobs returns empty when no sessions exist', () => { + expect(manager.getJobs()).toEqual([]); + }); + + it('getJobs reports every live session', async () => { + await openCanonicalFullSession(EpochNumber(5), [proverWithSlot(10)]); + const jobs = manager.getJobs(); + expect(jobs.length).toBe(1); + expect(jobs[0].epochNumber).toEqual(EpochNumber(5)); + expect(jobs[0].status).toBe('awaiting-checkpoints'); + }); + + // ---------------- opening full sessions ---------------- + + it('does not open a full session when the epoch is incomplete on L1', async () => { + const epoch = EpochNumber(3); + l2BlockSource.isEpochComplete.mockResolvedValue(false); + await manager.onCheckpointAdded(epoch); + expect(stubs.length).toBe(0); + expect(manager.getFullSession(epoch)).toBeUndefined(); + }); + + it('does not open a full session when archiver checkpoints are not all in the store', async () => { + const epoch = EpochNumber(3); + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6), archiverCp(2, 7)]); + // Store only has checkpoint 1. + store.listCanonicalInSlotRange.mockReturnValue([proverForCheckpoint(1, 6)]); + await manager.onCheckpointAdded(epoch); + expect(stubs.length).toBe(0); + expect(manager.getFullSession(epoch)).toBeUndefined(); + }); + + it('opens a full session when epoch complete + store fully covered', async () => { + const epoch = EpochNumber(3); + // Two canonical checkpoints at distinct slots within epoch 3's range [6, 7]. 
+ const provers = [proverForCheckpoint(1, 6), proverForCheckpoint(2, 7)]; + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6), archiverCp(2, 7)]); + store.listCanonicalInSlotRange.mockReturnValue(provers); + + await manager.onCheckpointAdded(epoch); + + expect(stubs.length).toBe(1); + expect(stubs[0].spec).toEqual({ kind: 'full', epochNumber: epoch, fromSlot: SlotNumber(6), toSlot: SlotNumber(7) }); + expect(stubs[0].provers).toEqual(provers); + expect(manager.getFullSession(epoch)).toBe(stubs[0] as unknown as EpochSession); + }); + + it('does not open a duplicate full session if one already exists', async () => { + const epoch = EpochNumber(3); + await openCanonicalFullSession(epoch, [proverForCheckpoint(1, 6)]); + expect(stubs.length).toBe(1); + await manager.onCheckpointAdded(epoch); + expect(stubs.length).toBe(1); + }); + + it('respects maxPendingJobs when opening full sessions', async () => { + manager = new TestSessionManager( + { + checkpointStore: store, + l2BlockSource, + proverFactory: {} as any, + proverId: EthAddress.ZERO, + publishingService, + metrics, + dateProvider: new DateProvider(), + config: { maxPendingJobs: 1, tickIntervalMs: 60_000, finalizationDelayMs: undefined }, + }, + (spec, provers) => { + const stub = makeStubSession(spec, provers); + stubs.push(stub); + return stub as unknown as EpochSession; + }, + ); + + // Persistent mock implementations keyed by slot so reconcile's invariant checks + // see consistent content for each session across multiple events. Epoch 3 → slot 6, + // epoch 4 → slot 8 (each epoch's first slot under epochDuration=2). + const epoch3Provers = [proverForCheckpoint(1, 6)]; + const epoch4Provers = [proverForCheckpoint(2, 8)]; + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockImplementation(({ epoch }: { epoch: EpochNumber }) => + Promise.resolve( + (Number(epoch) === 3 ? 
epoch3Provers : Number(epoch) === 4 ? epoch4Provers : []).map( + p => ({ checkpoint: p.checkpoint }) as any, + ), + ), + ); + store.listCanonicalInSlotRange.mockImplementation((fromSlot: SlotNumber) => { + if (Number(fromSlot) === 6) { + return epoch3Provers; + } + if (Number(fromSlot) === 8) { + return epoch4Provers; + } + return []; + }); + + await manager.onCheckpointAdded(EpochNumber(3)); + expect(stubs.length).toBe(1); + // At the cap — second epoch is skipped. + await manager.onCheckpointAdded(EpochNumber(4)); + expect(stubs.length).toBe(1); + expect(manager.getFullSession(EpochNumber(4))).toBeUndefined(); + }); + + // ---------------- onTick ---------------- + + it('onTick opens a full session for the next unproven epoch', async () => { + // Proven tip at block 2; block 3 (first unproven) sits at slot 6, which is in epoch 3 + // under epochDuration=2. + mockNextUnprovenSlot(2, 6); + const provers = [proverForCheckpoint(1, 6)]; + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6)]); + store.listCanonicalInSlotRange.mockReturnValue(provers); + + await manager.onTick(); + expect(manager.getFullSession(EpochNumber(3))).toBeDefined(); + }); + + it('onTick does nothing when the next checkpoint to prove is not yet in the store', async () => { + mockNextUnprovenSlot(2, undefined); + l2BlockSource.isEpochComplete.mockResolvedValue(true); + await manager.onTick(); + expect(stubs.length).toBe(0); + }); + + it('onTick does not open a session when the next epoch is incomplete', async () => { + mockNextUnprovenSlot(2, 6); + l2BlockSource.isEpochComplete.mockResolvedValue(false); + await manager.onTick(); + expect(stubs.length).toBe(0); + expect(manager.getFullSession(EpochNumber(3))).toBeUndefined(); + }); + + it('onTick does not re-open a session that already exists', async () => { + mockNextUnprovenSlot(2, 6); + const provers = [proverForCheckpoint(1, 6)]; + 
l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6)]); + store.listCanonicalInSlotRange.mockReturnValue(provers); + + await manager.onTick(); + expect(stubs.length).toBe(1); + // A second tick with the same proven height must not open a duplicate. + await manager.onTick(); + expect(stubs.length).toBe(1); + }); + + it('onTick does not retry an epoch whose session already terminated', async () => { + // The tick attempts each epoch at most once; a failed proving attempt must not be + // resubmitted by a later tick (only a new checkpoint event reopens it). Without the + // high-water mark the reaped session would be reopened, resubmitting the proof. + mockNextUnprovenSlot(2, 6); + const provers = [proverForCheckpoint(1, 6)]; + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6)]); + store.listCanonicalInSlotRange.mockReturnValue(provers); + + await manager.onTick(); + expect(stubs.length).toBe(1); + + // Session fails. Proven height has not advanced, so the next tick reaps the failed + // session via recreateInvalidSessions (always called in reconcile) but the + // lastTickEpoch high-water mark prevents resubmission. + stubs[0].terminate('failed'); + await manager.onTick(); + expect(manager.getFullSession(EpochNumber(3))).toBeUndefined(); + expect(stubs.length).toBe(1); + }); + + it('onTick keeps retrying the same epoch while a transient blocker prevents opening', async () => { + // The archiver is still indexing — getCheckpoints returns a checkpoint we don't yet + // have in the store. openFullSessionIfReady should bail without creating a session, + // and the next tick must try again rather than skip the epoch. 
+ mockNextUnprovenSlot(2, 6); + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockResolvedValue([archiverCp(1, 6)]); + store.listCanonicalInSlotRange.mockReturnValue([]); // store hasn't indexed it yet + + await manager.onTick(); + expect(stubs.length).toBe(0); // no session created + await manager.onTick(); + expect(stubs.length).toBe(0); // still no session — the tick keeps trying + + // Archiver catches up; the next tick succeeds. + store.listCanonicalInSlotRange.mockReturnValue([proverForCheckpoint(1, 6)]); + await manager.onTick(); + expect(stubs.length).toBe(1); + expect(manager.getFullSession(EpochNumber(3))).toBe(stubs[0] as unknown as EpochSession); + }); + + // ---------------- session invalidation on prune ---------------- + + it('cancels and recreates a session whose canonical content changed', async () => { + const epoch = EpochNumber(3); + // Two checkpoints at distinct slots within epoch 3's range [6, 7]. + const initial = [proverForCheckpoint(1, 6), proverForCheckpoint(2, 7)]; + await openCanonicalFullSession(epoch, initial); + const original = stubs[0]; + + // Now the store reports only the first prover. 
+ store.listCanonicalInSlotRange.mockReturnValue([initial[0]]); + await manager.onPrune([epoch]); + + expect(original.cancelled).toBe(true); + expect(original.state).toBe('cancelled'); + expect(original.isTerminal()).toBe(true); + expect(original.cancelReasons).toEqual(['canonical content changed']); + expect(stubs.length).toBe(2); + const recreated = stubs[1]; + expect(recreated.provers).toEqual([initial[0]]); + expect(recreated.spec).toEqual(original.spec); // same slot range, fresh prover set + expect(recreated.state).toBe('awaiting-checkpoints'); + expect(recreated.isTerminal()).toBe(false); + expect(recreated.uuid).not.toBe(original.uuid); + expect(manager.getFullSession(epoch)).toBe(recreated as unknown as EpochSession); + }); + + it('drops a session and does not recreate when canonical content goes empty', async () => { + const epoch = EpochNumber(3); + await openCanonicalFullSession(epoch, [proverForCheckpoint(1, 6)]); + const original = stubs[0]; + + store.listCanonicalInSlotRange.mockReturnValue([]); + await manager.onPrune([epoch]); + + expect(original.cancelled).toBe(true); + expect(original.state).toBe('cancelled'); + expect(original.cancelReasons).toEqual(['canonical content changed']); + expect(manager.getFullSession(epoch)).toBeUndefined(); + expect(stubs.length).toBe(1); + }); + + it('reopens an epoch session after all its checkpoints are pruned and then re-added', async () => { + // The race flagged in review: a reorg removes every checkpoint of an epoch, then new + // ones arrive. Hitting the empty state first is benign — the session is dropped with + // no error, and the re-add opens a fresh session for the same epoch. + const epoch = EpochNumber(3); + const prover = proverForCheckpoint(1, 6); + await openCanonicalFullSession(epoch, [prover]); + const original = stubs[0]; + + // Reorg removes every checkpoint of the epoch → session dropped, not recreated. 
+ store.listCanonicalInSlotRange.mockReturnValue([]); + await manager.onPrune([epoch]); + expect(original.cancelled).toBe(true); + expect(original.state).toBe('cancelled'); + expect(manager.getFullSession(epoch)).toBeUndefined(); + + // A new checkpoint for the same epoch arrives → a fresh session opens. + await openCanonicalFullSession(epoch, [prover]); + const recreated = manager.getFullSession(epoch) as unknown as StubSession | undefined; + expect(recreated).toBeDefined(); + expect(recreated).not.toBe(original); + expect(recreated!.uuid).not.toBe(original.uuid); + expect(recreated!.getCheckpoints()).toEqual([prover]); + expect(recreated!.state).toBe('awaiting-checkpoints'); + expect(recreated!.isTerminal()).toBe(false); + expect(stubs.length).toBe(2); + }); + + it('drops terminal sessions on the next reconcile', async () => { + const epoch = EpochNumber(3); + await openCanonicalFullSession(epoch, [proverForCheckpoint(1, 6)]); + const original = stubs[0]; + original.terminate('completed'); + // Trigger a reconcile. + await manager.onTick(); + expect(manager.getFullSession(epoch)).toBeUndefined(); + expect(stubs.length).toBe(1); // no replacement constructed + // Terminal-drop path is quiet: the manager does NOT call cancel on an already-terminal + // session, because the cancel is redundant. 
+ expect(original.cancelReasons).toEqual([]); + expect(original.cancelled).toBe(false); // cancel() was never invoked; state is 'completed' + expect(original.state).toBe('completed'); + }); + + // ---------------- partial-session cleanup ---------------- + + it('cancels and recreates a partial session whose canonical content changed', async () => { + const epoch = EpochNumber(7); + const initial = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(initial); + store.listCanonicalInSlotRange.mockReturnValue(initial); + + const stubPromise = awaitNextStub(); + const startPromise = manager.startProof(epoch); + const original = await stubPromise; + expect(original.spec.kind).toBe('partial'); + expect(original.provers).toEqual(initial); + + // The store now reports a different prover at the same slot. + const swapped = [proverForCheckpoint(2, 14)]; + store.listCanonicalInSlotRange.mockReturnValue(swapped); + + const recreatePromise = awaitNextStub(); + await manager.onTick(); + const recreated = await recreatePromise; + + expect(original.cancelled).toBe(true); + expect(original.state).toBe('cancelled'); + expect(original.cancelReasons).toEqual(['canonical content changed']); + expect(recreated.spec).toEqual(original.spec); + expect(recreated.provers).toEqual(swapped); + expect(recreated.state).toBe('awaiting-checkpoints'); + expect(recreated.uuid).not.toBe(original.uuid); + expect(manager.getPartialSession(original.spec)).toBe(recreated as unknown as EpochSession); + expect(stubs).toHaveLength(2); + + // startProof resolves with the scheduled job id as soon as the session is constructed. 
+ await startPromise; + }); + + it('drops a partial session and does not recreate when canonical content goes empty', async () => { + const epoch = EpochNumber(7); + const initial = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(initial); + store.listCanonicalInSlotRange.mockReturnValue(initial); + + const stubPromise = awaitNextStub(); + const startPromise = manager.startProof(epoch); + const original = await stubPromise; + + store.listCanonicalInSlotRange.mockReturnValue([]); + await manager.onTick(); + + expect(original.cancelled).toBe(true); + expect(original.state).toBe('cancelled'); + expect(original.cancelReasons).toEqual(['canonical content changed']); + expect(manager.getPartialSession(original.spec)).toBeUndefined(); + expect(stubs).toHaveLength(1); // no replacement constructed + await startPromise; + }); + + it('drops terminal partial sessions on the next reconcile', async () => { + const epoch = EpochNumber(7); + const canonical = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(canonical); + store.listCanonicalInSlotRange.mockReturnValue(canonical); + + const stubPromise = awaitNextStub(); + const startPromise = manager.startProof(epoch); + const partial = await stubPromise; + + partial.terminate('completed'); + await startPromise; + expect(manager.getPartialSession(partial.spec)).toBe(partial as unknown as EpochSession); // still in map + + // Any subsequent reconcile drops the terminal entry without cancelling it. + await manager.onTick(); + expect(manager.getPartialSession(partial.spec)).toBeUndefined(); + expect(partial.cancelReasons).toEqual([]); + expect(partial.state).toBe('completed'); + }); + + // ---------------- startProof ignores terminal sessions ---------------- + + it('startProof ignores a terminal full session and constructs a fresh partial', async () => { + // Existing full session that already terminated (e.g. it previously failed). 
startProof + // must NOT dedupe against it — it should construct a fresh partial instead. + const epoch = EpochNumber(7); + const canonical = [proverForCheckpoint(1, 14)]; + await openCanonicalFullSession(epoch, canonical); + const terminalFull = stubs[0]; + terminalFull.terminate('failed'); + expect(terminalFull.isTerminal()).toBe(true); + + store.listCanonicalForEpoch.mockResolvedValue(canonical); + store.listCanonicalInSlotRange.mockReturnValue(canonical); + + const stubPromise = awaitNextStub(); + const startPromise = manager.startProof(epoch); + const partial = await stubPromise; + + expect(partial.spec.kind).toBe('partial'); + expect(partial.spec.epochNumber).toEqual(epoch); + expect(partial).not.toBe(terminalFull); + expect(manager.getPartialSession(partial.spec)).toBe(partial as unknown as EpochSession); + + partial.terminate('completed'); + await startPromise; + }); + + it('startProof ignores a terminal partial session and constructs a fresh one', async () => { + const epoch = EpochNumber(7); + const canonical = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(canonical); + store.listCanonicalInSlotRange.mockReturnValue(canonical); + + // Open a partial, settle it terminally, then call startProof again. + const firstPromise = awaitNextStub(); + const firstStart = manager.startProof(epoch); + const firstPartial = await firstPromise; + firstPartial.terminate('failed'); + await firstStart; + expect(firstPartial.isTerminal()).toBe(true); + + // Second startProof must construct a fresh partial. 
+ const secondPromise = awaitNextStub(); + const secondStart = manager.startProof(epoch); + const secondPartial = await secondPromise; + + expect(secondPartial).not.toBe(firstPartial); + expect(secondPartial.uuid).not.toBe(firstPartial.uuid); + expect(secondPartial.spec).toEqual(firstPartial.spec); + expect(secondPartial.state).toBe('awaiting-checkpoints'); + expect(stubs).toHaveLength(2); + + secondPartial.terminate('completed'); + await secondStart; + }); + + // ---------------- startProof ---------------- + + it('startProof opens a partial session with fromSlot = firstSlotOfEpoch', async () => { + const epoch = EpochNumber(7); + // Epoch 7 covers slots [14, 15]. Single canonical prover at slot 14. + const canonical = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(canonical); + store.listCanonicalInSlotRange.mockReturnValue(canonical); + + // Arm the construction trigger before calling startProof — no need to sleep waiting + // for reconcile to land. + const stubPromise = awaitNextStub(); + const done = manager.startProof(epoch); + const partial = await stubPromise; + + expect(stubs.length).toBe(1); + expect(partial.spec).toEqual({ + kind: 'partial', + epochNumber: epoch, + fromSlot: SlotNumber(14), + toSlot: SlotNumber(14), + }); + expect(partial.provers).toEqual(canonical); + expect(partial.state).toBe('awaiting-checkpoints'); + expect(partial.isTerminal()).toBe(false); + expect(manager.getPartialSession(partial.spec)).toBe(partial as unknown as EpochSession); + + // startProof returns the job id without awaiting completion; await the resolved id. 
+ await done; + partial.terminate('completed'); + }); + + it('startProof throws when the epoch has no canonical content', async () => { + store.listCanonicalForEpoch.mockResolvedValue([]); + await expect(manager.startProof(EpochNumber(7))).rejects.toThrow(/No blocks found/); + }); + + it('startProof refuses to re-prove an epoch the proven chain already encompasses', async () => { + const epoch = EpochNumber(7); + // proverForCheckpoint builds a checkpoint whose single block number equals the checkpoint + // number (1 here). A proven tip at or beyond that block means the epoch is already proven. + store.listCanonicalForEpoch.mockResolvedValue([proverForCheckpoint(1, 14)]); + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(1)); + + await expect(manager.startProof(epoch)).rejects.toThrow(/already proven/i); + expect(stubs).toHaveLength(0); + }); + + it('startProof dedupes against an existing full session with the same range', async () => { + const epoch = EpochNumber(7); + // Checkpoint at the epoch's last slot (15) so the partial range startProof derives ([14,15]) + // matches the full session's range — otherwise the dedup guard wouldn't fire. + const provers = [proverForCheckpoint(1, 15)]; + await openCanonicalFullSession(epoch, provers); + expect(stubs.length).toBe(1); + const fullSession = stubs[0]; + + store.listCanonicalForEpoch.mockResolvedValue(provers); + const doneId = await manager.startProof(epoch); + fullSession.terminate('completed'); + + // No new session opened; startProof returned the existing full session's id. 
+ expect(doneId).toBe(fullSession.uuid); + expect(stubs.length).toBe(1); + }); + + it('startProof dedupes against an existing partial session with the same spec', async () => { + const epoch = EpochNumber(7); + const canonical = [proverForCheckpoint(1, 14)]; + store.listCanonicalForEpoch.mockResolvedValue(canonical); + store.listCanonicalInSlotRange.mockReturnValue(canonical); + + const firstId = await manager.startProof(epoch); + expect(stubs).toHaveLength(1); + const partial = stubs[0]; + expect(firstId).toBe(partial.uuid); + + // A second startProof for the same spec returns the existing partial's id without + // constructing a new session or cancelling the existing one. + const secondId = await manager.startProof(epoch); + expect(secondId).toBe(partial.uuid); + expect(stubs).toHaveLength(1); // no second stub ever constructed + expect(partial.cancelReasons).toEqual([]); // dedup path never cancels the existing partial + + partial.terminate('completed'); + }); + + // ---------------- stop ---------------- + + it('stop cancels every live session', async () => { + // Persistent mocks: every reconcile that runs sees consistent content for each epoch, + // so opening epoch 4 doesn't trigger a spurious 'canonical content changed' recreate + // of the epoch 3 session. + const epoch3Provers = [proverForCheckpoint(1, 6)]; + const epoch4Provers = [proverForCheckpoint(2, 8)]; + l2BlockSource.isEpochComplete.mockResolvedValue(true); + l2BlockSource.getCheckpoints.mockImplementation(({ epoch }: { epoch: EpochNumber }) => + Promise.resolve( + (Number(epoch) === 3 ? epoch3Provers : Number(epoch) === 4 ? 
epoch4Provers : []).map( + p => ({ checkpoint: p.checkpoint }) as any, + ), + ), + ); + store.listCanonicalInSlotRange.mockImplementation((fromSlot: SlotNumber) => { + if (Number(fromSlot) === 6) { + return epoch3Provers; + } + if (Number(fromSlot) === 8) { + return epoch4Provers; + } + return []; + }); + + await manager.onCheckpointAdded(EpochNumber(3)); + await manager.onCheckpointAdded(EpochNumber(4)); + expect(stubs).toHaveLength(2); + + await manager.stop(); + + expect(stubs.every(s => s.cancelled)).toBe(true); + expect(stubs.every(s => s.state === 'cancelled')).toBe(true); + expect(stubs.every(s => s.isTerminal())).toBe(true); + // stop() passes 'prover-node stopping' as the cancel reason — verify every session + // saw it, so a future caller can grep logs for that string. + expect(stubs.map(s => s.cancelReasons)).toEqual([['prover-node stopping'], ['prover-node stopping']]); + }); + + it('stop awaits sessions whose cancel is in flight', async () => { + await openCanonicalFullSession(EpochNumber(3), [proverForCheckpoint(1, 6)]); + const session = stubs[0]; + + // Hold cancel until the test releases the gate. stop() must wait on the in-flight + // cancel rather than returning early. + const cancelGate = promiseWithResolvers(); + session.cancelBlocker = cancelGate.promise; + + const stopPromise = manager.stop(); + // Trigger: cancelStarted fires the moment SessionManager.stop invokes session.cancel. + await session.cancelStarted.promise; + // Confirm stop is still pending and the cancel hasn't reached completion. + expect(session.cancelled).toBe(false); + expect(session.state).toBe('awaiting-checkpoints'); + + // Release the gate; stop now returns. 
+ cancelGate.resolve(); + await stopPromise; + expect(session.cancelled).toBe(true); + expect(session.state).toBe('cancelled'); + expect(session.cancelReasons).toEqual(['prover-node stopping']); + }); + + it('rejects further reconcile scheduling once stop has drained the queue', async () => { + await manager.stop(); + + // After stop, reconcileQueue.cancel() has fired — any new enqueue attempt rejects. + // We assert via onCheckpointAdded, the most common entry point for new reconciles. + await expect(manager.onCheckpointAdded(EpochNumber(3))).rejects.toThrow(/enqueue/i); + expect(stubs).toHaveLength(0); + expect(manager.getFullSession(EpochNumber(3))).toBeUndefined(); + }); + + // ---------------- helpers ---------------- + + async function openCanonicalFullSession(epoch: EpochNumber, provers: CheckpointProver[]): Promise { + l2BlockSource.isEpochComplete.mockResolvedValueOnce(true); + l2BlockSource.getCheckpoints.mockResolvedValueOnce(provers.map(p => ({ checkpoint: p.checkpoint }) as any)); + store.listCanonicalInSlotRange.mockReturnValueOnce(provers); + await manager.onCheckpointAdded(epoch); + } + + /** + * Arms a single-shot trigger that fires the moment the manager constructs the next stub + * session. Returns a promise that resolves with that stub. Use this instead of sleeping + * after an action that schedules a reconcile — the manager itself signals "session ready" + * via the factory call. + */ + function awaitNextStub(): Promise { + const { promise, resolve } = promiseWithResolvers(); + onConstruct = stub => { + onConstruct = undefined; + resolve(stub); + }; + return promise; + } + + /** + * Mocks the chain tip that `nextUnprovenEpoch` reads: proven height = `provenBlock`, with the + * first unproven block (`provenBlock + 1`) sitting at `firstUnprovenSlot` — or not yet mined + * when `undefined`. With epochDuration=2, slot N lives in epoch ⌊N/2⌋: e.g. slot 6 → epoch 3. 
+ */ + function mockNextUnprovenSlot(provenBlock: number, firstUnprovenSlot: number | undefined) { + l2BlockSource.getBlockNumber.mockResolvedValue(BlockNumber(provenBlock)); + l2BlockSource.getBlockData.mockImplementation((query: any) => { + if (!('number' in query) || Number(query.number) !== provenBlock + 1 || firstUnprovenSlot === undefined) { + return Promise.resolve(undefined); + } + return Promise.resolve({ header: { getSlot: () => SlotNumber(firstUnprovenSlot) } } as any); + }); + } +}); + +/** + * Subclass that overrides `constructSession` with an injected factory so tests can hand + * back stub sessions whose lifecycle they control. + */ +class TestSessionManager extends SessionManager { + constructor( + deps: Omit, + private readonly factory: (spec: SessionSpec, provers: readonly CheckpointProver[]) => EpochSession, + ) { + super(deps); + } + + protected override constructSession(spec: SessionSpec, provers: readonly CheckpointProver[]): EpochSession { + return this.factory(spec, provers); + } +} + +/** Minimal EpochSession-shaped stub for SessionManager-level tests. */ +type StubSession = { + spec: SessionSpec; + provers: readonly CheckpointProver[]; + uuid: string; + state: EpochProvingJobState; + cancelled: boolean; + /** Reasons captured for every cancel(reason) call. Lets assertions verify "why" the cancel fired. */ + cancelReasons: string[]; + /** Optional gate held by tests that want to drive a cancel mid-flight. */ + cancelBlocker?: Promise; + /** Resolves the first time cancel() is invoked — tests use it to know when stop's cancel call lands.
*/ + cancelStarted: ReturnType>; + donePromise: Promise; + resolveDone: (s: EpochProvingJobState) => void; + terminate(state: EpochProvingJobState): void; + // EpochSession interface methods used by SessionManager: + getSpec(): SessionSpec; + getId(): string; + getState(): EpochProvingJobState; + getEpochNumber(): EpochNumber; + getCheckpoints(): readonly CheckpointProver[]; + isTerminal(): boolean; + cancel(reason?: string): Promise; + start(): Promise; + whenDone(): Promise; +}; + +let stubCounter = 0; + +function makeStubSession(spec: SessionSpec, provers: readonly CheckpointProver[]): StubSession { + const { promise, resolve } = promiseWithResolvers(); + const stub: StubSession = { + spec, + provers, + uuid: `stub-${stubCounter++}`, + state: 'awaiting-checkpoints', + cancelled: false, + cancelReasons: [], + cancelStarted: promiseWithResolvers(), + donePromise: promise, + resolveDone: resolve, + terminate(state) { + this.state = state; + this.resolveDone(state); + }, + getSpec() { + return this.spec; + }, + getId() { + return this.uuid; + }, + getState() { + return this.state; + }, + getEpochNumber() { + return this.spec.epochNumber; + }, + getCheckpoints() { + return this.provers; + }, + isTerminal() { + const terminal: EpochProvingJobState[] = [ + 'completed', + 'superseded', + 'failed', + 'stopped', + 'cancelled', + 'timed-out', + ]; + return terminal.includes(this.state); + }, + async cancel(reason?: string) { + this.cancelReasons.push(reason ?? 'cancelled'); + this.cancelStarted.resolve(); + if (this.cancelBlocker) { + await this.cancelBlocker; + } + this.cancelled = true; + this.terminate('cancelled'); + }, + start() { + return this.donePromise; + }, + whenDone() { + return this.donePromise; + }, + }; + return stub; +} + +/** + * Minimal checkpoint content carrying just enough for `CheckpointProver.idFor` (number, slot, + * archive root). 
The archive root is derived from (number, slot) so identical (number, slot) pairs + * produce identical content-addressed ids — letting archiver-side and store-side stubs match. + */ +function makeCheckpointContent(number: number, slot: number) { + return { + number, + header: { slotNumber: SlotNumber(slot) }, + archive: { root: { toString: () => `root-${number}-${slot}` } }, + blocks: [{ number }], + } as any; +} + +function proverForCheckpoint(number: number, slot: number): CheckpointProver { + const checkpoint = makeCheckpointContent(number, slot); + return { + id: CheckpointProver.idFor(checkpoint), + checkpoint, + slotNumber: SlotNumber(slot), + isPruned: () => false, + isCancelled: () => false, + } as unknown as CheckpointProver; +} + +/** Archiver-side PublishedCheckpoint stub whose content matches `proverForCheckpoint(number, slot)`. */ +function archiverCp(number: number, slot: number) { + return { checkpoint: makeCheckpointContent(number, slot) } as any; +} + +function proverWithSlot(slot: number): CheckpointProver { + return proverForCheckpoint(1, slot); +} + +/** Minimal Histogram/Gauge/Counter stub: only the methods ProverNodeJobMetrics records into. 
*/ +function noopMetric() { + return { record: () => {}, add: () => {} }; +} diff --git a/yarn-project/prover-node/src/session-manager.ts b/yarn-project/prover-node/src/session-manager.ts new file mode 100644 index 000000000000..3430815c205f --- /dev/null +++ b/yarn-project/prover-node/src/session-manager.ts @@ -0,0 +1,552 @@ +import { BlockNumber, type EpochNumber } from '@aztec/foundation/branded-types'; +import { Fr } from '@aztec/foundation/curves/bn254'; +import type { EthAddress } from '@aztec/foundation/eth-address'; +import { type Logger, type LoggerBindings, createLogger } from '@aztec/foundation/log'; +import { SerialQueue } from '@aztec/foundation/queue'; +import { RunningPromise } from '@aztec/foundation/running-promise'; +import type { DateProvider } from '@aztec/foundation/timer'; +import type { EpochProverFactory } from '@aztec/prover-client'; +import type { L2BlockSource } from '@aztec/stdlib/block'; +import type { PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; +import { + type L1RollupConstants, + getEpochAtSlot, + getProofSubmissionDeadlineTimestamp, + getSlotRangeForEpoch, +} from '@aztec/stdlib/epoch-helpers'; +import type { EpochProvingJobState } from '@aztec/stdlib/interfaces/server'; + +import type { CheckpointStore } from './checkpoint-store.js'; +import { CheckpointProver } from './job/checkpoint-prover.js'; +import type { EpochProvingJobData } from './job/epoch-proving-job-data.js'; +import { + EpochSession, + type EpochSessionDeps, + type EpochSessionHooks, + type EpochSessionOptions, + type SessionSpec, + specKey, +} from './job/epoch-session.js'; +import type { ProverNodeJobMetrics } from './metrics.js'; +import type { ProofPublishingService } from './proof-publishing-service.js'; + +/** Trigger payload for `reconcile`. 
*/ +export type ReconcileTrigger = + | { kind: 'checkpoint'; epoch: EpochNumber } + | { kind: 'prune'; affectedEpochs: EpochNumber[] } + | { kind: 'tick' } + | { kind: 'start-proof'; spec: SessionSpec }; + +/** Config bag for session lifecycle decisions. */ +export type SessionManagerConfig = { + /** Cap on the number of non-terminal sessions (full + partial). 0 disables. */ + maxPendingJobs: number; + /** Interval at which the internal periodic tick fires `reconcile({ kind: 'tick' })`. */ + tickIntervalMs: number; + /** Forwarded to every session: delay before top-tree proving, letting late reorgs settle. */ + finalizationDelayMs: number | undefined; +}; + +export type SessionManagerDeps = { + checkpointStore: CheckpointStore; + l2BlockSource: Pick< + L2BlockSource, + 'isEpochComplete' | 'getCheckpoints' | 'getL1Constants' | 'getBlockNumber' | 'getBlockData' + >; + proverFactory: EpochProverFactory; + proverId: EthAddress; + publishingService: ProofPublishingService; + metrics: ProverNodeJobMetrics; + dateProvider: DateProvider; + config: SessionManagerConfig; + /** + * Optional callback fired when a session terminates with `failed`. The session manager + * doesn't own the failure-upload action; it just notifies the owner. + */ + onSessionFailed?: (session: EpochSession) => Promise; + bindings?: LoggerBindings; +}; + +/** + * Owns the lifecycle of every `EpochSession`. Each L2BlockStream event and periodic tick + * arrives via a dedicated entry point (`onCheckpointAdded`, `onPrune`, `onTick`, etc.) which + * schedules a `reconcile(trigger)` on a serial queue. Reconcile walks both session + * maps, cancels any session whose canonical content has shifted, re-creates it with + * the same spec but new content, and opens fresh full sessions for any epoch implicated + * by the trigger. 
+ */ +export class SessionManager { + private readonly log: Logger; + private readonly fullSessions: Map = new Map(); + private readonly partialSessions: Map = new Map(); + /** + * Serialises every reconcile call. The trigger sources (L2BlockStream events, the + * periodic tick, JSON-RPC `startProof`) run independently, so without this queue two + * reconciles could interleave on the `await session.cancel(...)` step and orphan a + * freshly-constructed session. + */ + private readonly reconcileQueue = new SerialQueue(); + /** Cached L1 constants, populated on first read. */ + private cachedL1Constants: L1RollupConstants | undefined; + /** + * Highest epoch for which the periodic tick has successfully created a full session. + * Monotonic high-water mark: once the tick observes a session for epoch X, it stops + * trying to open one — even if that session subsequently fails (only a new checkpoint + * event reopens it). Crucially, the mark only advances when a session actually exists + * post-open, so transient blockers (atMaxSessionLimit, archiver still indexing) leave + * the mark in place and the next tick retries. + */ + private lastTickEpoch: EpochNumber | undefined; + /** Test-only hooks applied to every session this manager constructs. */ + private sessionHooks: EpochSessionHooks | undefined; + /** Periodic tick that nudges reconcile to pick up newly-complete epochs. Started by `start()`. */ + private epochTicker: RunningPromise | undefined; + + constructor(private readonly deps: SessionManagerDeps) { + this.log = createLogger('prover-node:session-manager', deps.bindings); + this.reconcileQueue.start(); + } + + /** + * Starts the periodic tick. Separated from the constructor so tests can drive `onTick()` + * manually without the background ticker interleaving. Idempotent. 
+ */ + public start(): void { + if (this.epochTicker) { + return; + } + this.epochTicker = new RunningPromise(() => this.onTick(), this.log, this.deps.config.tickIntervalMs); + this.epochTicker.start(); + } + + /** + * Installs hooks applied to every session constructed from now on. Used by the e2e + * harness to interpose around top-tree proving (gate it, override it, observe it) + * without monkey-patching the orchestrator factory. + */ + public setSessionHooks(hooks: EpochSessionHooks): void { + this.sessionHooks = hooks; + } + + // ---------------- read-only views ---------------- + + /** Every tracked session, full then partial; may include terminal ones not yet pruned by a reconcile. */ + public allSessions(): EpochSession[] { + return [...this.fullSessions.values(), ...this.partialSessions.values()]; + } + + /** Returns the full session for `epoch`, if any. */ + public getFullSession(epoch: EpochNumber): EpochSession | undefined { + return this.fullSessions.get(epoch); + } + + /** Returns the partial session for `spec`, if any. */ + public getPartialSession(spec: SessionSpec): EpochSession | undefined { + return this.partialSessions.get(specKey(spec)); + } + + /** Observability summary used by the prover-node API. */ + public getJobs(): { uuid: string; status: EpochProvingJobState; epochNumber: EpochNumber }[] { + return this.allSessions().map(s => ({ + uuid: s.getId(), + status: s.getState(), + epochNumber: s.getEpochNumber(), + })); + } + + // ---------------- event entry points ---------------- + + /** Called by ProverNode after a chain-checkpointed event has been added to the store. */ + public onCheckpointAdded(epoch: EpochNumber): Promise { + return this.scheduleReconcile({ kind: 'checkpoint', epoch }); + } + + /** Called by ProverNode after a chain-pruned event has flipped store provers to pruned. */ + public onPrune(affectedEpochs: EpochNumber[]): Promise { + return this.scheduleReconcile({ kind: 'prune', affectedEpochs }); + } + + /** + * Called periodically by this manager's own `epochTicker` once `start()` has run.
Picks up epochs that have become complete + * by time without a fresh checkpoint event (e.g. the epoch's last slots are empty), and + * advances to the next epoch once the previous one is proven on L1. + */ + public onTick(): Promise { + return this.scheduleReconcile({ kind: 'tick' }); + } + + // ---------------- public API ---------------- + + /** + * Schedules a proof attempt for the supplied epoch and returns the job id without waiting for + * the proof to complete — proving can far outlast an HTTP request, so callers poll `getJobs()` + * for the outcome. Every session — full or partial — begins at the epoch's first slot; the + * partial's spec stops at the last canonical slot, while the full's stops at the epoch's last + * slot. Dedupes against any existing session covering the same range, returning its id. + */ + public async startProof(epoch: EpochNumber): Promise { + const canonical = await this.deps.checkpointStore.listCanonicalForEpoch(epoch); + if (canonical.length === 0) { + throw new EmptyEpochError(epoch); + } + // Don't re-prove an epoch the L1 proven chain already encompasses — it was already proven + // (possibly by another prover node), so a fresh proof would be wasted work. + if (await this.isProvenChainEncompassing(canonical)) { + throw new EpochAlreadyProvenError(epoch); + } + const l1Constants = await this.getL1Constants(); + const [fromSlot] = getSlotRangeForEpoch(epoch, l1Constants); + const toSlot = canonical[canonical.length - 1].slotNumber; + const spec: SessionSpec = { kind: 'partial', epochNumber: epoch, fromSlot, toSlot }; + + // Reuse a session already covering this exact range rather than scheduling a duplicate. 
+ const existingFull = this.getFullSession(epoch); + if ( + existingFull && + !existingFull.isTerminal() && + existingFull.getSpec().fromSlot === fromSlot && + existingFull.getSpec().toSlot === toSlot + ) { + return existingFull.getId(); + } + const existingPartial = this.getPartialSession(spec); + if (existingPartial && !existingPartial.isTerminal()) { + return existingPartial.getId(); + } + + await this.scheduleReconcile({ kind: 'start-proof', spec }); + const created = this.getPartialSession(spec); + if (!created) { + throw new Error(`Failed to schedule partial proof for epoch ${epoch}`); + } + return created.getId(); + } + + /** Stops the tick, drains the reconcile queue, and cancels every live session. */ + public async stop(): Promise { + await this.epochTicker?.stop(); + await this.reconcileQueue.cancel(); + const sessions = this.allSessions(); + await Promise.allSettled(sessions.map(s => s.cancel('prover-node stopping'))); + } + + // ---------------- reconcile ---------------- + + private scheduleReconcile(trigger: ReconcileTrigger): Promise { + return this.reconcileQueue.put(() => this.reconcile(trigger)); + } + + private async reconcile(trigger: ReconcileTrigger): Promise { + this.log.debug(`Reconciling`, { trigger }); + + this.recreateInvalidSessions(); + + const implicatedEpochs = await this.epochsForTrigger(trigger); + for (const epoch of implicatedEpochs) { + await this.openFullSessionIfReady(epoch); + } + + // Advance the tick high-water mark only once a session actually exists for the epoch. + // `openFullSessionIfReady` can early-return without creating one (atMaxSessionLimit, + // archiver still indexing, etc.); in those cases we want the next tick to try again + // rather than skip the epoch forever. 
+ if (trigger.kind === 'tick' && implicatedEpochs.length === 1) { + const epoch = implicatedEpochs[0]; + if (this.fullSessions.has(epoch)) { + this.lastTickEpoch = epoch; + } + } + + if (trigger.kind === 'start-proof') { + this.openPartialSession(trigger.spec); + } + } + + private recreateInvalidSessions(): void { + for (const [key, session] of Array.from(this.fullSessions.entries())) { + if (session.isTerminal()) { + this.fullSessions.delete(key); + continue; + } + const canonical = this.canonicalCheckpointsForSpec(session.getSpec()); + if (!this.checkpointsMatch(session.getCheckpoints(), canonical)) { + this.fireAndForgetCancel(session, 'canonical content changed'); + this.fullSessions.delete(key); + if (canonical.length > 0) { + const newSession = this.constructSession(session.getSpec(), canonical); + this.fullSessions.set(key, newSession); + void this.runSession(newSession); + } + } + } + for (const [key, session] of Array.from(this.partialSessions.entries())) { + if (session.isTerminal()) { + this.partialSessions.delete(key); + continue; + } + const canonical = this.canonicalCheckpointsForSpec(session.getSpec()); + if (!this.checkpointsMatch(session.getCheckpoints(), canonical)) { + this.fireAndForgetCancel(session, 'canonical content changed'); + this.partialSessions.delete(key); + if (canonical.length > 0) { + const newSession = this.constructSession(session.getSpec(), canonical); + this.partialSessions.set(key, newSession); + void this.runSession(newSession); + } + } + } + } + + private async openFullSessionIfReady(epoch: EpochNumber): Promise { + if (this.fullSessions.has(epoch)) { + return; + } + if (this.atMaxSessionLimit()) { + this.log.debug(`Skipping full-session open for epoch ${epoch}: max pending jobs reached`); + return; + } + if (!(await this.deps.l2BlockSource.isEpochComplete(epoch))) { + return; + } + const l1Constants = await this.getL1Constants(); + const archiverCps = await this.deps.l2BlockSource.getCheckpoints({ epoch }); + if 
(archiverCps.length === 0) { + return; + } + const [fromSlot, toSlot] = getSlotRangeForEpoch(epoch, l1Constants); + const canonical = this.deps.checkpointStore.listCanonicalInSlotRange(fromSlot, toSlot); + if (!this.archiverFullyCovered(archiverCps, canonical)) { + this.log.debug(`Skipping full-session open for epoch ${epoch}: archiver checkpoints not all in store`, { + archiverCount: archiverCps.length, + storeCount: canonical.length, + }); + return; + } + const spec: SessionSpec = { kind: 'full', epochNumber: epoch, fromSlot, toSlot }; + const session = this.constructSession(spec, canonical); + this.fullSessions.set(epoch, session); + void this.runSession(session); + } + + private openPartialSession(spec: SessionSpec): void { + const canonical = this.deps.checkpointStore.listCanonicalInSlotRange(spec.fromSlot, spec.toSlot); + if (canonical.length === 0) { + return; + } + // Reuse a live partial session for this epoch whose checkpoint set already matches the + // canonical content — e.g. a repeated `startProof` with no new checkpoints mined since the + // last one. Reconstructing would re-prove identical content and burn a pending-job slot. 
+ const existing = Array.from(this.partialSessions.values()).find( + s => + s.getSpec().epochNumber === spec.epochNumber && + !s.isTerminal() && + this.checkpointsMatch(s.getCheckpoints(), canonical), + ); + if (existing) { + return; + } + if (this.atMaxSessionLimit()) { + throw new Error(`Maximum pending proving jobs ${this.deps.config.maxPendingJobs} reached.`); + } + const session = this.constructSession(spec, canonical); + this.partialSessions.set(specKey(spec), session); + void this.runSession(session); + } + + // ---------------- session construction ---------------- + + protected constructSession(spec: SessionSpec, checkpoints: readonly CheckpointProver[]): EpochSession { + return this.doConstructSession(spec, checkpoints, this.buildSessionDeps(spec.epochNumber), this.sessionHooks); + } + + /** Extracted for test override. */ + protected doConstructSession( + spec: SessionSpec, + checkpoints: readonly CheckpointProver[], + sessionDeps: EpochSessionDeps, + hooks?: EpochSessionHooks, + ): EpochSession { + return new EpochSession(spec, checkpoints, { ...sessionDeps, hooks }); + } + + private buildSessionDeps(epochNumber: EpochNumber): EpochSessionDeps { + const config: EpochSessionOptions = { + finalizationDelayMs: this.deps.config.finalizationDelayMs, + }; + return { + proverFactory: this.deps.proverFactory, + proverId: this.deps.proverId, + publishingService: this.deps.publishingService, + metrics: this.deps.metrics, + dateProvider: this.deps.dateProvider, + deadline: this.computeDeadline(epochNumber), + config, + bindings: this.deps.bindings, + }; + } + + private computeDeadline(epochNumber: EpochNumber): Date | undefined { + if (!this.cachedL1Constants) { + return undefined; + } + const ts = getProofSubmissionDeadlineTimestamp(epochNumber, this.cachedL1Constants); + return new Date(Number(ts) * 1000); + } + + private async runSession(session: EpochSession): Promise { + // A reconcile may have cancelled this session before it starts (content-change + // 
recreation). Don't proceed — start() would build a TopTreeJob that should never run. + if (session.isTerminal()) { + this.log.debug(`Skipping start for ${session.getId()}: already terminal (${session.getState()})`); + return; + } + const state = await session.start(); + this.log.info(`Session ${session.getId()} exited with state ${state}`); + if (state === 'failed' && this.deps.onSessionFailed) { + try { + await this.deps.onSessionFailed(session); + } catch (err) { + this.log.error(`Error in onSessionFailed callback for ${session.getSpec().epochNumber}`, err); + } + } + } + + /** + * Builds the EpochProvingJobData snapshot for failure upload. Includes every checkpoint + * referenced by the session, regardless of whether sub-tree proving completed — + * partial state is still useful for post-mortem analysis. + */ + public static buildSessionProvingData(session: EpochSession): EpochProvingJobData { + const checkpoints = session.getCheckpoints(); + const txs = new Map(); + const l1ToL2Messages: Record = {}; + for (const c of checkpoints) { + for (const [hash, tx] of c.txs) { + txs.set(hash, tx); + } + l1ToL2Messages[c.checkpoint.number] = c.l1ToL2Messages; + } + return { + epochNumber: session.getSpec().epochNumber, + checkpoints: checkpoints.map(c => c.checkpoint), + txs, + l1ToL2Messages, + previousBlockHeader: checkpoints[0].previousBlockHeader, + attestations: [], + }; + } + + // ---------------- reconcile helpers ---------------- + + private atMaxSessionLimit(): boolean { + const { maxPendingJobs: max } = this.deps.config; + if (!max || max <= 0) { + return false; + } + const live = this.allSessions().filter(s => !s.isTerminal()).length; + return live >= max; + } + + private async epochsForTrigger(trigger: ReconcileTrigger): Promise { + switch (trigger.kind) { + case 'checkpoint': + return [trigger.epoch]; + case 'prune': + return trigger.affectedEpochs; + case 'tick': { + const epoch = await this.nextUnprovenEpoch(); + if (epoch === undefined || 
(this.lastTickEpoch !== undefined && epoch <= this.lastTickEpoch)) { + return []; + } + return [epoch]; + } + case 'start-proof': + return []; + } + } + + /** + * The next epoch to prove: the epoch containing the first block after the proven tip. + * Returns undefined when that block has not been mined yet (e.g. nothing new to prove). + * Subsequent ticks advance only once the chain's proven height moves forward, so epochs + * are proven in order rather than all at once. + */ + private async nextUnprovenEpoch(): Promise { + const lastProven = (await this.deps.l2BlockSource.getBlockNumber({ tag: 'proven' })) ?? BlockNumber.ZERO; + const firstToProve = BlockNumber(lastProven + 1); + const header = (await this.deps.l2BlockSource.getBlockData({ number: firstToProve }))?.header; + if (!header) { + return undefined; + } + return getEpochAtSlot(header.getSlot(), await this.getL1Constants()); + } + + private canonicalCheckpointsForSpec(spec: SessionSpec): CheckpointProver[] { + return this.deps.checkpointStore.listCanonicalInSlotRange(spec.fromSlot, spec.toSlot); + } + + private fireAndForgetCancel(session: EpochSession, reason: string): void { + void session.cancel(reason).catch(err => this.log.warn(`Error cancelling session ${session.getId()}`, err)); + } + + private checkpointsMatch(a: readonly CheckpointProver[], b: readonly CheckpointProver[]): boolean { + if (a.length !== b.length) { + return false; + } + for (let i = 0; i < a.length; i++) { + if (a[i].id !== b[i].id || a[i].isCancelled()) { + return false; + } + } + return true; + } + + private archiverFullyCovered( + archiverCps: readonly PublishedCheckpoint[], + storeCps: readonly CheckpointProver[], + ): boolean { + if (storeCps.length < archiverCps.length) { + return false; + } + // Compare by content-addressed id (number, slot, archive root) rather than checkpoint number: + // a reorg can keep the number while changing the checkpoint's post-state archive root. 
+ const storeIds = new Set(storeCps.map(p => p.id)); + return archiverCps.every(cp => storeIds.has(CheckpointProver.idFor(cp.checkpoint))); + } + + /** + * Returns true if the L1 proven tip already covers every canonical checkpoint in the set — i.e. + * the epoch has already been fully proven, so there is no point starting a new proof for it. + * Conservatively returns false when nothing is proven yet. + */ + private async isProvenChainEncompassing(canonical: readonly CheckpointProver[]): Promise { + const provenBlock = await this.deps.l2BlockSource.getBlockNumber({ tag: 'proven' }); + if (!provenBlock || provenBlock <= 0) { + return false; + } + const lastCheckpoint = canonical[canonical.length - 1].checkpoint; + const lastBlock = lastCheckpoint.blocks[lastCheckpoint.blocks.length - 1].number; + return provenBlock >= lastBlock; + } + + private async getL1Constants(): Promise { + if (!this.cachedL1Constants) { + this.cachedL1Constants = await this.deps.l2BlockSource.getL1Constants(); + } + return this.cachedL1Constants; + } +} + +class EmptyEpochError extends Error { + constructor(epochNumber: EpochNumber) { + super(`No blocks found for epoch ${epochNumber}`); + this.name = 'EmptyEpochError'; + } +} + +class EpochAlreadyProvenError extends Error { + constructor(epochNumber: EpochNumber) { + super(`Epoch ${epochNumber} is already proven on L1`); + this.name = 'EpochAlreadyProvenError'; + } +} diff --git a/yarn-project/prover-node/src/test/index.ts b/yarn-project/prover-node/src/test/index.ts index ccd4fa0db6b1..87c9d10b94d5 100644 --- a/yarn-project/prover-node/src/test/index.ts +++ b/yarn-project/prover-node/src/test/index.ts @@ -1,14 +1,14 @@ +import type { EpochProverFactory } from '@aztec/prover-client'; import type { EpochProverManager } from '@aztec/stdlib/interfaces/server'; -import type { EpochProvingJob } from '../job/epoch-proving-job.js'; -import type { ProverNodePublisher } from '../prover-node-publisher.js'; +import type { ProofPublishingService } from 
'../proof-publishing-service.js'; import { ProverNode } from '../prover-node.js'; +import type { SessionManager } from '../session-manager.js'; abstract class TestProverNodeClass extends ProverNode { - declare public prover: EpochProverManager; - declare public publisher: ProverNodePublisher; - - public abstract override tryUploadEpochFailure(job: EpochProvingJob): Promise; + declare public prover: EpochProverManager & EpochProverFactory; + declare public publishingService: ProofPublishingService; + declare public sessionManager: SessionManager; } export type TestProverNode = TestProverNodeClass; diff --git a/yarn-project/stdlib/src/deserialization/index.ts b/yarn-project/stdlib/src/deserialization/index.ts index 3aad1911c6db..fc5e693d9357 100644 --- a/yarn-project/stdlib/src/deserialization/index.ts +++ b/yarn-project/stdlib/src/deserialization/index.ts @@ -12,12 +12,13 @@ export const MAX_TXS_PER_BLOCK = 2 ** 16; export const MAX_COMMITTEE_SIZE = 2048; /** - * Physical maximum number of L2 blocks a single checkpoint can hold. + * Maximum number of L2 blocks in a provable checkpoint. Used for deserialization and when ingesting + * checkpoints already accepted by L1. * - * This MUST be >= the number of blocks the proving system can actually carry. If it were lower, a - * structurally-valid checkpoint that L1 accepts and that can be proven would be rejected on ingest - * (`validateCheckpoint`) and wedge the archiver. The circuits and L1 impose no explicit per-checkpoint - * block count; the real ceiling is the blob-field budget: + * This MUST be >= the number of blocks the proving system can carry, or a structurally-valid, provable + * checkpoint that L1 accepts would be rejected on ingest (`validateCheckpoint`) and wedge the archiver. 
+ * The circuits and L1 impose no explicit per-checkpoint block count; the ceiling comes from the + * blob-field budget: * * budget = BLOBS_PER_CHECKPOINT * FIELDS_PER_BLOB = 6 * 4096 = 24,576 fields * @@ -30,10 +31,12 @@ export const MAX_COMMITTEE_SIZE = 2048; * * max N: 7 + 10*(N - 1) + 1 <= 24,576 => 10*(N - 1) <= 24,568 => N <= 2,457 * - * Only the first block may be empty; all other blocks require >= 1 tx (the circuits have no empty-tx - * variant for non-first blocks). 2457 is exactly this ceiling: any checkpoint above the blob budget - * cannot be encoded, so it can never be posted to L1. Invariant checked by the unit test in - * deserialization.test.ts. Used for deserialization and when ingesting checkpoints already on L1. + * Only the first block may be empty; every other block needs >= 1 tx (the circuits have no empty-tx + * variant for non-first blocks), so 2457 is the largest provable checkpoint. The blob format can encode + * more blocks than this (up to ~4095 with all-empty blocks), but such a checkpoint is unprovable and + * can only reach L1 with a malicious committee supermajority — a terminal network compromise where a + * wedged archiver is an acceptable outcome. We therefore bound ingest to the provable maximum. + * Invariant checked by the unit test in deserialization.test.ts. 
*/ export const MAX_CAPACITY_BLOCKS_PER_CHECKPOINT = 2457; diff --git a/yarn-project/stdlib/src/interfaces/epoch-prover.ts b/yarn-project/stdlib/src/interfaces/epoch-prover.ts deleted file mode 100644 index 4a0103ea66b8..000000000000 --- a/yarn-project/stdlib/src/interfaces/epoch-prover.ts +++ /dev/null @@ -1,72 +0,0 @@ -import type { BatchedBlob, FinalBlobBatchingChallenges } from '@aztec/blob-lib/types'; -import type { BlockNumber, EpochNumber } from '@aztec/foundation/branded-types'; -import type { Fr } from '@aztec/foundation/curves/bn254'; -import type { EthAddress } from '@aztec/foundation/eth-address'; - -import type { Proof } from '../proofs/proof.js'; -import type { CheckpointConstantData } from '../rollup/checkpoint_constant_data.js'; -import type { RootRollupPublicInputs } from '../rollup/root_rollup_public_inputs.js'; -import type { BlockHeader } from '../tx/block_header.js'; -import type { Tx } from '../tx/tx.js'; -import type { UInt64 } from '../types/index.js'; -import type { IBlockFactory } from './block-builder.js'; - -/** Coordinates the proving of an entire epoch. */ -export interface EpochProver extends Omit { - /** - * Starts a new epoch. Must be the first method to be called. - * @param epochNumber - The epoch number. - * @param totalNumCheckpoints - The total number of checkpoints expected in the epoch (must be at least one). - * @param finalBlobBatchingChallenges - The final blob batching challenges for the epoch. - **/ - startNewEpoch( - epochNumber: EpochNumber, - totalNumCheckpoints: number, - finalBlobBatchingChallenges: FinalBlobBatchingChallenges, - ): void; - - /** - * Starts a new checkpoint. - * @param checkpointIndex - The index of the checkpoint in the epoch. - * @param constants - The constants for this checkpoint. - * @param l1ToL2Messages - The set of L1 to L2 messages to be included in this checkpoint. - * @param totalNumBlocks - The total number of blocks expected in the checkpoint (must be at least one). 
- * @param headerOfLastBlockInPreviousCheckpoint - The header of the last block in the previous checkpoint. - */ - startNewCheckpoint( - checkpointIndex: number, - constants: CheckpointConstantData, - l1ToL2Messages: Fr[], - totalNumBlocks: number, - headerOfLastBlockInPreviousCheckpoint: BlockHeader, - ): Promise; - - /** - * Starts a new block. - * @param blockNumber - The block number. - * @param timestamp - The timestamp of the block. - * @param totalNumTxs - The total number of txs in the block. - */ - startNewBlock(blockNumber: BlockNumber, timestamp: UInt64, totalNumTxs: number): Promise; - - /** - * Kickstarts chonk verifier circuits for the specified txs. These will be used during epoch proving. - * Note that if the chonk verifier circuits are not started this way, they will be started nonetheless after processing. - */ - startChonkVerifierCircuits(txs: Tx[]): Promise; - - /** Returns the block. */ - setBlockCompleted(blockNumber: BlockNumber, expectedBlockHeader?: BlockHeader): Promise; - - /** Pads the epoch with empty block roots if needed and blocks until proven. Throws if proving has failed. */ - finalizeEpoch(): Promise<{ publicInputs: RootRollupPublicInputs; proof: Proof; batchedBlobInputs: BatchedBlob }>; - - /** Cancels all proving jobs. */ - cancel(): void; - - /** Returns an identifier for the prover or zero if not set. 
*/ - getProverId(): EthAddress; - - /** Called when no longer required, cleans up internal resources */ - stop(): Promise; -} diff --git a/yarn-project/stdlib/src/interfaces/prover-client.ts b/yarn-project/stdlib/src/interfaces/prover-client.ts index 8b3928a5dff6..126cd1f971cb 100644 --- a/yarn-project/stdlib/src/interfaces/prover-client.ts +++ b/yarn-project/stdlib/src/interfaces/prover-client.ts @@ -5,7 +5,6 @@ import { z } from 'zod'; import { schemas, zodFor } from '../schemas/index.js'; import type { TxHash } from '../tx/tx_hash.js'; -import type { EpochProver } from './epoch-prover.js'; import type { ProvingJobConsumer } from './prover-broker.js'; export type ActualProverConfig = { @@ -124,10 +123,9 @@ function parseProverId(str: string) { /** * The interface to the prover client. * Provides the ability to generate proofs and build rollups. + * */ export interface EpochProverManager { - createEpochProver(): EpochProver; - start(): Promise; stop(): Promise; diff --git a/yarn-project/stdlib/src/interfaces/prover-node.test.ts b/yarn-project/stdlib/src/interfaces/prover-node.test.ts index 2b1c6c00fc6f..770412e760f9 100644 --- a/yarn-project/stdlib/src/interfaces/prover-node.test.ts +++ b/yarn-project/stdlib/src/interfaces/prover-node.test.ts @@ -85,16 +85,17 @@ class MockProverNode implements ProverNodeApi { getJobs(): Promise<{ uuid: string; status: EpochProvingJobState; epochNumber: number }[]> { return Promise.resolve([ { uuid: 'uuid1', status: 'initialized', epochNumber: 10 }, - { uuid: 'uuid2', status: 'processing', epochNumber: 10 }, - { uuid: 'uuid3', status: 'awaiting-prover', epochNumber: 10 }, + { uuid: 'uuid2', status: 'awaiting-checkpoints', epochNumber: 10 }, + { uuid: 'uuid3', status: 'awaiting-predecessor', epochNumber: 10 }, { uuid: 'uuid4', status: 'publishing-proof', epochNumber: 10 }, { uuid: 'uuid5', status: 'completed', epochNumber: 10 }, - { uuid: 'uuid6', status: 'failed', epochNumber: 10 }, + { uuid: 'uuid6', status: 'superseded', 
epochNumber: 10 }, + { uuid: 'uuid7', status: 'failed', epochNumber: 10 }, ]); } - startProof(epochNumber: number): Promise { + startProof(epochNumber: number): Promise { expect(typeof epochNumber).toBe('number'); - return Promise.resolve(); + return Promise.resolve(`job-${epochNumber}`); } } diff --git a/yarn-project/stdlib/src/interfaces/prover-node.ts b/yarn-project/stdlib/src/interfaces/prover-node.ts index f5c98d0a28aa..18f9a47a37db 100644 --- a/yarn-project/stdlib/src/interfaces/prover-node.ts +++ b/yarn-project/stdlib/src/interfaces/prover-node.ts @@ -6,24 +6,26 @@ import { type WorldStateSyncStatus, WorldStateSyncStatusSchema } from './world_s const EpochProvingJobState = [ 'initialized', - 'processing', - 'awaiting-prover', + 'awaiting-checkpoints', + 'awaiting-predecessor', 'publishing-proof', 'completed', + 'superseded', 'failed', 'stopped', + 'cancelled', 'timed-out', - 'reorg', ] as const; export type EpochProvingJobState = (typeof EpochProvingJobState)[number]; export const EpochProvingJobTerminalState: EpochProvingJobState[] = [ 'completed', + 'superseded', 'failed', 'stopped', + 'cancelled', 'timed-out', - 'reorg', ] as const; export type EpochProvingJobTerminalState = (typeof EpochProvingJobTerminalState)[number]; @@ -32,7 +34,12 @@ export type EpochProvingJobTerminalState = (typeof EpochProvingJobTerminalState) export interface ProverNodeApi { getJobs(): Promise<{ uuid: string; status: EpochProvingJobState; epochNumber: number }[]>; - startProof(epochNumber: number): Promise; + /** + * Schedules proving for the given epoch and returns the job id immediately, without waiting for + * the proof to complete (proving can take far longer than an HTTP request). Poll `getJobs()` to + * track the returned job's progress. 
+ */ + startProof(epochNumber: number): Promise; getL2Tips(): Promise; @@ -46,7 +53,7 @@ export const ProverNodeApiSchema: ApiSchemaFor = { output: z.array(z.object({ uuid: z.string(), status: z.enum(EpochProvingJobState), epochNumber: z.number() })), }), - startProof: z.function({ input: z.tuple([schemas.Integer]), output: z.void() }), + startProof: z.function({ input: z.tuple([schemas.Integer]), output: z.string() }), getL2Tips: z.function({ input: z.tuple([]), output: L2TipsSchema }), diff --git a/yarn-project/stdlib/src/interfaces/server.ts b/yarn-project/stdlib/src/interfaces/server.ts index 96b57aa4bdbb..40ad4c737111 100644 --- a/yarn-project/stdlib/src/interfaces/server.ts +++ b/yarn-project/stdlib/src/interfaces/server.ts @@ -9,7 +9,6 @@ export * from './checkpoint_response.js'; export * from './l1_publish_info.js'; export * from './block-builder.js'; export * from './configs.js'; -export * from './epoch-prover.js'; export * from './l2_logs_source.js'; export * from './merkle_tree_operations.js'; export * from './p2p-bootstrap.js'; diff --git a/yarn-project/telemetry-client/src/attributes.ts b/yarn-project/telemetry-client/src/attributes.ts index adf11f85dced..900b27f915a8 100644 --- a/yarn-project/telemetry-client/src/attributes.ts +++ b/yarn-project/telemetry-client/src/attributes.ts @@ -63,6 +63,8 @@ export const EPOCH_SIZE = 'aztec.epoch.size'; export const BLOCK_PROPOSER = 'aztec.block.proposer'; /** The epoch number */ export const EPOCH_NUMBER = 'aztec.epoch.number'; +/** Kind of an EpochSession: 'full' or 'partial'. 
*/ +export const EPOCH_SESSION_KIND = 'aztec.epoch_session.kind'; /** The tx hash */ export const TX_HASH = 'aztec.tx.hash'; /** Generic attribute representing whether the action was successful or not */ diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index 53feee910d90..ccdd7bd406f9 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -1232,12 +1232,6 @@ export const PROVER_NODE_BLOB_PROCESSING_LAST_DURATION: MetricDefinition = { unit: 'ms', valueType: ValueType.INT, }; -export const PROVER_NODE_CHONK_VERIFIER_LAST_DURATION: MetricDefinition = { - name: 'aztec.prover_node.chonk_verifier.last_duration', - description: 'Duration of chonk verifier enqueuing in epoch proving job', - unit: 'ms', - valueType: ValueType.INT, -}; export const PROVER_NODE_BLOCK_PROCESSING_DURATION: MetricDefinition = { name: 'aztec.prover_node.block_processing.duration', description: 'Duration of processing a single block in epoch proving job', @@ -1250,10 +1244,14 @@ export const PROVER_NODE_CHECKPOINT_PROCESSING_DURATION: MetricDefinition = { unit: 'ms', valueType: ValueType.INT, }; -export const PROVER_NODE_ALL_CHECKPOINTS_PROCESSING_LAST_DURATION: MetricDefinition = { - name: 'aztec.prover_node.all_checkpoints_processing.last_duration', - description: 'Duration of processing all checkpoints in epoch proving job', - unit: 'ms', +export const PROVER_NODE_ACTIVE_CHECKPOINTS: MetricDefinition = { + name: 'aztec.prover_node.active_checkpoints', + description: 'Current number of canonical CheckpointProvers in the store (i.e. 
checkpoints currently being proven)', + valueType: ValueType.INT, +}; +export const PROVER_NODE_ACTIVE_EPOCH_SESSIONS: MetricDefinition = { + name: 'aztec.prover_node.active_epoch_sessions', + description: 'Current number of live EpochSessions, broken down by kind (full|partial)', valueType: ValueType.INT, }; export const PROVER_NODE_REWARDS_TOTAL: MetricDefinition = {