diff --git a/docs/docs-operate/operators/setup/high-availability.md b/docs/docs-operate/operators/setup/high-availability.md index 359e004aa117..12ce598991e8 100644 --- a/docs/docs-operate/operators/setup/high-availability.md +++ b/docs/docs-operate/operators/setup/high-availability.md @@ -419,13 +419,15 @@ export VALIDATOR_HA_SIGNING_TIMEOUT_MS=3000 # Default: 3000ms **Optional Tuning Variables:** -| Variable | Description | Default | -| -------------------------------------- | -------------------------------- | ------------------ | -| `VALIDATOR_HA_POLLING_INTERVAL_MS` | How often to check duty status | `100` | -| `VALIDATOR_HA_SIGNING_TIMEOUT_MS` | Max wait for in-progress signing | `3000` | -| `VALIDATOR_HA_MAX_STUCK_DUTIES_AGE_MS` | Max age before cleanup | `2 * slotDuration` | -| `VALIDATOR_HA_POOL_MAX` | Max database connections | `10` | -| `VALIDATOR_HA_POOL_MIN` | Min database connections | `0` | +| Variable | Description | Default | +| -------------------------------------- | ------------------------------------------------ | ------------------ | +| `VALIDATOR_HA_POLLING_INTERVAL_MS` | How often to check duty status | `100` | +| `VALIDATOR_HA_SIGNING_TIMEOUT_MS` | Max wait for in-progress signing | `3000` | +| `VALIDATOR_HA_MAX_STUCK_DUTIES_AGE_MS` | Max age before cleanup | `2 * slotDuration` | +| `VALIDATOR_HA_POOL_MAX` | Max database connections | `10` | +| `VALIDATOR_HA_POOL_MIN` | Min database connections | `0` | +| `VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H` | Clean up old signed duties after this many hours | N/A | + When `VALIDATOR_HA_SIGNING_ENABLED=true`, the validator client automatically: diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh index d6eb3b541b95..9304a94d384f 100755 --- a/spartan/scripts/deploy_network.sh +++ b/spartan/scripts/deploy_network.sh @@ -90,6 +90,7 @@ VALIDATOR_REPLICAS=${VALIDATOR_REPLICAS:-4} VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=${VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX:-5000} VALIDATOR_PUBLISHERS_PER_REPLICA=${VALIDATOR_PUBLISHERS_PER_REPLICA:-4} VALIDATOR_HA_REPLICAS=${VALIDATOR_HA_REPLICAS:-0} +VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H=${VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H:-24} PROVER_PUBLISHER_MNEMONIC_START_INDEX=${PROVER_PUBLISHER_MNEMONIC_START_INDEX:-8000} PUBLISHERS_PER_PROVER=${PUBLISHERS_PER_PROVER:-1} PROVER_REAL_PROOFS=${REAL_VERIFIER:-true} @@ -530,6 +531,7 @@ VALIDATORS_PER_NODE = ${VALIDATORS_PER_NODE} VALIDATOR_REPLICAS = ${VALIDATOR_REPLICAS} VALIDATOR_PUBLISHERS_PER_REPLICA = ${VALIDATOR_PUBLISHERS_PER_REPLICA} VALIDATOR_HA_REPLICAS = ${VALIDATOR_HA_REPLICAS} +VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H = ${VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H} SEQ_MIN_TX_PER_BLOCK = ${SEQ_MIN_TX_PER_BLOCK} SEQ_MAX_TX_PER_BLOCK = ${SEQ_MAX_TX_PER_BLOCK} SEQ_MAX_TX_PER_CHECKPOINT = ${SEQ_MAX_TX_PER_CHECKPOINT} diff --git a/spartan/terraform/deploy-aztec-infra/main.tf b/spartan/terraform/deploy-aztec-infra/main.tf index 3bcdd28ca4a5..0d8ac042099b 100644 --- a/spartan/terraform/deploy-aztec-infra/main.tf +++ b/spartan/terraform/deploy-aztec-infra/main.tf @@ -180,65 +180,65 @@ locals { } validator_common_settings = { - "validator.service.p2p.nodePortEnabled" = var.P2P_NODEPORT_ENABLED - "validator.web3signerUrl" = "http://${var.RELEASE_PREFIX}-signer-web3signer.${var.NAMESPACE}.svc.cluster.local:9000/" - "validator.mnemonic" = var.VALIDATOR_MNEMONIC - "validator.mnemonicStartIndex" = var.VALIDATOR_MNEMONIC_START_INDEX - "validator.validatorsPerNode" = var.VALIDATORS_PER_NODE - "validator.publishersPerReplica" = var.VALIDATOR_PUBLISHERS_PER_REPLICA - "validator.publisherMnemonicStartIndex" = var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX - "validator.replicaCount" = var.VALIDATOR_REPLICAS - "validator.sentinel.enabled" = var.SENTINEL_ENABLED - "validator.slash.minPenaltyPercentage" = var.SLASH_MIN_PENALTY_PERCENTAGE - "validator.slash.maxPenaltyPercentage" = var.SLASH_MAX_PENALTY_PERCENTAGE - "validator.slash.inactivityTargetPercentage" = var.SLASH_INACTIVITY_TARGET_PERCENTAGE - "validator.slash.inactivityPenalty" = var.SLASH_INACTIVITY_PENALTY - "validator.slash.prunePenalty" = var.SLASH_PRUNE_PENALTY - "validator.slash.dataWithholdingPenalty" = var.SLASH_DATA_WITHHOLDING_PENALTY - "validator.slash.proposeInvalidAttestationsPenalty" = var.SLASH_PROPOSE_INVALID_ATTESTATIONS_PENALTY - "validator.slash.duplicateProposalPenalty" = var.SLASH_DUPLICATE_PROPOSAL_PENALTY - "validator.slash.duplicateAttestationPenalty" = var.SLASH_DUPLICATE_ATTESTATION_PENALTY - "validator.slash.attestDescendantOfInvalidPenalty" = var.SLASH_ATTEST_DESCENDANT_OF_INVALID_PENALTY - "validator.slash.unknownPenalty" = var.SLASH_UNKNOWN_PENALTY - "validator.slash.invalidBlockPenalty" = var.SLASH_INVALID_BLOCK_PENALTY - "validator.slash.offenseExpirationRounds" = var.SLASH_OFFENSE_EXPIRATION_ROUNDS - "validator.slash.maxPayloadSize" = var.SLASH_MAX_PAYLOAD_SIZE - "validator.node.env.TRANSACTIONS_DISABLED" = var.TRANSACTIONS_DISABLED - "validator.node.env.DEBUG_FORCE_TX_PROOF_VERIFICATION" = var.DEBUG_FORCE_TX_PROOF_VERIFICATION - "validator.node.env.KEY_INDEX_START" = var.VALIDATOR_MNEMONIC_START_INDEX - "validator.node.env.PUBLISHER_KEY_INDEX_START" = var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX - "validator.node.env.VALIDATORS_PER_NODE" = var.VALIDATORS_PER_NODE - "validator.node.env.VALIDATOR_PUBLISHERS_PER_REPLICA" = var.VALIDATOR_PUBLISHERS_PER_REPLICA - "validator.node.proverRealProofs" = var.PROVER_REAL_PROOFS - "validator.node.env.SEQ_MIN_TX_PER_BLOCK" = var.SEQ_MIN_TX_PER_BLOCK - "validator.node.env.SEQ_MAX_TX_PER_BLOCK" = var.SEQ_MAX_TX_PER_BLOCK - "validator.node.env.SEQ_MAX_TX_PER_CHECKPOINT" = var.SEQ_MAX_TX_PER_CHECKPOINT - "validator.node.env.SEQ_PER_BLOCK_ALLOCATION_MULTIPLIER" = var.SEQ_PER_BLOCK_ALLOCATION_MULTIPLIER - "validator.node.env.SEQ_BLOCK_DURATION_MS" = var.SEQ_BLOCK_DURATION_MS + "validator.service.p2p.nodePortEnabled" = var.P2P_NODEPORT_ENABLED + "validator.web3signerUrl" = "http://${var.RELEASE_PREFIX}-signer-web3signer.${var.NAMESPACE}.svc.cluster.local:9000/" + "validator.mnemonic" = var.VALIDATOR_MNEMONIC + "validator.mnemonicStartIndex" = var.VALIDATOR_MNEMONIC_START_INDEX + "validator.validatorsPerNode" = var.VALIDATORS_PER_NODE + "validator.publishersPerReplica" = var.VALIDATOR_PUBLISHERS_PER_REPLICA + "validator.publisherMnemonicStartIndex" = var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX + "validator.replicaCount" = var.VALIDATOR_REPLICAS + "validator.sentinel.enabled" = var.SENTINEL_ENABLED + "validator.slash.minPenaltyPercentage" = var.SLASH_MIN_PENALTY_PERCENTAGE + "validator.slash.maxPenaltyPercentage" = var.SLASH_MAX_PENALTY_PERCENTAGE + "validator.slash.inactivityTargetPercentage" = var.SLASH_INACTIVITY_TARGET_PERCENTAGE + "validator.slash.inactivityPenalty" = var.SLASH_INACTIVITY_PENALTY + "validator.slash.prunePenalty" = var.SLASH_PRUNE_PENALTY + "validator.slash.dataWithholdingPenalty" = var.SLASH_DATA_WITHHOLDING_PENALTY + "validator.slash.proposeInvalidAttestationsPenalty" = var.SLASH_PROPOSE_INVALID_ATTESTATIONS_PENALTY + "validator.slash.duplicateProposalPenalty" = var.SLASH_DUPLICATE_PROPOSAL_PENALTY + "validator.slash.duplicateAttestationPenalty" = var.SLASH_DUPLICATE_ATTESTATION_PENALTY + "validator.slash.attestDescendantOfInvalidPenalty" = var.SLASH_ATTEST_DESCENDANT_OF_INVALID_PENALTY + "validator.slash.unknownPenalty" = var.SLASH_UNKNOWN_PENALTY + "validator.slash.invalidBlockPenalty" = var.SLASH_INVALID_BLOCK_PENALTY + "validator.slash.offenseExpirationRounds" = var.SLASH_OFFENSE_EXPIRATION_ROUNDS + "validator.slash.maxPayloadSize" = var.SLASH_MAX_PAYLOAD_SIZE + "validator.node.env.TRANSACTIONS_DISABLED" = var.TRANSACTIONS_DISABLED + "validator.node.env.DEBUG_FORCE_TX_PROOF_VERIFICATION" = var.DEBUG_FORCE_TX_PROOF_VERIFICATION + "validator.node.env.KEY_INDEX_START" = var.VALIDATOR_MNEMONIC_START_INDEX + "validator.node.env.PUBLISHER_KEY_INDEX_START" = var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX + "validator.node.env.VALIDATORS_PER_NODE" = var.VALIDATORS_PER_NODE + "validator.node.env.VALIDATOR_PUBLISHERS_PER_REPLICA" = var.VALIDATOR_PUBLISHERS_PER_REPLICA + "validator.node.proverRealProofs" = var.PROVER_REAL_PROOFS + "validator.node.env.SEQ_MIN_TX_PER_BLOCK" = var.SEQ_MIN_TX_PER_BLOCK + "validator.node.env.SEQ_MAX_TX_PER_BLOCK" = var.SEQ_MAX_TX_PER_BLOCK + "validator.node.env.SEQ_MAX_TX_PER_CHECKPOINT" = var.SEQ_MAX_TX_PER_CHECKPOINT + "validator.node.env.SEQ_PER_BLOCK_ALLOCATION_MULTIPLIER" = var.SEQ_PER_BLOCK_ALLOCATION_MULTIPLIER + "validator.node.env.SEQ_BLOCK_DURATION_MS" = var.SEQ_BLOCK_DURATION_MS "validator.node.env.SEQ_L1_PUBLISHING_TIME_ALLOWANCE_IN_SLOT" = var.SEQ_L1_PUBLISHING_TIME_ALLOWANCE_IN_SLOT - "validator.node.env.SEQ_BUILD_CHECKPOINT_IF_EMPTY" = var.SEQ_BUILD_CHECKPOINT_IF_EMPTY - "validator.node.env.SEQ_ENFORCE_TIME_TABLE" = var.SEQ_ENFORCE_TIME_TABLE - "validator.node.env.P2P_TX_POOL_DELETE_TXS_AFTER_REORG" = var.P2P_TX_POOL_DELETE_TXS_AFTER_REORG - "validator.node.env.L1_PRIORITY_FEE_BUMP_PERCENTAGE" = var.VALIDATOR_L1_PRIORITY_FEE_BUMP_PERCENTAGE - "validator.node.env.L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE" = var.VALIDATOR_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE - "validator.node.env.BLOB_ALLOW_EMPTY_SOURCES" = var.BLOB_ALLOW_EMPTY_SOURCES - "validator.node.env.P2P_MAX_TX_POOL_SIZE" = var.P2P_MAX_TX_POOL_SIZE - "validator.node.env.PROVER_TEST_VERIFICATION_DELAY_MS" = var.PROVER_TEST_VERIFICATION_DELAY_MS - "validator.node.env.BB_CHONK_VERIFY_MAX_BATCH" = var.BB_CHONK_VERIFY_MAX_BATCH - "validator.node.env.BB_CHONK_VERIFY_BATCH_CONCURRENCY" = var.BB_CHONK_VERIFY_BATCH_CONCURRENCY - "validator.node.env.DEBUG_P2P_INSTRUMENT_MESSAGES" = var.DEBUG_P2P_INSTRUMENT_MESSAGES - "validator.node.secret.envEnabled" = true - "validator.node.secret.mnemonic" = var.VALIDATOR_MNEMONIC - "validator.node.secret.mnemonicIndex" = var.VALIDATOR_MNEMONIC_START_INDEX - "validator.node.env.P2P_GOSSIPSUB_D" = var.P2P_GOSSIPSUB_D - "validator.node.env.P2P_GOSSIPSUB_DLO" = var.P2P_GOSSIPSUB_DLO - "validator.node.env.P2P_GOSSIPSUB_DHI" = var.P2P_GOSSIPSUB_DHI - "validator.node.env.P2P_DROP_TX_CHANCE" = var.P2P_DROP_TX_CHANCE - "validator.node.env.WS_NUM_HISTORIC_CHECKPOINTS" = var.WS_NUM_HISTORIC_CHECKPOINTS - "validator.node.env.TX_COLLECTION_FILE_STORE_URLS" = var.TX_COLLECTION_FILE_STORE_URLS - "validator.node.env.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT" = var.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT - "validator.node.env.L1_TX_FAILED_STORE" = var.L1_TX_FAILED_STORE - "validator.node.adminApiKeyHash" = var.ADMIN_API_KEY_HASH + "validator.node.env.SEQ_BUILD_CHECKPOINT_IF_EMPTY" = var.SEQ_BUILD_CHECKPOINT_IF_EMPTY + "validator.node.env.SEQ_ENFORCE_TIME_TABLE" = var.SEQ_ENFORCE_TIME_TABLE + "validator.node.env.P2P_TX_POOL_DELETE_TXS_AFTER_REORG" = var.P2P_TX_POOL_DELETE_TXS_AFTER_REORG + "validator.node.env.L1_PRIORITY_FEE_BUMP_PERCENTAGE" = var.VALIDATOR_L1_PRIORITY_FEE_BUMP_PERCENTAGE + "validator.node.env.L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE" = var.VALIDATOR_L1_PRIORITY_FEE_RETRY_BUMP_PERCENTAGE + "validator.node.env.BLOB_ALLOW_EMPTY_SOURCES" = var.BLOB_ALLOW_EMPTY_SOURCES + "validator.node.env.P2P_MAX_TX_POOL_SIZE" = var.P2P_MAX_TX_POOL_SIZE + "validator.node.env.PROVER_TEST_VERIFICATION_DELAY_MS" = var.PROVER_TEST_VERIFICATION_DELAY_MS + "validator.node.env.BB_CHONK_VERIFY_MAX_BATCH" = var.BB_CHONK_VERIFY_MAX_BATCH + "validator.node.env.BB_CHONK_VERIFY_BATCH_CONCURRENCY" = var.BB_CHONK_VERIFY_BATCH_CONCURRENCY + "validator.node.env.DEBUG_P2P_INSTRUMENT_MESSAGES" = var.DEBUG_P2P_INSTRUMENT_MESSAGES + "validator.node.secret.envEnabled" = true + "validator.node.secret.mnemonic" = var.VALIDATOR_MNEMONIC + "validator.node.secret.mnemonicIndex" = var.VALIDATOR_MNEMONIC_START_INDEX + "validator.node.env.P2P_GOSSIPSUB_D" = var.P2P_GOSSIPSUB_D + "validator.node.env.P2P_GOSSIPSUB_DLO" = var.P2P_GOSSIPSUB_DLO + "validator.node.env.P2P_GOSSIPSUB_DHI" = var.P2P_GOSSIPSUB_DHI + "validator.node.env.P2P_DROP_TX_CHANCE" = var.P2P_DROP_TX_CHANCE + "validator.node.env.WS_NUM_HISTORIC_CHECKPOINTS" = var.WS_NUM_HISTORIC_CHECKPOINTS + "validator.node.env.TX_COLLECTION_FILE_STORE_URLS" = var.TX_COLLECTION_FILE_STORE_URLS + "validator.node.env.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT" = var.SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT + "validator.node.env.L1_TX_FAILED_STORE" = var.L1_TX_FAILED_STORE + "validator.node.adminApiKeyHash" = var.ADMIN_API_KEY_HASH } # Note: nonsensitive() is required here because helm_releases is used in for_each, @@ -249,7 +249,8 @@ locals { "validator.node.env.VALIDATOR_HA_DATABASE_URL" = nonsensitive(module.validator_ha_postgres[0].database_url) # Limit pool size per pod to avoid exhausting PostgreSQL connections # With 12 pods × 5 max = 60 connections (well under PostgreSQL's 500 max) - "validator.node.env.VALIDATOR_HA_POOL_MAX" = "5" + "validator.node.env.VALIDATOR_HA_POOL_MAX" = "5" + "validator.node.env.VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H" = tostring(var.VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H) } : {} # Generate validator releases: primary (idx=0) plus N HA replicas (idx=1..N) diff --git a/spartan/terraform/deploy-aztec-infra/variables.tf b/spartan/terraform/deploy-aztec-infra/variables.tf index efae777c52fb..ec754d70839c 100644 --- a/spartan/terraform/deploy-aztec-infra/variables.tf +++ b/spartan/terraform/deploy-aztec-infra/variables.tf @@ -241,6 +241,12 @@ variable "VALIDATOR_HA_REPLICAS" { default = 0 } +variable "VALIDATOR_HA_OLD_DUTIES_MAX_AGE_H" { + description = "Clean up old signed HA duties after this many hours (prevents unbounded DB growth)" + type = number + default = 24 +} + variable "ADMIN_API_KEY_HASH" { description = "SHA-256 hex hash of the admin API key. When set, enables admin API authentication on validator nodes. Leave empty to disable admin auth (default)." type = string