Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions spartan/aztec-prover-stack/templates/agent-scaledobject.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{{- /*
KEDA ScaledObject for the prover agent Deployment.

Rendered only when agent.autoscaling.keda.enabled is true. Scaling is driven by
the summed aztec_proving_queue_size series for the release namespace:
  - with scalingBands: one Prometheus trigger per band. Each band's query
    evaluates to the band's `replicas` while the queue is strictly greater than
    the band's `queueSize`, and to 0 otherwise (a missing series counts as 0).
  - without scalingBands: a single trigger on the raw queue size.

Rendering fails early if the Prometheus server address is empty, instead of
producing a ScaledObject that cannot query anything.
*/ -}}
{{- if .Values.agent.autoscaling.keda.enabled }}
{{- $keda := .Values.agent.autoscaling.keda }}
{{- /* NOTE(review): name derivation is assumed to mirror the agent Deployment's naming - confirm. */}}
{{- $agentChartName := default "agent" .Values.agent.nameOverride }}
{{- $agentDefaultName := ternary .Release.Name (printf "%s-%s" .Release.Name $agentChartName) (contains $agentChartName .Release.Name) }}
{{- $agentName := default $agentDefaultName .Values.agent.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- $promAddress := required "agent.autoscaling.keda.prometheus.serverAddress must be set when agent.autoscaling.keda.enabled is true" $keda.prometheus.serverAddress }}
{{- $queueQuery := printf "sum(aztec_proving_queue_size{k8s_namespace_name=%q})" .Release.Namespace }}
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: {{ $agentName | quote }}
  labels:
    {{- include "chart.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    kind: Deployment
    name: {{ $agentName | quote }}
  pollingInterval: {{ $keda.pollingInterval }}
  cooldownPeriod: {{ $keda.cooldownPeriod }}
  minReplicaCount: {{ $keda.minReplicaCount }}
  maxReplicaCount: {{ $keda.maxReplicaCount }}
  triggers:
    {{- if $keda.scalingBands }}
    {{- range $band := $keda.scalingBands }}
    - type: prometheus
      metadata:
        serverAddress: {{ $promAddress | quote }}
        {{- /* Dots (e.g. from fractional values) are replaced so the metric name stays a plain identifier. */}}
        metricName: {{ printf "aztec_proving_queue_size_agents_%v_over_%v" $band.replicas $band.queueSize | replace "." "_" | quote }}
        query: {{ printf "((%s or vector(0)) > bool %v) * %v" $queueQuery $band.queueSize $band.replicas | quote }}
        threshold: "1"
        activationThreshold: "0"
    {{- end }}
    {{- else }}
    - type: prometheus
      metadata:
        serverAddress: {{ $promAddress | quote }}
        metricName: "aztec_proving_queue_size"
        query: {{ $queueQuery | quote }}
        threshold: "1"
        activationThreshold: "0"
    {{- end }}
{{- end }}
11 changes: 11 additions & 0 deletions spartan/aztec-prover-stack/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ agent:
nodeType: "prover-agent"
replicaCount: 1

# KEDA autoscaling for the prover agent, consumed by templates/agent-scaledobject.yaml.
autoscaling:
keda:
# When false, no ScaledObject is rendered.
enabled: false
# Copied to the ScaledObject's spec.pollingInterval / spec.cooldownPeriod.
pollingInterval: 30
cooldownPeriod: 300
# Copied to the ScaledObject's spec.minReplicaCount / spec.maxReplicaCount.
minReplicaCount: 0
maxReplicaCount: 1
# List of {queueSize, replicas} entries; one Prometheus trigger is rendered per entry.
# When empty, a single trigger on the raw proving queue size is rendered instead.
scalingBands: []
prometheus:
# Prometheus server address used as the triggers' serverAddress.
serverAddress: ""

persistence:
enabled: false

Expand Down
11 changes: 10 additions & 1 deletion spartan/environments/next-net.env
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,16 @@ VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000

PUBLISHERS_PER_PROVER=2
PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000
PROVER_REPLICAS=4
# Prover agent KEDA autoscaling is switched on for this network.
PROVER_AGENT_KEDA_ENABLED=true
# Placeholder value; setup_gcp_secrets.sh maps this variable to the
# "prometheus-internal-read-url" secret.
PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=REPLACE_WITH_GCP_SECRET
PROVER_AGENT_KEDA_MIN_REPLICAS=0
PROVER_AGENT_KEDA_MAX_REPLICAS=4
# Bands use HCL list-of-objects syntax: deploy_network.sh emits this value
# unquoted as `PROVER_AGENT_KEDA_SCALING_BANDS = ...`.
# A band scales to `replicas` when the total proving queue size is greater than `queueSize`.
PROVER_AGENT_KEDA_SCALING_BANDS='[
{
queueSize = 0
replicas = 4
}
]'

BOT_TRANSFERS_REPLICAS=1
BOT_TRANSFERS_TX_INTERVAL_SECONDS=250
Expand Down
14 changes: 14 additions & 0 deletions spartan/environments/testnet.env
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,17 @@ PROVER_FAILED_PROOF_STORE=gs://aztec-develop/testnet/failed-proofs
L1_TX_FAILED_STORE=gs://aztec-develop/testnet/failed-l1-txs
PROVER_REPLICAS=4
PROVER_RESOURCE_PROFILE="prod"
# Prover agent KEDA autoscaling settings; autoscaling is switched off for this network.
PROVER_AGENT_KEDA_ENABLED=false
PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=
PROVER_AGENT_KEDA_MIN_REPLICAS=0
PROVER_AGENT_KEDA_MAX_REPLICAS=8
# Bands use HCL list-of-objects syntax: deploy_network.sh emits this value
# unquoted as `PROVER_AGENT_KEDA_SCALING_BANDS = ...`.
# A band scales to `replicas` when the total proving queue size is greater than `queueSize`.
PROVER_AGENT_KEDA_SCALING_BANDS='[
{
queueSize = 0
replicas = 4
},
{
queueSize = 50
replicas = 8
}
]'
18 changes: 17 additions & 1 deletion spartan/scripts/deploy_network.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,15 @@ SEQ_ENFORCE_TIME_TABLE=${SEQ_ENFORCE_TIME_TABLE:-}
SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT=${SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT:-0}
PROVER_REPLICAS=${PROVER_REPLICAS:-4}
PROVER_ENABLED=${PROVER_ENABLED:-true}
# Prover agent KEDA autoscaling settings. Autoscaling is off unless explicitly
# enabled; the replica ceiling defaults to the static PROVER_REPLICAS count.
PROVER_AGENT_KEDA_ENABLED=${PROVER_AGENT_KEDA_ENABLED:-false}
PROVER_AGENT_KEDA_MIN_REPLICAS=${PROVER_AGENT_KEDA_MIN_REPLICAS:-0}
PROVER_AGENT_KEDA_MAX_REPLICAS=${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS}
PROVER_AGENT_KEDA_SCALING_BANDS=${PROVER_AGENT_KEDA_SCALING_BANDS:-[]}
PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-}

# KEDA cannot scale without a Prometheus endpoint to query, so fail before deploying.
if [[ "$PROVER_ENABLED" == "true" && "$PROVER_AGENT_KEDA_ENABLED" == "true" && -z "$PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" ]]; then
  die "PROVER_AGENT_KEDA_ENABLED=true requires PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS. Set it explicitly, for example via GCP secret replacement."
fi

# Upper bound on prover agent pods, used to size the prover publisher prefunding list.
# Mirrors local.prover_agent_replica_capacity in terraform/deploy-aztec-infra/main.tf:
#   prover disabled -> 0
#   KEDA enabled    -> PROVER_AGENT_KEDA_MAX_REPLICAS
#   KEDA disabled   -> PROVER_REPLICAS (a KEDA max configured for later use is ignored)
if [[ "$PROVER_ENABLED" != "true" ]]; then
  PROVER_AGENT_REPLICA_CAPACITY=0
elif [[ "$PROVER_AGENT_KEDA_ENABLED" == "true" ]]; then
  PROVER_AGENT_REPLICA_CAPACITY=$PROVER_AGENT_KEDA_MAX_REPLICAS
else
  PROVER_AGENT_REPLICA_CAPACITY=$PROVER_REPLICAS
fi
PROVER_AGENTS_PER_PROVER=${PROVER_AGENTS_PER_PROVER:-1}
R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-}
R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-}
Expand Down Expand Up @@ -253,7 +262,7 @@ if (( TOTAL_VALIDATOR_PUBLISHERS > 0 )); then
fi

# Add prover publishers to prefunding list
TOTAL_PROVER_PUBLISHERS=$((PROVER_REPLICAS * PUBLISHERS_PER_PROVER))
TOTAL_PROVER_PUBLISHERS=$((PROVER_AGENT_REPLICA_CAPACITY * PUBLISHERS_PER_PROVER))

if (( TOTAL_PROVER_PUBLISHERS > 0 )); then
PROVER_PUBLISHER_RANGE=$(seq "$PROVER_PUBLISHER_MNEMONIC_START_INDEX" $((PROVER_PUBLISHER_MNEMONIC_START_INDEX + TOTAL_PROVER_PUBLISHERS - 1)) | tr '\n' ',' | sed 's/,$//')
Expand Down Expand Up @@ -630,6 +639,13 @@ BOT_CROSS_CHAIN_L2_PRIVATE_KEY = "${BOT_CROSS_CHAIN_L2_PRIVATE_KEY:-0xcafe03}"

PROVER_AGENTS_PER_PROVER = ${PROVER_AGENTS_PER_PROVER}
PROVER_AGENT_POLL_INTERVAL_MS = ${PROVER_AGENT_POLL_INTERVAL_MS}
PROVER_AGENT_KEDA_ENABLED = ${PROVER_AGENT_KEDA_ENABLED:-false}
PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS = "${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-}"
PROVER_AGENT_KEDA_MIN_REPLICAS = ${PROVER_AGENT_KEDA_MIN_REPLICAS:-0}
PROVER_AGENT_KEDA_MAX_REPLICAS = ${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS}
PROVER_AGENT_KEDA_SCALING_BANDS = ${PROVER_AGENT_KEDA_SCALING_BANDS:-[]}
PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS = ${PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS:-30}
PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS = ${PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS:-300}

RPC_INGRESS_ENABLED = ${RPC_INGRESS_ENABLED}
RPC_INGRESS_HOSTS = ${RPC_INGRESS_HOSTS}
Expand Down
1 change: 1 addition & 0 deletions spartan/scripts/network_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ gcp_auth
# Second pass: source environment with GCP secret processing
source_network_env "$env_file"


# Optional: provision per-network IP + managed cert (+ DNS record in the delegated
# rpc.aztec-labs.com zone) via the network-frontend terraform module. The module's
# outputs are exported as env vars that deploy_network.sh already consumes.
Expand Down
1 change: 1 addition & 0 deletions spartan/scripts/setup_gcp_secrets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ declare -A SECRET_MAPPINGS=(
["FUNDING_PRIVATE_KEY"]="${L1_NETWORK}-funding-private-key"
["ROLLUP_DEPLOYMENT_PRIVATE_KEY"]="${L1_NETWORK}-labs-rollup-private-key"
["OTEL_COLLECTOR_ENDPOINT"]="otel-collector-url"
["PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS"]="prometheus-internal-read-url"
["ETHERSCAN_API_KEY"]="etherscan-api-key"
["LABS_INFRA_MNEMONIC"]="${MNEMONIC_SECRET}"
["STORE_SNAPSHOT_URL"]="r2-account-id"
Expand Down
19 changes: 17 additions & 2 deletions spartan/terraform/deploy-aztec-infra/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ module "web3signer" {
VALIDATOR_MNEMONIC_START_INDEX = tonumber(var.VALIDATOR_MNEMONIC_START_INDEX)
VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX = tonumber(var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX)
VALIDATOR_PUBLISHERS_PER_REPLICA = var.VALIDATOR_PUBLISHERS_PER_REPLICA
PROVER_COUNT = tonumber(var.PROVER_REPLICAS)
PROVER_COUNT = local.prover_agent_replica_capacity
PUBLISHERS_PER_PROVER = tonumber(var.PROVER_PUBLISHERS_PER_PROVER)
PROVER_PUBLISHER_MNEMONIC_START_INDEX = tonumber(var.PROVER_PUBLISHER_MNEMONIC_START_INDEX)

Expand Down Expand Up @@ -94,6 +94,8 @@ locals {
tag = split(":", var.VALIDATOR_HA_DOCKER_IMAGE)[1]
} : local.aztec_image

prover_agent_replica_capacity = var.PROVER_ENABLED ? (var.PROVER_AGENT_KEDA_ENABLED ? var.PROVER_AGENT_KEDA_MAX_REPLICAS : tonumber(var.PROVER_REPLICAS)) : 0

# Max node count: max of primary (VALIDATOR_REPLICAS) and HA pod counts
# Determines how many attester keystores and publisher key ranges to generate
effective_ha_count = var.VALIDATOR_HA_REPLICAS > 0 ? coalesce(var.VALIDATOR_HA_REPLICA_COUNT, tonumber(var.VALIDATOR_REPLICAS)) : 0
Expand Down Expand Up @@ -343,6 +345,19 @@ locals {
node = {
logLevel = var.LOG_LEVEL
}
autoscaling = {
keda = {
enabled = var.PROVER_AGENT_KEDA_ENABLED && var.PROVER_ENABLED
pollingInterval = var.PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS
cooldownPeriod = var.PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS
minReplicaCount = var.PROVER_AGENT_KEDA_MIN_REPLICAS
maxReplicaCount = var.PROVER_AGENT_KEDA_MAX_REPLICAS
scalingBands = var.PROVER_AGENT_KEDA_SCALING_BANDS
prometheus = {
serverAddress = var.PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS
}
}
}
}
})], local.is_kind ? [yamlencode({
agent = {
Expand Down Expand Up @@ -377,7 +392,7 @@ locals {
"agent.node.env.CRS_PATH" = "/usr/src/crs"
"agent.node.proverRealProofs" = var.PROVER_REAL_PROOFS
"agent.node.env.PROVER_AGENT_POLL_INTERVAL_MS" = var.PROVER_AGENT_POLL_INTERVAL_MS
"agent.replicaCount" = var.PROVER_REPLICAS
"agent.replicaCount" = var.PROVER_AGENT_KEDA_ENABLED ? "0" : var.PROVER_REPLICAS
"agent.node.env.BOOTSTRAP_NODES" = "asdf"
"agent.node.env.PROVER_AGENT_COUNT" = var.PROVER_AGENTS_PER_PROVER
"agent.node.env.PROVER_TEST_DELAY_TYPE" = var.PROVER_TEST_DELAY_TYPE
Expand Down
45 changes: 45 additions & 0 deletions spartan/terraform/deploy-aztec-infra/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,51 @@ variable "PROVER_AGENT_POLL_INTERVAL_MS" {
default = 1000
}

# Switch for KEDA-driven prover agent scaling. main.tf only turns on the chart's
# autoscaling.keda.enabled value when PROVER_ENABLED is also true, and sets
# agent.replicaCount to "0" while this flag is on.
variable "PROVER_AGENT_KEDA_ENABLED" {
description = "Whether KEDA should scale prover agent pods from proving queue depth"
type = bool
default = false
}

# Lower replica bound; passed to the chart as autoscaling.keda.minReplicaCount.
variable "PROVER_AGENT_KEDA_MIN_REPLICAS" {
  description = "Minimum prover agent pods managed by KEDA"
  type        = number
  default     = 0

  # Replica counts are whole, non-negative numbers; reject anything else at plan time.
  validation {
    condition     = var.PROVER_AGENT_KEDA_MIN_REPLICAS >= 0 && floor(var.PROVER_AGENT_KEDA_MIN_REPLICAS) == var.PROVER_AGENT_KEDA_MIN_REPLICAS
    error_message = "PROVER_AGENT_KEDA_MIN_REPLICAS must be a non-negative whole number."
  }
}

# Upper replica bound; passed to the chart as autoscaling.keda.maxReplicaCount and,
# when KEDA and the prover are both enabled, used as local.prover_agent_replica_capacity.
variable "PROVER_AGENT_KEDA_MAX_REPLICAS" {
  description = "Maximum prover agent pods managed by KEDA"
  type        = number
  default     = 1

  # Zero stays allowed because deploy_network.sh defaults this to PROVER_REPLICAS.
  validation {
    condition     = var.PROVER_AGENT_KEDA_MAX_REPLICAS >= 0 && floor(var.PROVER_AGENT_KEDA_MAX_REPLICAS) == var.PROVER_AGENT_KEDA_MAX_REPLICAS
    error_message = "PROVER_AGENT_KEDA_MAX_REPLICAS must be a non-negative whole number."
  }
}

# Step-scaling bands; passed to the chart as autoscaling.keda.scalingBands.
# An empty list is valid and leaves the chart on its single-trigger fallback.
variable "PROVER_AGENT_KEDA_SCALING_BANDS" {
  description = "Step scaling bands for prover agents. Each band scales to replicas when total proving queue size is greater than queueSize."
  type = list(object({
    queueSize = number
    replicas  = number
  }))
  default = []

  # A queue size cannot be negative, and replicas must be a whole, non-negative pod count.
  validation {
    condition = alltrue([
      for band in var.PROVER_AGENT_KEDA_SCALING_BANDS :
      band.queueSize >= 0 && band.replicas >= 0 && floor(band.replicas) == band.replicas
    ])
    error_message = "Each scaling band needs a non-negative queueSize and a non-negative whole-number replicas value."
  }
}

# Passed to the chart as autoscaling.keda.prometheus.serverAddress.
# deploy_network.sh refuses to deploy when the prover and KEDA are both enabled
# and this value is empty.
variable "PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" {
description = "Prometheus server URL queried by KEDA for prover queue depth"
type = string
default = ""
}

# Passed to the chart as autoscaling.keda.pollingInterval.
variable "PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS" {
  description = "KEDA polling interval for prover agent queue-depth scaling"
  type        = number
  default     = 30

  # An interval of zero or less is not a meaningful polling period.
  validation {
    condition     = var.PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS > 0
    error_message = "PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS must be greater than zero."
  }
}

# Passed to the chart as autoscaling.keda.cooldownPeriod.
variable "PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS" {
  description = "KEDA cooldown period before scaling prover agents back down"
  type        = number
  default     = 300

  # A negative wait time is not meaningful; zero stays allowed.
  validation {
    condition     = var.PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS >= 0
    error_message = "PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS must not be negative."
  }
}

variable "PROVER_AGENT_INCLUDE_METRICS" {
description = "Metrics whitelist in the prover agent"
type = string
Expand Down
32 changes: 32 additions & 0 deletions spartan/terraform/deploy-metrics/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ data "terraform_remote_state" "ssl" {
}
}

# Looks up the existing subnetwork named "default" in var.region; its id is the
# subnetwork for the internal Prometheus address (google_compute_address.prometheus_ip).
data "google_compute_subnetwork" "default" {
name = "default"
region = var.region
}

resource "google_compute_address" "grafana_ip" {
provider = google
name = "grafana-ip"
Expand All @@ -51,6 +56,18 @@ resource "google_compute_address" "otel_collector_ip" {
}
}

# Reserved INTERNAL address in the "default" subnetwork. The helm release below
# assigns it to the Prometheus server Service via prometheus.server.service.loadBalancerIP.
resource "google_compute_address" "prometheus_ip" {
provider = google
name = "prometheus-ip"
address_type = "INTERNAL"
region = var.region
subnetwork = data.google_compute_subnetwork.default.id

# Terraform will refuse any plan that would destroy this address.
lifecycle {
prevent_destroy = true
}
}

provider "kubernetes" {
alias = "gke-cluster"
config_path = "~/.kube/config"
Expand Down Expand Up @@ -209,6 +226,21 @@ resource "helm_release" "aztec-gke-cluster" {
value = google_compute_address.otel_collector_ip.address
}

# Expose the Prometheus server Service as a LoadBalancer.
set {
name = "prometheus.server.service.type"
value = "LoadBalancer"
}

# Sets the Service annotation networking.gke.io/load-balancer-type to "Internal".
# The backslashes escape the dots and slash in the annotation key for the helm set path.
set {
name = "prometheus.server.service.annotations.networking\\.gke\\.io\\/load-balancer-type"
value = "Internal"
}

# Pin the load balancer to the reserved address from google_compute_address.prometheus_ip.
set {
name = "prometheus.server.service.loadBalancerIP"
value = google_compute_address.prometheus_ip.address
}

set {
name = "prometheus.serverFiles.prometheus\\.yml.scrape_configs[0].job_name"
value = "prometheus"
Expand Down
Loading
Loading