From 98effd2e65f9f17147e38ed5ac67a728f9cd2269 Mon Sep 17 00:00:00 2001
From: Alex Gherghisan <alexghr@users.noreply.github.com>
Date: Thu, 28 May 2026 15:01:07 +0000
Subject: [PATCH] chore: add KEDA prover agent autoscaling

---
 .../templates/agent-scaledobject.yaml         |  40 ++++
 spartan/aztec-prover-stack/values.yaml        |  11 ++
 spartan/environments/next-net.env             |  11 +-
 spartan/environments/testnet.env              |  14 ++
 spartan/scripts/deploy_network.sh             |  18 +-
 spartan/scripts/network_deploy.sh             |   1 +
 spartan/scripts/setup_gcp_secrets.sh          |   1 +
 spartan/terraform/deploy-aztec-infra/main.tf  |  19 +-
 .../terraform/deploy-aztec-infra/variables.tf |  45 +++++
 spartan/terraform/deploy-metrics/main.tf      |  32 +++
 spartan/terraform/deploy-telemetry/main.tf    | 186 ------------------
 spartan/terraform/deploy-telemetry/outputs.tf |   9 -
 .../values/public-otel-collector.yaml         | 155 ---------------
 .../values/public-prometheus.yaml             |  39 ----
 .../terraform/deploy-telemetry/variables.tf   |  27 ---
 15 files changed, 188 insertions(+), 420 deletions(-)
 create mode 100644 spartan/aztec-prover-stack/templates/agent-scaledobject.yaml
 delete mode 100644 spartan/terraform/deploy-telemetry/main.tf
 delete mode 100644 spartan/terraform/deploy-telemetry/outputs.tf
 delete mode 100644 spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml
 delete mode 100644 spartan/terraform/deploy-telemetry/values/public-prometheus.yaml
 delete mode 100644 spartan/terraform/deploy-telemetry/variables.tf

diff --git a/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml b/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml
new file mode 100644
index 000000000000..98f55a7cf5b4
--- /dev/null
+++ b/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml
@@ -0,0 +1,40 @@
+{{- if .Values.agent.autoscaling.keda.enabled }}
+{{- $agentChartName := default "agent" .Values.agent.nameOverride }}
+{{- $agentDefaultName := ternary .Release.Name (printf "%s-%s" .Release.Name $agentChartName) (contains $agentChartName .Release.Name) }}
+{{- $agentName := default $agentDefaultName .Values.agent.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- $queueQuery := printf "sum(aztec_proving_queue_size{k8s_namespace_name=%q})" .Release.Namespace }}
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject  # scales the prover agent Deployment from the proving queue size summed over the release namespace
+metadata:
+  name: {{ $agentName }}
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    kind: Deployment
+    name: {{ $agentName }}  # NOTE(review): assumes this matches the name the agent Deployment is rendered with - confirm
+  pollingInterval: {{ .Values.agent.autoscaling.keda.pollingInterval }}
+  cooldownPeriod: {{ .Values.agent.autoscaling.keda.cooldownPeriod }}
+  minReplicaCount: {{ .Values.agent.autoscaling.keda.minReplicaCount }}
+  maxReplicaCount: {{ .Values.agent.autoscaling.keda.maxReplicaCount }}
+  triggers:
+    {{- if .Values.agent.autoscaling.keda.scalingBands }}
+    {{- range $band := .Values.agent.autoscaling.keda.scalingBands }}
+    - type: prometheus  # one trigger per band: query yields band.replicas while queue size > band.queueSize, else 0
+      metadata:
+        serverAddress: {{ required "agent.autoscaling.keda.prometheus.serverAddress must be set when KEDA autoscaling is enabled" $.Values.agent.autoscaling.keda.prometheus.serverAddress | quote }}
+        metricName: {{ printf "aztec_proving_queue_size_agents_%v_over_%v" $band.replicas $band.queueSize | replace "." "_" | quote }}
+        query: {{ printf "((%s or vector(0)) > bool %v) * %v" $queueQuery $band.queueSize $band.replicas | quote }}
+        threshold: "1"  # the query result is already expressed as a replica count
+        activationThreshold: "0"
+    {{- end }}
+    {{- else }}
+    - type: prometheus  # no bands configured: single trigger on the raw queue size
+      metadata:
+        serverAddress: {{ required "agent.autoscaling.keda.prometheus.serverAddress must be set when KEDA autoscaling is enabled" .Values.agent.autoscaling.keda.prometheus.serverAddress | quote }}
+        metricName: "aztec_proving_queue_size"
+        query: {{ $queueQuery | quote }}
+        threshold: "1"
+        activationThreshold: "0"
+    {{- end }}
+{{- end }}
diff --git a/spartan/aztec-prover-stack/values.yaml b/spartan/aztec-prover-stack/values.yaml
index 2e8a433e5eb0..ae014973bcd7 100644
--- a/spartan/aztec-prover-stack/values.yaml
+++ b/spartan/aztec-prover-stack/values.yaml
@@ -84,6 +84,17 @@ agent:
   nodeType: "prover-agent"
   replicaCount: 1
 
+  autoscaling:
+    keda:
+      enabled: false  # when true, the chart renders a KEDA ScaledObject targeting the agent Deployment
+      pollingInterval: 30  # seconds
+      cooldownPeriod: 300  # seconds
+      minReplicaCount: 0
+      maxReplicaCount: 1
+      scalingBands: []  # optional list of {queueSize, replicas} steps; empty falls back to one trigger on the raw queue size
+      prometheus:
+        serverAddress: ""  # Prometheus URL the ScaledObject triggers query for aztec_proving_queue_size
+
   persistence:
     enabled: false
 
diff --git a/spartan/environments/next-net.env b/spartan/environments/next-net.env
index 7e24b13ef493..efd92c9f1d64 100644
--- a/spartan/environments/next-net.env
+++ b/spartan/environments/next-net.env
@@ -51,7 +51,16 @@ VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000
 
 PUBLISHERS_PER_PROVER=2
 PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000
-PROVER_REPLICAS=4
+PROVER_AGENT_KEDA_ENABLED=true
+PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=REPLACE_WITH_GCP_SECRET
+PROVER_AGENT_KEDA_MIN_REPLICAS=0
+PROVER_AGENT_KEDA_MAX_REPLICAS=4
+PROVER_AGENT_KEDA_SCALING_BANDS='[
+  {
+    queueSize = 0
+    replicas = 4
+  }
+]'
 
 BOT_TRANSFERS_REPLICAS=1
 BOT_TRANSFERS_TX_INTERVAL_SECONDS=250
diff --git a/spartan/environments/testnet.env b/spartan/environments/testnet.env
index 1097fe11f818..8f0e5c30e7ac 100644
--- a/spartan/environments/testnet.env
+++ b/spartan/environments/testnet.env
@@ -89,3 +89,17 @@ PROVER_FAILED_PROOF_STORE=gs://aztec-develop/testnet/failed-proofs
 L1_TX_FAILED_STORE=gs://aztec-develop/testnet/failed-l1-txs
 PROVER_REPLICAS=4
 PROVER_RESOURCE_PROFILE="prod"
+PROVER_AGENT_KEDA_ENABLED=false
+PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=
+PROVER_AGENT_KEDA_MIN_REPLICAS=0
+PROVER_AGENT_KEDA_MAX_REPLICAS=8
+PROVER_AGENT_KEDA_SCALING_BANDS='[
+  {
+    queueSize = 0
+    replicas = 4
+  },
+  {
+    queueSize = 50
+    replicas = 8
+  }
+]'
diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh
index 26a2da1623d4..bc7e9e346cdd 100755
--- a/spartan/scripts/deploy_network.sh
+++ b/spartan/scripts/deploy_network.sh
@@ -132,6 +132,15 @@ SEQ_ENFORCE_TIME_TABLE=${SEQ_ENFORCE_TIME_TABLE:-}
 SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT=${SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT:-0}
 PROVER_REPLICAS=${PROVER_REPLICAS:-4}
 PROVER_ENABLED=${PROVER_ENABLED:-true}
+PROVER_AGENT_KEDA_ENABLED=${PROVER_AGENT_KEDA_ENABLED:-false}
+PROVER_AGENT_KEDA_MIN_REPLICAS=${PROVER_AGENT_KEDA_MIN_REPLICAS:-0}
+PROVER_AGENT_KEDA_MAX_REPLICAS=${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS} # ceiling defaults to the static replica count
+PROVER_AGENT_KEDA_SCALING_BANDS=${PROVER_AGENT_KEDA_SCALING_BANDS:-[]}
+PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-}
+if [[ "$PROVER_ENABLED" == "true" && "$PROVER_AGENT_KEDA_ENABLED" == "true" && -z "$PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" ]]; then
+  die "PROVER_AGENT_KEDA_ENABLED=true requires PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS. Set it explicitly, for example via GCP secret replacement."
+fi
+PROVER_AGENT_REPLICA_CAPACITY=$(if [[ "$PROVER_ENABLED" != "true" ]]; then echo 0; elif [[ "$PROVER_AGENT_KEDA_ENABLED" == "true" ]]; then echo "$PROVER_AGENT_KEDA_MAX_REPLICAS"; else echo "$PROVER_REPLICAS"; fi) # mirrors local.prover_agent_replica_capacity in deploy-aztec-infra/main.tf: the KEDA ceiling only counts when KEDA is enabled
 PROVER_AGENTS_PER_PROVER=${PROVER_AGENTS_PER_PROVER:-1}
 R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-}
 R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-}
@@ -253,7 +262,7 @@ if (( TOTAL_VALIDATOR_PUBLISHERS > 0 )); then
 fi
 
 # Add prover publishers to prefunding list
-TOTAL_PROVER_PUBLISHERS=$((PROVER_REPLICAS * PUBLISHERS_PER_PROVER))
+TOTAL_PROVER_PUBLISHERS=$((PROVER_AGENT_REPLICA_CAPACITY * PUBLISHERS_PER_PROVER))
 
 if (( TOTAL_PROVER_PUBLISHERS > 0 )); then
   PROVER_PUBLISHER_RANGE=$(seq "$PROVER_PUBLISHER_MNEMONIC_START_INDEX" $((PROVER_PUBLISHER_MNEMONIC_START_INDEX + TOTAL_PROVER_PUBLISHERS - 1)) | tr '\n' ',' | sed 's/,$//')
@@ -630,6 +639,13 @@ BOT_CROSS_CHAIN_L2_PRIVATE_KEY = "${BOT_CROSS_CHAIN_L2_PRIVATE_KEY:-0xcafe03}"
 
 PROVER_AGENTS_PER_PROVER = ${PROVER_AGENTS_PER_PROVER}
 PROVER_AGENT_POLL_INTERVAL_MS = ${PROVER_AGENT_POLL_INTERVAL_MS}
+PROVER_AGENT_KEDA_ENABLED = ${PROVER_AGENT_KEDA_ENABLED:-false}
+PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS = "${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-}"
+PROVER_AGENT_KEDA_MIN_REPLICAS = ${PROVER_AGENT_KEDA_MIN_REPLICAS:-0}
+PROVER_AGENT_KEDA_MAX_REPLICAS = ${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS}
+PROVER_AGENT_KEDA_SCALING_BANDS = ${PROVER_AGENT_KEDA_SCALING_BANDS:-[]}
+PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS = ${PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS:-30}
+PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS = ${PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS:-300}
 
 RPC_INGRESS_ENABLED = ${RPC_INGRESS_ENABLED}
 RPC_INGRESS_HOSTS = ${RPC_INGRESS_HOSTS}
diff --git a/spartan/scripts/network_deploy.sh b/spartan/scripts/network_deploy.sh
index eab3b12962f7..8fa7fcaf3e12 100755
--- a/spartan/scripts/network_deploy.sh
+++ b/spartan/scripts/network_deploy.sh
@@ -28,6 +28,7 @@ gcp_auth
 # Second pass: source environment with GCP secret processing
 source_network_env "$env_file"
 
+
 # Optional: provision per-network IP + managed cert (+ DNS record in the delegated
 # rpc.aztec-labs.com zone) via the network-frontend terraform module. The module's
 # outputs are exported as env vars that deploy_network.sh already consumes.
diff --git a/spartan/scripts/setup_gcp_secrets.sh b/spartan/scripts/setup_gcp_secrets.sh
index 9eadb39ecdc1..10b2cb30e05c 100755
--- a/spartan/scripts/setup_gcp_secrets.sh
+++ b/spartan/scripts/setup_gcp_secrets.sh
@@ -88,6 +88,7 @@ declare -A SECRET_MAPPINGS=(
     ["FUNDING_PRIVATE_KEY"]="${L1_NETWORK}-funding-private-key"
     ["ROLLUP_DEPLOYMENT_PRIVATE_KEY"]="${L1_NETWORK}-labs-rollup-private-key"
     ["OTEL_COLLECTOR_ENDPOINT"]="otel-collector-url"
+    ["PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS"]="prometheus-internal-read-url"
     ["ETHERSCAN_API_KEY"]="etherscan-api-key"
     ["LABS_INFRA_MNEMONIC"]="${MNEMONIC_SECRET}"
     ["STORE_SNAPSHOT_URL"]="r2-account-id"
diff --git a/spartan/terraform/deploy-aztec-infra/main.tf b/spartan/terraform/deploy-aztec-infra/main.tf
index b4d66a49a142..d14fd6d69d8a 100644
--- a/spartan/terraform/deploy-aztec-infra/main.tf
+++ b/spartan/terraform/deploy-aztec-infra/main.tf
@@ -52,7 +52,7 @@ module "web3signer" {
   VALIDATOR_MNEMONIC_START_INDEX           = tonumber(var.VALIDATOR_MNEMONIC_START_INDEX)
   VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX = tonumber(var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX)
   VALIDATOR_PUBLISHERS_PER_REPLICA         = var.VALIDATOR_PUBLISHERS_PER_REPLICA
-  PROVER_COUNT                             = tonumber(var.PROVER_REPLICAS)
+  PROVER_COUNT                             = local.prover_agent_replica_capacity
   PUBLISHERS_PER_PROVER                    = tonumber(var.PROVER_PUBLISHERS_PER_PROVER)
   PROVER_PUBLISHER_MNEMONIC_START_INDEX    = tonumber(var.PROVER_PUBLISHER_MNEMONIC_START_INDEX)
 
@@ -94,6 +94,8 @@ locals {
     tag        = split(":", var.VALIDATOR_HA_DOCKER_IMAGE)[1]
   } : local.aztec_image
 
+  prover_agent_replica_capacity = var.PROVER_ENABLED ? (var.PROVER_AGENT_KEDA_ENABLED ? var.PROVER_AGENT_KEDA_MAX_REPLICAS : tonumber(var.PROVER_REPLICAS)) : 0 # passed to web3signer as PROVER_COUNT: KEDA ceiling when autoscaling, static replica count otherwise, 0 when the prover is disabled
+
   # Max node count: max of primary (VALIDATOR_REPLICAS) and HA pod counts
   # Determines how many attester keystores and publisher key ranges to generate
   effective_ha_count  = var.VALIDATOR_HA_REPLICAS > 0 ? coalesce(var.VALIDATOR_HA_REPLICA_COUNT, tonumber(var.VALIDATOR_REPLICAS)) : 0
@@ -343,6 +345,19 @@ locals {
           node = {
             logLevel = var.LOG_LEVEL
           }
+          autoscaling = {
+            keda = {
+              enabled         = var.PROVER_AGENT_KEDA_ENABLED && var.PROVER_ENABLED # never enable autoscaling when the prover is disabled
+              pollingInterval = var.PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS
+              cooldownPeriod  = var.PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS
+              minReplicaCount = var.PROVER_AGENT_KEDA_MIN_REPLICAS
+              maxReplicaCount = var.PROVER_AGENT_KEDA_MAX_REPLICAS
+              scalingBands    = var.PROVER_AGENT_KEDA_SCALING_BANDS # list of { queueSize, replicas } objects; may be empty
+              prometheus = {
+                serverAddress = var.PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS
+              }
+            }
+          }
         }
         })], local.is_kind ? [yamlencode({
         agent = {
@@ -377,7 +392,7 @@ locals {
           "agent.node.env.CRS_PATH"                             = "/usr/src/crs"
           "agent.node.proverRealProofs"                         = var.PROVER_REAL_PROOFS
           "agent.node.env.PROVER_AGENT_POLL_INTERVAL_MS"        = var.PROVER_AGENT_POLL_INTERVAL_MS
-          "agent.replicaCount"                                  = var.PROVER_REPLICAS
+          "agent.replicaCount"                                  = var.PROVER_AGENT_KEDA_ENABLED ? "0" : var.PROVER_REPLICAS
           "agent.node.env.BOOTSTRAP_NODES"                      = "asdf"
           "agent.node.env.PROVER_AGENT_COUNT"                   = var.PROVER_AGENTS_PER_PROVER
           "agent.node.env.PROVER_TEST_DELAY_TYPE"               = var.PROVER_TEST_DELAY_TYPE
diff --git a/spartan/terraform/deploy-aztec-infra/variables.tf b/spartan/terraform/deploy-aztec-infra/variables.tf
index dd3f00859491..5b8c00682983 100644
--- a/spartan/terraform/deploy-aztec-infra/variables.tf
+++ b/spartan/terraform/deploy-aztec-infra/variables.tf
@@ -851,6 +851,51 @@ variable "PROVER_AGENT_POLL_INTERVAL_MS" {
   default     = 1000
 }
 
+variable "PROVER_AGENT_KEDA_ENABLED" {
+  description = "Whether KEDA should scale prover agent pods from proving queue depth"
+  type        = bool
+  default     = false
+}
+
+variable "PROVER_AGENT_KEDA_MIN_REPLICAS" {
+  description = "Minimum prover agent pods managed by KEDA"
+  type        = number
+  default     = 0
+}
+
+variable "PROVER_AGENT_KEDA_MAX_REPLICAS" {
+  description = "Maximum prover agent pods managed by KEDA"
+  type        = number
+  default     = 1
+}
+
+variable "PROVER_AGENT_KEDA_SCALING_BANDS" {
+  description = "Step scaling bands for prover agents. Each band scales to replicas when total proving queue size is greater than queueSize."
+  type = list(object({
+    queueSize = number
+    replicas  = number
+  }))
+  default = [] # with no bands the chart's ScaledObject uses a single trigger on the raw queue-size query (threshold 1)
+}
+
+variable "PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" {
+  description = "Prometheus server URL queried by KEDA for prover queue depth"
+  type        = string
+  default     = ""
+}
+
+variable "PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS" {
+  description = "KEDA polling interval for prover agent queue-depth scaling"
+  type        = number
+  default     = 30
+}
+
+variable "PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS" {
+  description = "KEDA cooldown period before scaling prover agents back down"
+  type        = number
+  default     = 300
+}
+
 variable "PROVER_AGENT_INCLUDE_METRICS" {
   description = "Metrics whitelist in the prover agent"
   type        = string
diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf
index bf30ce3d59ac..d1a98c03d1a5 100644
--- a/spartan/terraform/deploy-metrics/main.tf
+++ b/spartan/terraform/deploy-metrics/main.tf
@@ -29,6 +29,11 @@ data "terraform_remote_state" "ssl" {
   }
 }
 
+data "google_compute_subnetwork" "default" {
+  name   = "default"
+  region = var.region
+}
+
 resource "google_compute_address" "grafana_ip" {
   provider     = google
   name         = "grafana-ip"
@@ -51,6 +56,18 @@ resource "google_compute_address" "otel_collector_ip" {
   }
 }
 
+resource "google_compute_address" "prometheus_ip" {
+  provider     = google
+  name         = "prometheus-ip"
+  address_type = "INTERNAL" # reserved from the "default" subnetwork looked up above, not a public address
+  region       = var.region
+  subnetwork   = data.google_compute_subnetwork.default.id
+
+  lifecycle {
+    prevent_destroy = true # same guard as the other reserved addresses in this module
+  }
+}
+
 provider "kubernetes" {
   alias          = "gke-cluster"
   config_path    = "~/.kube/config"
@@ -209,6 +226,21 @@ resource "helm_release" "aztec-gke-cluster" {
     value = google_compute_address.otel_collector_ip.address
   }
 
+  set {
+    name  = "prometheus.server.service.type"
+    value = "LoadBalancer" # expose the Prometheus server Service through a load balancer
+  }
+
+  set {
+    name  = "prometheus.server.service.annotations.networking\\.gke\\.io\\/load-balancer-type"
+    value = "Internal" # GKE annotation requesting an internal (non-public) load balancer
+  }
+
+  set {
+    name  = "prometheus.server.service.loadBalancerIP"
+    value = google_compute_address.prometheus_ip.address # pin the Service to the reserved internal address
+  }
+
   set {
     name  = "prometheus.serverFiles.prometheus\\.yml.scrape_configs[0].job_name"
     value = "prometheus"
diff --git a/spartan/terraform/deploy-telemetry/main.tf b/spartan/terraform/deploy-telemetry/main.tf
deleted file mode 100644
index b44f1bc0cc28..000000000000
--- a/spartan/terraform/deploy-telemetry/main.tf
+++ /dev/null
@@ -1,186 +0,0 @@
-terraform {
-  backend "gcs" {
-    bucket = "aztec-terraform"
-    prefix = "metrics-deploy/us-west1-a/aztec-gke-private/telemetry/terraform.tfstate"
-  }
-  required_providers {
-    helm = {
-      source  = "hashicorp/helm"
-      version = "~> 2.16.1"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = "~> 2.24.0"
-    }
-  }
-}
-
-provider "google" {
-  project = var.project
-  region  = var.region
-}
-
-provider "kubernetes" {
-  alias          = "gke-cluster"
-  config_path    = "~/.kube/config"
-  config_context = var.cluster
-}
-
-provider "helm" {
-  alias = "gke-cluster"
-  kubernetes {
-    config_path    = "~/.kube/config"
-    config_context = var.cluster
-  }
-}
-
-resource "google_compute_global_address" "otel_collector_ingress" {
-  provider     = google
-  name         = "${var.RELEASE_NAME}-otel-collector-ingress"
-  address_type = "EXTERNAL"
-
-  lifecycle {
-    prevent_destroy = true
-  }
-}
-
-resource "kubernetes_namespace" "ns" {
-  provider = kubernetes.gke-cluster
-  metadata {
-    name = var.RELEASE_NAME
-  }
-}
-
-resource "kubernetes_manifest" "otel_ingress_certificate" {
-  provider = kubernetes.gke-cluster
-
-  manifest = {
-    "apiVersion" = "networking.gke.io/v1"
-    "kind"       = "ManagedCertificate"
-    "metadata" = {
-      "name"      = "otel-ingress-cert"
-      "namespace" = kubernetes_namespace.ns.metadata[0].name
-    }
-    "spec" = {
-      "domains" = var.HOSTS
-    }
-  }
-}
-
-resource "kubernetes_manifest" "otel_ingress_backend" {
-  provider = kubernetes.gke-cluster
-
-  manifest = {
-    "apiVersion" = "cloud.google.com/v1"
-    "kind"       = "BackendConfig"
-    "metadata" = {
-      "name"      = "otel-ingress-backend"
-      "namespace" = kubernetes_namespace.ns.metadata[0].name
-    }
-    "spec" = {
-      "healthCheck" = {
-        "checkIntervalSec" = 15
-        "timeoutSec"       = 5
-        "type"             = "HTTP"
-        "port"             = 13133
-        "requestPath"      = "/"
-      }
-    }
-  }
-}
-
-locals {
-  prefixes   = jsondecode(file("../../../yarn-project/cli/public_include_metric_prefixes.json"))
-  registries = ["0xec4156431d0f3df66d4e24ba3d30dcb4c85fa309", "0xf299347e765cfb27f913bde8e4983fd0f195676f", "0x2e48addca360da61e4d6c21ff2b1961af56eb83b", "0xc2f24280f5c7f4897370dfdeb30f79ded14f1c81"]
-  roles      = ["sequencer"]
-
-  otel_metric_allowlist   = join(" or ", formatlist("HasPrefix(name, %q)", local.prefixes))
-  otel_registry_allowlist = join(" or ", formatlist("resource.attributes[\"aztec.registry_address\"] == %q", local.registries))
-  otel_role_allowlist     = join(" or ", formatlist("resource.attributes[\"aztec.node_role\"] == %q", local.roles))
-}
-
-resource "helm_release" "otel_collector" {
-  provider          = helm.gke-cluster
-  name              = "otel"
-  namespace         = kubernetes_namespace.ns.metadata[0].name
-  repository        = "https://open-telemetry.github.io/opentelemetry-helm-charts"
-  chart             = "opentelemetry-collector"
-  version           = "0.127.2"
-  create_namespace  = false
-  upgrade_install   = true
-  dependency_update = true
-  force_update      = true
-  reuse_values      = false
-  reset_values      = true
-
-  # base values file
-  values = [
-    file("./values/public-otel-collector.yaml"),
-    yamlencode({
-      ingress = {
-        hosts = [
-          for index, host in var.HOSTS : ({
-            host = host
-            paths = [
-              {
-                path     = "/"
-                pathType = "Prefix"
-                port     = 4318
-              }
-            ]
-          })
-        ]
-      }
-    }),
-    # have to use a heredoc because of quotation issues with OTTL
-    <<-EOF
-config:
-  processors:
-    filter:
-      metrics:
-        metric:
-        - 'not (${local.otel_registry_allowlist})'
-        - 'not (${local.otel_role_allowlist})'
-        - 'not (${local.otel_metric_allowlist})'
-EOF
-  ]
-
-  set {
-    name  = "ingress.annotations.kubernetes\\.io\\/ingress\\.global-static-ip-name"
-    value = google_compute_global_address.otel_collector_ingress.name
-  }
-
-  set {
-    name  = "ingress.annotations.networking\\.gke\\.io\\/managed-certificates"
-    value = "otel-ingress-cert"
-  }
-
-  timeout         = 300
-  wait            = true
-  wait_for_jobs   = true
-  atomic          = true
-  cleanup_on_fail = true
-}
-
-resource "helm_release" "public_prometheus" {
-  provider          = helm.gke-cluster
-  name              = "prometheus"
-  namespace         = kubernetes_namespace.ns.metadata[0].name
-  repository        = "https://prometheus-community.github.io/helm-charts"
-  chart             = "prometheus"
-  version           = "25.27.0"
-  create_namespace  = false
-  upgrade_install   = true
-  dependency_update = true
-  force_update      = true
-  reuse_values      = false
-  reset_values      = true
-
-  values = [file("./values/public-prometheus.yaml")]
-
-  timeout         = 300
-  wait            = true
-  wait_for_jobs   = true
-  atomic          = true
-  cleanup_on_fail = true
-}
diff --git a/spartan/terraform/deploy-telemetry/outputs.tf b/spartan/terraform/deploy-telemetry/outputs.tf
deleted file mode 100644
index 693227f05012..000000000000
--- a/spartan/terraform/deploy-telemetry/outputs.tf
+++ /dev/null
@@ -1,9 +0,0 @@
-output "otel_ingress_hostname" {
-  description = "Public otel ingress"
-  value       = "https://${var.HOSTS[0]}"
-}
-
-output "otel_ingress_ip" {
-  description = "Public otel ingress IP address"
-  value       = google_compute_global_address.otel_collector_ingress.address
-}
diff --git a/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml b/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml
deleted file mode 100644
index 2e18a2d16667..000000000000
--- a/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml
+++ /dev/null
@@ -1,155 +0,0 @@
-mode: statefulset
-replicaCount: 1
-
-nodeSelector:
-  node-type: infra
-
-image:
-  repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib
-  tag: "0.128.0"
-
-resources:
-  requests:
-    memory: 12Gi
-    cpu: "2"
-  limits:
-    memory: 60Gi
-    cpu: "7"
-
-ports:
-  otlp:
-    enabled: false
-  otlp-http:
-    enabled: true
-    containerPort: 4318
-    servicePort: 4318
-    hostPort: 4318
-    protocol: TCP
-  jaeger-compact:
-    enabled: false
-  jaeger-thrift:
-    enabled: false
-  jaeger-grpc:
-    enabled: false
-  zipkin:
-    enabled: false
-
-config:
-  extensions:
-    health_check:
-      endpoint: ${env:MY_POD_IP}:13133
-
-  receivers:
-    jaeger: {}
-    prometheus: {}
-    zipkin: {}
-    otlp:
-      protocols:
-        grpc: {}
-        http:
-          endpoint: ${env:MY_POD_IP}:4318
-
-  processors:
-    memory_limiter:
-      check_interval: 1s
-      limit_mib: 12000
-      spike_limit_mib: 2000
-
-    filter:
-      metrics:
-        metric: [] # placeholder
-        datapoint:
-          - 'metric.type == METRIC_DATA_TYPE_HISTOGRAM and Len(explicit_bounds) > 20'
-
-    resource:
-      attributes:
-        - pattern: "(k8s|os|telemetry|service|exported).*"
-          action: delete
-
-    transform:
-      metric_statements:
-        - context: datapoint
-          statements:
-          - set(attributes["aztec.node_role"], resource.attributes["aztec.node_role"])
-          - set(attributes["aztec.registry_address"], resource.attributes["aztec.registry_address"])
-
-    batch: {}
-
-  exporters:
-    prometheus:
-      endpoint: ${env:MY_POD_IP}:8889
-      namespace: external
-      metric_expiration: 5m
-      resource_to_telemetry_conversion:
-        enabled: false
-
-  service:
-    telemetry:
-      metrics:
-        address: ${env:MY_POD_IP}:8888
-
-    pipelines:
-      logs: null
-      traces: null
-
-      metrics:
-        receivers:
-          - otlp
-        processors:
-          - memory_limiter
-          - resource
-          - filter
-          - transform
-          - batch
-        exporters:
-          - prometheus
-
-ports:
-  otlp:
-    enabled: false
-  otlp-http:
-    enabled: true
-  jaeger-compact:
-    enabled: false
-  jaeger-thrift:
-    enabled: false
-  jaeger-grpc:
-    enabled: false
-  zipkin:
-    enabled: false
-  metrics:
-    enabled: false
-  healthcheck:
-    enabled: true
-    containerPort: 13133
-    servicePort: 13133
-    hostPort: 13133
-    protocol: TCP
-  prom-otel:
-    enabled: true
-    containerPort: 8888
-    servicePort: 8888
-    hostPort: 8888
-    protocol: TCP
-  prom-aztec:
-    enabled: true
-    containerPort: 8889
-    servicePort: 8889
-    hostPort: 8889
-    protocol: TCP
-service:
-  enabled: true
-  annotations:
-    cloud.google.com/backend-config: "{\"default\":\"otel-ingress-backend\"}"
-    cloud.google.com/neg: "{\"ingress\": true}"
-
-ingress:
-  enabled: true
-  annotations:
-    kubernetes.io/ingress.allow-http: "true"
-    kubernetes.io/ingress.class: gce
-    kubernetes.io/ingress.global-static-ip-name: ""
-    cloud.google.com/healthcheck-port: "13133"
-    cloud.google.com/healthcheck-path: "/"
-
-    # networking.gke.io/managed-certificates: null
diff --git a/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml b/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml
deleted file mode 100644
index c25335185818..000000000000
--- a/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-server:
-  global:
-    evaluation_interval: 30s
-    scrape_interval: 1m
-    scrape_timeout: 20s
-  resources:
-    requests:
-      memory: 40Gi
-      cpu: "4"
-    limits:
-      memory: 60Gi
-      cpu: "7"
-  nodeSelector:
-    node-type: infra
-  persistentVolume:
-    enabled: true
-    size: 100Gi
-  replicaCount: 1
-  statefulSet:
-    enabled: true
-
-serverFiles:
-  prometheus.yml:
-    scrape_configs:
-      - job_name: public_telemetry
-        kubernetes_sd_configs:
-          - role: pod
-            namespaces:
-              own_namespace: true
-              names: []
-
-alertmanager:
-  enabled: false
-prometheus-node-exporter:
-  enabled: false
-prometheus-pushgateway:
-  enabled: false
-kube-state-metrics:
-  enabled: false
diff --git a/spartan/terraform/deploy-telemetry/variables.tf b/spartan/terraform/deploy-telemetry/variables.tf
deleted file mode 100644
index 55f7f8c4cca4..000000000000
--- a/spartan/terraform/deploy-telemetry/variables.tf
+++ /dev/null
@@ -1,27 +0,0 @@
-variable "cluster" {
-  description = "GKE cluster context"
-  type        = string
-  default     = "gke_testnet-440309_us-west1-a_aztec-gke-private"
-}
-
-variable "project" {
-  default = "testnet-440309"
-  type    = string
-}
-
-variable "region" {
-  default = "us-west1"
-  type    = string
-}
-
-variable "RELEASE_NAME" {
-  description = "Name of helm deployment and k8s namespace"
-  type        = string
-  default     = "public-telemetry"
-}
-
-variable "HOSTS" {
-  description = "The public hostname for the ingress"
-  type        = list(string)
-  default     = ["telemetry.alpha-testnet.aztec-labs.com"]
-}