From 98effd2e65f9f17147e38ed5ac67a728f9cd2269 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Thu, 28 May 2026 15:01:07 +0000 Subject: [PATCH] chore: add KEDA prover agent autoscaling --- .../templates/agent-scaledobject.yaml | 40 ++++ spartan/aztec-prover-stack/values.yaml | 11 ++ spartan/environments/next-net.env | 11 +- spartan/environments/testnet.env | 14 ++ spartan/scripts/deploy_network.sh | 18 +- spartan/scripts/network_deploy.sh | 1 + spartan/scripts/setup_gcp_secrets.sh | 1 + spartan/terraform/deploy-aztec-infra/main.tf | 19 +- .../terraform/deploy-aztec-infra/variables.tf | 45 +++++ spartan/terraform/deploy-metrics/main.tf | 32 +++ spartan/terraform/deploy-telemetry/main.tf | 186 ------------------ spartan/terraform/deploy-telemetry/outputs.tf | 9 - .../values/public-otel-collector.yaml | 155 --------------- .../values/public-prometheus.yaml | 39 ---- .../terraform/deploy-telemetry/variables.tf | 27 --- 15 files changed, 188 insertions(+), 420 deletions(-) create mode 100644 spartan/aztec-prover-stack/templates/agent-scaledobject.yaml delete mode 100644 spartan/terraform/deploy-telemetry/main.tf delete mode 100644 spartan/terraform/deploy-telemetry/outputs.tf delete mode 100644 spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml delete mode 100644 spartan/terraform/deploy-telemetry/values/public-prometheus.yaml delete mode 100644 spartan/terraform/deploy-telemetry/variables.tf diff --git a/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml b/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml new file mode 100644 index 000000000000..98f55a7cf5b4 --- /dev/null +++ b/spartan/aztec-prover-stack/templates/agent-scaledobject.yaml @@ -0,0 +1,40 @@ +{{- if .Values.agent.autoscaling.keda.enabled }} +{{- $agentChartName := default "agent" .Values.agent.nameOverride }} +{{- $agentDefaultName := ternary .Release.Name (printf "%s-%s" .Release.Name $agentChartName) (contains $agentChartName .Release.Name) }} +{{- $agentName := 
default $agentDefaultName .Values.agent.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- $queueQuery := printf "sum(aztec_proving_queue_size{k8s_namespace_name=%q})" .Release.Namespace }} +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: {{ $agentName }} + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + scaleTargetRef: + kind: Deployment + name: {{ $agentName }} + pollingInterval: {{ .Values.agent.autoscaling.keda.pollingInterval }} + cooldownPeriod: {{ .Values.agent.autoscaling.keda.cooldownPeriod }} + minReplicaCount: {{ .Values.agent.autoscaling.keda.minReplicaCount }} + maxReplicaCount: {{ .Values.agent.autoscaling.keda.maxReplicaCount }} + triggers: + {{- if .Values.agent.autoscaling.keda.scalingBands }} + {{- range $band := .Values.agent.autoscaling.keda.scalingBands }} + - type: prometheus + metadata: + serverAddress: {{ $.Values.agent.autoscaling.keda.prometheus.serverAddress | quote }} + metricName: {{ printf "aztec_proving_queue_size_agents_%v_over_%v" $band.replicas $band.queueSize | replace "." 
"_" | quote }} + query: {{ printf "((%s or vector(0)) > bool %v) * %v" $queueQuery $band.queueSize $band.replicas | quote }} + threshold: "1" + activationThreshold: "0" + {{- end }} + {{- else }} + - type: prometheus + metadata: + serverAddress: {{ .Values.agent.autoscaling.keda.prometheus.serverAddress | quote }} + metricName: "aztec_proving_queue_size" + query: {{ $queueQuery | quote }} + threshold: "1" + activationThreshold: "0" + {{- end }} +{{- end }} diff --git a/spartan/aztec-prover-stack/values.yaml b/spartan/aztec-prover-stack/values.yaml index 2e8a433e5eb0..ae014973bcd7 100644 --- a/spartan/aztec-prover-stack/values.yaml +++ b/spartan/aztec-prover-stack/values.yaml @@ -84,6 +84,17 @@ agent: nodeType: "prover-agent" replicaCount: 1 + autoscaling: + keda: + enabled: false + pollingInterval: 30 + cooldownPeriod: 300 + minReplicaCount: 0 + maxReplicaCount: 1 + scalingBands: [] + prometheus: + serverAddress: "" + persistence: enabled: false diff --git a/spartan/environments/next-net.env b/spartan/environments/next-net.env index 7e24b13ef493..efd92c9f1d64 100644 --- a/spartan/environments/next-net.env +++ b/spartan/environments/next-net.env @@ -51,7 +51,16 @@ VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 PUBLISHERS_PER_PROVER=2 PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 -PROVER_REPLICAS=4 +PROVER_AGENT_KEDA_ENABLED=true +PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=REPLACE_WITH_GCP_SECRET +PROVER_AGENT_KEDA_MIN_REPLICAS=0 +PROVER_AGENT_KEDA_MAX_REPLICAS=4 +PROVER_AGENT_KEDA_SCALING_BANDS='[ + { + queueSize = 0 + replicas = 4 + } +]' BOT_TRANSFERS_REPLICAS=1 BOT_TRANSFERS_TX_INTERVAL_SECONDS=250 diff --git a/spartan/environments/testnet.env b/spartan/environments/testnet.env index 1097fe11f818..8f0e5c30e7ac 100644 --- a/spartan/environments/testnet.env +++ b/spartan/environments/testnet.env @@ -89,3 +89,17 @@ PROVER_FAILED_PROOF_STORE=gs://aztec-develop/testnet/failed-proofs L1_TX_FAILED_STORE=gs://aztec-develop/testnet/failed-l1-txs PROVER_REPLICAS=4 
PROVER_RESOURCE_PROFILE="prod" +PROVER_AGENT_KEDA_ENABLED=false +PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS= +PROVER_AGENT_KEDA_MIN_REPLICAS=0 +PROVER_AGENT_KEDA_MAX_REPLICAS=8 +PROVER_AGENT_KEDA_SCALING_BANDS='[ + { + queueSize = 0 + replicas = 4 + }, + { + queueSize = 50 + replicas = 8 + } +]' diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh index 26a2da1623d4..bc7e9e346cdd 100755 --- a/spartan/scripts/deploy_network.sh +++ b/spartan/scripts/deploy_network.sh @@ -132,6 +132,15 @@ SEQ_ENFORCE_TIME_TABLE=${SEQ_ENFORCE_TIME_TABLE:-} SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT=${SEQ_SKIP_CHECKPOINT_PUBLISH_PERCENT:-0} PROVER_REPLICAS=${PROVER_REPLICAS:-4} PROVER_ENABLED=${PROVER_ENABLED:-true} +PROVER_AGENT_KEDA_ENABLED=${PROVER_AGENT_KEDA_ENABLED:-false} +PROVER_AGENT_KEDA_MIN_REPLICAS=${PROVER_AGENT_KEDA_MIN_REPLICAS:-0} +PROVER_AGENT_KEDA_MAX_REPLICAS=${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS} +PROVER_AGENT_KEDA_SCALING_BANDS=${PROVER_AGENT_KEDA_SCALING_BANDS:-[]} +PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-} +if [[ "$PROVER_ENABLED" == "true" && "$PROVER_AGENT_KEDA_ENABLED" == "true" && -z "$PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" ]]; then + die "PROVER_AGENT_KEDA_ENABLED=true requires PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS. Set it explicitly, for example via GCP secret replacement." 
+fi +PROVER_AGENT_REPLICA_CAPACITY=$(if [[ "$PROVER_ENABLED" != "true" ]]; then echo 0; elif [[ "$PROVER_AGENT_KEDA_ENABLED" == "true" ]]; then echo "$PROVER_AGENT_KEDA_MAX_REPLICAS"; else echo "$PROVER_REPLICAS"; fi) PROVER_AGENTS_PER_PROVER=${PROVER_AGENTS_PER_PROVER:-1} R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-} R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-} @@ -253,7 +262,7 @@ if (( TOTAL_VALIDATOR_PUBLISHERS > 0 )); then fi # Add prover publishers to prefunding list -TOTAL_PROVER_PUBLISHERS=$((PROVER_REPLICAS * PUBLISHERS_PER_PROVER)) +TOTAL_PROVER_PUBLISHERS=$((PROVER_AGENT_REPLICA_CAPACITY * PUBLISHERS_PER_PROVER)) if (( TOTAL_PROVER_PUBLISHERS > 0 )); then PROVER_PUBLISHER_RANGE=$(seq "$PROVER_PUBLISHER_MNEMONIC_START_INDEX" $((PROVER_PUBLISHER_MNEMONIC_START_INDEX + TOTAL_PROVER_PUBLISHERS - 1)) | tr '\n' ',' | sed 's/,$//') @@ -630,6 +639,13 @@ BOT_CROSS_CHAIN_L2_PRIVATE_KEY = "${BOT_CROSS_CHAIN_L2_PRIVATE_KEY:-0xcafe03}" PROVER_AGENTS_PER_PROVER = ${PROVER_AGENTS_PER_PROVER} PROVER_AGENT_POLL_INTERVAL_MS = ${PROVER_AGENT_POLL_INTERVAL_MS} +PROVER_AGENT_KEDA_ENABLED = ${PROVER_AGENT_KEDA_ENABLED:-false} +PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS = "${PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS:-}" +PROVER_AGENT_KEDA_MIN_REPLICAS = ${PROVER_AGENT_KEDA_MIN_REPLICAS:-0} +PROVER_AGENT_KEDA_MAX_REPLICAS = ${PROVER_AGENT_KEDA_MAX_REPLICAS:-$PROVER_REPLICAS} +PROVER_AGENT_KEDA_SCALING_BANDS = ${PROVER_AGENT_KEDA_SCALING_BANDS:-[]} +PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS = ${PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS:-30} +PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS = ${PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS:-300} RPC_INGRESS_ENABLED = ${RPC_INGRESS_ENABLED} RPC_INGRESS_HOSTS = ${RPC_INGRESS_HOSTS} diff --git a/spartan/scripts/network_deploy.sh b/spartan/scripts/network_deploy.sh index eab3b12962f7..8fa7fcaf3e12 100755 --- a/spartan/scripts/network_deploy.sh +++ b/spartan/scripts/network_deploy.sh @@ -28,6 +28,7 @@ gcp_auth # Second pass: source environment with GCP secret processing source_network_env "$env_file" + # Optional: provision per-network IP 
+ managed cert (+ DNS record in the delegated # rpc.aztec-labs.com zone) via the network-frontend terraform module. The module's # outputs are exported as env vars that deploy_network.sh already consumes. diff --git a/spartan/scripts/setup_gcp_secrets.sh b/spartan/scripts/setup_gcp_secrets.sh index 9eadb39ecdc1..10b2cb30e05c 100755 --- a/spartan/scripts/setup_gcp_secrets.sh +++ b/spartan/scripts/setup_gcp_secrets.sh @@ -88,6 +88,7 @@ declare -A SECRET_MAPPINGS=( ["FUNDING_PRIVATE_KEY"]="${L1_NETWORK}-funding-private-key" ["ROLLUP_DEPLOYMENT_PRIVATE_KEY"]="${L1_NETWORK}-labs-rollup-private-key" ["OTEL_COLLECTOR_ENDPOINT"]="otel-collector-url" + ["PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS"]="prometheus-internal-read-url" ["ETHERSCAN_API_KEY"]="etherscan-api-key" ["LABS_INFRA_MNEMONIC"]="${MNEMONIC_SECRET}" ["STORE_SNAPSHOT_URL"]="r2-account-id" diff --git a/spartan/terraform/deploy-aztec-infra/main.tf b/spartan/terraform/deploy-aztec-infra/main.tf index b4d66a49a142..d14fd6d69d8a 100644 --- a/spartan/terraform/deploy-aztec-infra/main.tf +++ b/spartan/terraform/deploy-aztec-infra/main.tf @@ -52,7 +52,7 @@ module "web3signer" { VALIDATOR_MNEMONIC_START_INDEX = tonumber(var.VALIDATOR_MNEMONIC_START_INDEX) VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX = tonumber(var.VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX) VALIDATOR_PUBLISHERS_PER_REPLICA = var.VALIDATOR_PUBLISHERS_PER_REPLICA - PROVER_COUNT = tonumber(var.PROVER_REPLICAS) + PROVER_COUNT = local.prover_agent_replica_capacity PUBLISHERS_PER_PROVER = tonumber(var.PROVER_PUBLISHERS_PER_PROVER) PROVER_PUBLISHER_MNEMONIC_START_INDEX = tonumber(var.PROVER_PUBLISHER_MNEMONIC_START_INDEX) @@ -94,6 +94,8 @@ locals { tag = split(":", var.VALIDATOR_HA_DOCKER_IMAGE)[1] } : local.aztec_image + prover_agent_replica_capacity = var.PROVER_ENABLED ? (var.PROVER_AGENT_KEDA_ENABLED ? 
var.PROVER_AGENT_KEDA_MAX_REPLICAS : tonumber(var.PROVER_REPLICAS)) : 0 + # Max node count: max of primary (VALIDATOR_REPLICAS) and HA pod counts # Determines how many attester keystores and publisher key ranges to generate effective_ha_count = var.VALIDATOR_HA_REPLICAS > 0 ? coalesce(var.VALIDATOR_HA_REPLICA_COUNT, tonumber(var.VALIDATOR_REPLICAS)) : 0 @@ -343,6 +345,19 @@ locals { node = { logLevel = var.LOG_LEVEL } + autoscaling = { + keda = { + enabled = var.PROVER_AGENT_KEDA_ENABLED && var.PROVER_ENABLED + pollingInterval = var.PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS + cooldownPeriod = var.PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS + minReplicaCount = var.PROVER_AGENT_KEDA_MIN_REPLICAS + maxReplicaCount = var.PROVER_AGENT_KEDA_MAX_REPLICAS + scalingBands = var.PROVER_AGENT_KEDA_SCALING_BANDS + prometheus = { + serverAddress = var.PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS + } + } + } } })], local.is_kind ? [yamlencode({ agent = { @@ -377,7 +392,7 @@ locals { "agent.node.env.CRS_PATH" = "/usr/src/crs" "agent.node.proverRealProofs" = var.PROVER_REAL_PROOFS "agent.node.env.PROVER_AGENT_POLL_INTERVAL_MS" = var.PROVER_AGENT_POLL_INTERVAL_MS - "agent.replicaCount" = var.PROVER_REPLICAS + "agent.replicaCount" = var.PROVER_AGENT_KEDA_ENABLED ? 
"0" : var.PROVER_REPLICAS "agent.node.env.BOOTSTRAP_NODES" = "asdf" "agent.node.env.PROVER_AGENT_COUNT" = var.PROVER_AGENTS_PER_PROVER "agent.node.env.PROVER_TEST_DELAY_TYPE" = var.PROVER_TEST_DELAY_TYPE diff --git a/spartan/terraform/deploy-aztec-infra/variables.tf b/spartan/terraform/deploy-aztec-infra/variables.tf index dd3f00859491..5b8c00682983 100644 --- a/spartan/terraform/deploy-aztec-infra/variables.tf +++ b/spartan/terraform/deploy-aztec-infra/variables.tf @@ -851,6 +851,51 @@ variable "PROVER_AGENT_POLL_INTERVAL_MS" { default = 1000 } +variable "PROVER_AGENT_KEDA_ENABLED" { + description = "Whether KEDA should scale prover agent pods from proving queue depth" + type = bool + default = false +} + +variable "PROVER_AGENT_KEDA_MIN_REPLICAS" { + description = "Minimum prover agent pods managed by KEDA" + type = number + default = 0 +} + +variable "PROVER_AGENT_KEDA_MAX_REPLICAS" { + description = "Maximum prover agent pods managed by KEDA" + type = number + default = 1 +} + +variable "PROVER_AGENT_KEDA_SCALING_BANDS" { + description = "Step scaling bands for prover agents. Each band scales to replicas when total proving queue size is greater than queueSize." 
+ type = list(object({ + queueSize = number + replicas = number + })) + default = [] +} + +variable "PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS" { + description = "Prometheus server URL queried by KEDA for prover queue depth" + type = string + default = "" +} + +variable "PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS" { + description = "KEDA polling interval for prover agent queue-depth scaling" + type = number + default = 30 +} + +variable "PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS" { + description = "KEDA cooldown period before scaling prover agents back down" + type = number + default = 300 +} + variable "PROVER_AGENT_INCLUDE_METRICS" { description = "Metrics whitelist in the prover agent" type = string diff --git a/spartan/terraform/deploy-metrics/main.tf b/spartan/terraform/deploy-metrics/main.tf index bf30ce3d59ac..d1a98c03d1a5 100644 --- a/spartan/terraform/deploy-metrics/main.tf +++ b/spartan/terraform/deploy-metrics/main.tf @@ -29,6 +29,11 @@ data "terraform_remote_state" "ssl" { } } +data "google_compute_subnetwork" "default" { + name = "default" + region = var.region +} + resource "google_compute_address" "grafana_ip" { provider = google name = "grafana-ip" @@ -51,6 +56,18 @@ resource "google_compute_address" "otel_collector_ip" { } } +resource "google_compute_address" "prometheus_ip" { + provider = google + name = "prometheus-ip" + address_type = "INTERNAL" + region = var.region + subnetwork = data.google_compute_subnetwork.default.id + + lifecycle { + prevent_destroy = true + } +} + provider "kubernetes" { alias = "gke-cluster" config_path = "~/.kube/config" @@ -209,6 +226,21 @@ resource "helm_release" "aztec-gke-cluster" { value = google_compute_address.otel_collector_ip.address } + set { + name = "prometheus.server.service.type" + value = "LoadBalancer" + } + + set { + name = "prometheus.server.service.annotations.networking\\.gke\\.io\\/load-balancer-type" + value = "Internal" + } + + set { + name = "prometheus.server.service.loadBalancerIP" + value = 
google_compute_address.prometheus_ip.address + } + set { name = "prometheus.serverFiles.prometheus\\.yml.scrape_configs[0].job_name" value = "prometheus" diff --git a/spartan/terraform/deploy-telemetry/main.tf b/spartan/terraform/deploy-telemetry/main.tf deleted file mode 100644 index b44f1bc0cc28..000000000000 --- a/spartan/terraform/deploy-telemetry/main.tf +++ /dev/null @@ -1,186 +0,0 @@ -terraform { - backend "gcs" { - bucket = "aztec-terraform" - prefix = "metrics-deploy/us-west1-a/aztec-gke-private/telemetry/terraform.tfstate" - } - required_providers { - helm = { - source = "hashicorp/helm" - version = "~> 2.16.1" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "~> 2.24.0" - } - } -} - -provider "google" { - project = var.project - region = var.region -} - -provider "kubernetes" { - alias = "gke-cluster" - config_path = "~/.kube/config" - config_context = var.cluster -} - -provider "helm" { - alias = "gke-cluster" - kubernetes { - config_path = "~/.kube/config" - config_context = var.cluster - } -} - -resource "google_compute_global_address" "otel_collector_ingress" { - provider = google - name = "${var.RELEASE_NAME}-otel-collector-ingress" - address_type = "EXTERNAL" - - lifecycle { - prevent_destroy = true - } -} - -resource "kubernetes_namespace" "ns" { - provider = kubernetes.gke-cluster - metadata { - name = var.RELEASE_NAME - } -} - -resource "kubernetes_manifest" "otel_ingress_certificate" { - provider = kubernetes.gke-cluster - - manifest = { - "apiVersion" = "networking.gke.io/v1" - "kind" = "ManagedCertificate" - "metadata" = { - "name" = "otel-ingress-cert" - "namespace" = kubernetes_namespace.ns.metadata[0].name - } - "spec" = { - "domains" = var.HOSTS - } - } -} - -resource "kubernetes_manifest" "otel_ingress_backend" { - provider = kubernetes.gke-cluster - - manifest = { - "apiVersion" = "cloud.google.com/v1" - "kind" = "BackendConfig" - "metadata" = { - "name" = "otel-ingress-backend" - "namespace" = 
kubernetes_namespace.ns.metadata[0].name - } - "spec" = { - "healthCheck" = { - "checkIntervalSec" = 15 - "timeoutSec" = 5 - "type" = "HTTP" - "port" = 13133 - "requestPath" = "/" - } - } - } -} - -locals { - prefixes = jsondecode(file("../../../yarn-project/cli/public_include_metric_prefixes.json")) - registries = ["0xec4156431d0f3df66d4e24ba3d30dcb4c85fa309", "0xf299347e765cfb27f913bde8e4983fd0f195676f", "0x2e48addca360da61e4d6c21ff2b1961af56eb83b", "0xc2f24280f5c7f4897370dfdeb30f79ded14f1c81"] - roles = ["sequencer"] - - otel_metric_allowlist = join(" or ", formatlist("HasPrefix(name, %q)", local.prefixes)) - otel_registry_allowlist = join(" or ", formatlist("resource.attributes[\"aztec.registry_address\"] == %q", local.registries)) - otel_role_allowlist = join(" or ", formatlist("resource.attributes[\"aztec.node_role\"] == %q", local.roles)) -} - -resource "helm_release" "otel_collector" { - provider = helm.gke-cluster - name = "otel" - namespace = kubernetes_namespace.ns.metadata[0].name - repository = "https://open-telemetry.github.io/opentelemetry-helm-charts" - chart = "opentelemetry-collector" - version = "0.127.2" - create_namespace = false - upgrade_install = true - dependency_update = true - force_update = true - reuse_values = false - reset_values = true - - # base values file - values = [ - file("./values/public-otel-collector.yaml"), - yamlencode({ - ingress = { - hosts = [ - for index, host in var.HOSTS : ({ - host = host - paths = [ - { - path = "/" - pathType = "Prefix" - port = 4318 - } - ] - }) - ] - } - }), - # have to use a heredoc because of quotation issues with OTTL - <<-EOF -config: - processors: - filter: - metrics: - metric: - - 'not (${local.otel_registry_allowlist})' - - 'not (${local.otel_role_allowlist})' - - 'not (${local.otel_metric_allowlist})' -EOF - ] - - set { - name = "ingress.annotations.kubernetes\\.io\\/ingress\\.global-static-ip-name" - value = google_compute_global_address.otel_collector_ingress.name - } - - set { - name 
= "ingress.annotations.networking\\.gke\\.io\\/managed-certificates" - value = "otel-ingress-cert" - } - - timeout = 300 - wait = true - wait_for_jobs = true - atomic = true - cleanup_on_fail = true -} - -resource "helm_release" "public_prometheus" { - provider = helm.gke-cluster - name = "prometheus" - namespace = kubernetes_namespace.ns.metadata[0].name - repository = "https://prometheus-community.github.io/helm-charts" - chart = "prometheus" - version = "25.27.0" - create_namespace = false - upgrade_install = true - dependency_update = true - force_update = true - reuse_values = false - reset_values = true - - values = [file("./values/public-prometheus.yaml")] - - timeout = 300 - wait = true - wait_for_jobs = true - atomic = true - cleanup_on_fail = true -} diff --git a/spartan/terraform/deploy-telemetry/outputs.tf b/spartan/terraform/deploy-telemetry/outputs.tf deleted file mode 100644 index 693227f05012..000000000000 --- a/spartan/terraform/deploy-telemetry/outputs.tf +++ /dev/null @@ -1,9 +0,0 @@ -output "otel_ingress_hostname" { - description = "Public otel ingress" - value = "https://${var.HOSTS[0]}" -} - -output "otel_ingress_ip" { - description = "Public otel ingress IP address" - value = google_compute_global_address.otel_collector_ingress.address -} diff --git a/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml b/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml deleted file mode 100644 index 2e18a2d16667..000000000000 --- a/spartan/terraform/deploy-telemetry/values/public-otel-collector.yaml +++ /dev/null @@ -1,155 +0,0 @@ -mode: statefulset -replicaCount: 1 - -nodeSelector: - node-type: infra - -image: - repository: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib - tag: "0.128.0" - -resources: - requests: - memory: 12Gi - cpu: "2" - limits: - memory: 60Gi - cpu: "7" - -ports: - otlp: - enabled: false - otlp-http: - enabled: true - containerPort: 4318 - servicePort: 4318 - 
hostPort: 4318 - protocol: TCP - jaeger-compact: - enabled: false - jaeger-thrift: - enabled: false - jaeger-grpc: - enabled: false - zipkin: - enabled: false - -config: - extensions: - health_check: - endpoint: ${env:MY_POD_IP}:13133 - - receivers: - jaeger: {} - prometheus: {} - zipkin: {} - otlp: - protocols: - grpc: {} - http: - endpoint: ${env:MY_POD_IP}:4318 - - processors: - memory_limiter: - check_interval: 1s - limit_mib: 12000 - spike_limit_mib: 2000 - - filter: - metrics: - metric: [] # placeholder - datapoint: - - 'metric.type == METRIC_DATA_TYPE_HISTOGRAM and Len(explicit_bounds) > 20' - - resource: - attributes: - - pattern: "(k8s|os|telemetry|service|exported).*" - action: delete - - transform: - metric_statements: - - context: datapoint - statements: - - set(attributes["aztec.node_role"], resource.attributes["aztec.node_role"]) - - set(attributes["aztec.registry_address"], resource.attributes["aztec.registry_address"]) - - batch: {} - - exporters: - prometheus: - endpoint: ${env:MY_POD_IP}:8889 - namespace: external - metric_expiration: 5m - resource_to_telemetry_conversion: - enabled: false - - service: - telemetry: - metrics: - address: ${env:MY_POD_IP}:8888 - - pipelines: - logs: null - traces: null - - metrics: - receivers: - - otlp - processors: - - memory_limiter - - resource - - filter - - transform - - batch - exporters: - - prometheus - -ports: - otlp: - enabled: false - otlp-http: - enabled: true - jaeger-compact: - enabled: false - jaeger-thrift: - enabled: false - jaeger-grpc: - enabled: false - zipkin: - enabled: false - metrics: - enabled: false - healthcheck: - enabled: true - containerPort: 13133 - servicePort: 13133 - hostPort: 13133 - protocol: TCP - prom-otel: - enabled: true - containerPort: 8888 - servicePort: 8888 - hostPort: 8888 - protocol: TCP - prom-aztec: - enabled: true - containerPort: 8889 - servicePort: 8889 - hostPort: 8889 - protocol: TCP -service: - enabled: true - annotations: - cloud.google.com/backend-config: 
"{\"default\":\"otel-ingress-backend\"}" - cloud.google.com/neg: "{\"ingress\": true}" - -ingress: - enabled: true - annotations: - kubernetes.io/ingress.allow-http: "true" - kubernetes.io/ingress.class: gce - kubernetes.io/ingress.global-static-ip-name: "" - cloud.google.com/healthcheck-port: "13133" - cloud.google.com/healthcheck-path: "/" - - # networking.gke.io/managed-certificates: null diff --git a/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml b/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml deleted file mode 100644 index c25335185818..000000000000 --- a/spartan/terraform/deploy-telemetry/values/public-prometheus.yaml +++ /dev/null @@ -1,39 +0,0 @@ -server: - global: - evaluation_interval: 30s - scrape_interval: 1m - scrape_timeout: 20s - resources: - requests: - memory: 40Gi - cpu: "4" - limits: - memory: 60Gi - cpu: "7" - nodeSelector: - node-type: infra - persistentVolume: - enabled: true - size: 100Gi - replicaCount: 1 - statefulSet: - enabled: true - -serverFiles: - prometheus.yml: - scrape_configs: - - job_name: public_telemetry - kubernetes_sd_configs: - - role: pod - namespaces: - own_namespace: true - names: [] - -alertmanager: - enabled: false -prometheus-node-exporter: - enabled: false -prometheus-pushgateway: - enabled: false -kube-state-metrics: - enabled: false diff --git a/spartan/terraform/deploy-telemetry/variables.tf b/spartan/terraform/deploy-telemetry/variables.tf deleted file mode 100644 index 55f7f8c4cca4..000000000000 --- a/spartan/terraform/deploy-telemetry/variables.tf +++ /dev/null @@ -1,27 +0,0 @@ -variable "cluster" { - description = "GKE cluster context" - type = string - default = "gke_testnet-440309_us-west1-a_aztec-gke-private" -} - -variable "project" { - default = "testnet-440309" - type = string -} - -variable "region" { - default = "us-west1" - type = string -} - -variable "RELEASE_NAME" { - description = "Name of helm deployment and k8s namespace" - type = string - default = 
"public-telemetry" -} - -variable "HOSTS" { - description = "The public hostname for the ingress" - type = list(string) - default = ["telemetry.alpha-testnet.aztec-labs.com"] -}