From 234aeeae08a73541a8d642401cfe86bc960a0fc4 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sat, 14 Mar 2026 17:27:43 -0700 Subject: [PATCH] fix: pass UseLocalSnapshot to configure-state-sync task When a SeiNode has a snapshot config, the controller now passes UseLocalSnapshot=true to the ConfigureStateSyncTask. This causes the seictl sidecar to set use-local-snapshot=true and derive trust-height from the local snapshot instead of the chain tip. Also bumps seictl to v0.0.13 which contains the corresponding handler changes, and updates sample manifests with production parity config overrides. --- go.mod | 2 +- go.sum | 4 +- internal/controller/node/task_builders.go | 4 +- manifests/samples/pacific-1-replay.yaml | 13 +- .../samples/pacific-1-shadow-replay.yaml | 146 +++++++++++++++--- manifests/samples/pacific-1-snapshotter.yaml | 15 +- 6 files changed, 154 insertions(+), 30 deletions(-) diff --git a/go.mod b/go.mod index 984497cb..c694179d 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/google/uuid v1.6.0 github.com/onsi/gomega v1.38.2 github.com/sei-protocol/sei-config v0.0.5 - github.com/sei-protocol/seictl v0.0.12 + github.com/sei-protocol/seictl v0.0.13 k8s.io/api v0.35.0 k8s.io/apimachinery v0.35.0 k8s.io/client-go v0.35.0 diff --git a/go.sum b/go.sum index abc7b9e7..6b5d9e73 100644 --- a/go.sum +++ b/go.sum @@ -128,8 +128,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sei-protocol/sei-config v0.0.5 h1:edMsQk0/WijGwbZIccSGC2FtPkw0N9XIWDSGgsDeAFw= github.com/sei-protocol/sei-config v0.0.5/go.mod h1:IEAv5ynYw8Gu2F2qNfE4MQR0PPihAT6g7RWLpWdw5O0= -github.com/sei-protocol/seictl v0.0.12 h1:BH6EXSrCSjMT45q/wWqskkR1ph+V9YxIgbXg4Cv80tI= -github.com/sei-protocol/seictl v0.0.12/go.mod h1:Tf6AISrbFK0i9/BYHB4pkDrLrk5KAfuFuTkz/fKfY9w= +github.com/sei-protocol/seictl v0.0.13 h1:AoJNfA8lo0cQLbqyWJVCKRIauAoDvi4UOnJwfux7S/I= +github.com/sei-protocol/seictl v0.0.13/go.mod h1:Tf6AISrbFK0i9/BYHB4pkDrLrk5KAfuFuTkz/fKfY9w= github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0= github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE= github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= diff --git a/internal/controller/node/task_builders.go b/internal/controller/node/task_builders.go index 79d1fd5d..cc452a84 100644 --- a/internal/controller/node/task_builders.go +++ b/internal/controller/node/task_builders.go @@ -27,7 +27,9 @@ func taskBuilderForNode(node *seiv1alpha1.SeiNode, taskType string) sidecar.Task case taskConfigureGenesis: return configureGenesisBuilder(node) case taskConfigureStateSync: - return sidecar.ConfigureStateSyncTask{} + return sidecar.ConfigureStateSyncTask{ + UseLocalSnapshot: hasLocalSnapshot(node), + } case taskConfigApply: return configApplyBuilder(node) case taskConfigValidate: diff --git a/manifests/samples/pacific-1-replay.yaml b/manifests/samples/pacific-1-replay.yaml index e63eea50..852fe9a2 100644 --- a/manifests/samples/pacific-1-replay.yaml +++ b/manifests/samples/pacific-1-replay.yaml @@ -15,6 +15,17 @@ spec: command: ["seid"] args: ["start", "--home", "/sei"] + config: + overrides: + # Match sei-infra production settings + chain.concurrency_workers: "500" + chain.occ_enabled: "true" + self_remediation.blocks_behind_threshold: "300" + self_remediation.blocks_behind_check_interval_seconds: "60" + self_remediation.restart_cooldown_seconds: "300" + self_remediation.p2p_no_peers_restart_window_seconds: "120" + self_remediation.statesync_no_peers_restart_window_seconds: "90" + genesis: chainId: pacific-1 s3: @@ -27,7 +38,7 @@ spec: region: eu-central-1 tags: ChainIdentifier: pacific-1 - Component: snapshotter + Component: state-syncer stateSync: trustPeriod: "9999h0m0s" diff --git a/manifests/samples/pacific-1-shadow-replay.yaml b/manifests/samples/pacific-1-shadow-replay.yaml index 28478d9c..d9531192 100644 --- a/manifests/samples/pacific-1-shadow-replay.yaml +++ b/manifests/samples/pacific-1-shadow-replay.yaml @@ -1,12 +1,14 @@ -# Shadow Replay Job +# Shadow Replay Job — Phase 1 # # Re-executes mainnet blocks through the Giga engine and compares outcomes # against canonical results from an archival source node. -# Output: NDJSON ComparisonRecords to stdout. +# +# Output: NDJSON BlockComparison records to stdout + /sei/shadow/output/ +# Metrics: Prometheus on :9090 # # Prerequisites: # 1. A seid image built from ftr-shadow (with shadow-replay command) -# 2. An archival source node (min-retain-blocks=0) reachable via RPC +# 2. An archival source node reachable via RPC # 3. The seid-node ServiceAccount with IRSA for S3 access --- apiVersion: v1 @@ -27,14 +29,15 @@ metadata: name: pacific-1-shadow-replay namespace: default spec: - backoffLimit: 2 + backoffLimit: 5 + activeDeadlineSeconds: 21600 template: metadata: labels: sei.io/workload: shadow-replay sei.io/chain: pacific-1 spec: - restartPolicy: OnFailure + restartPolicy: Never serviceAccountName: seid-node tolerations: - key: sei.io/workload @@ -50,9 +53,9 @@ spec: values: [sei-node] initContainers: - # 1. Bootstrap the seid home directory. - name: seid-init image: &seidImage ghcr.io/bdchatham/sei-shadow:ftr-shadow + imagePullPolicy: Always command: ["/bin/sh", "-c"] args: - | @@ -61,12 +64,11 @@ spec: else seid init pacific-1 --chain-id pacific-1 --home /sei --overwrite fi - mkdir -p /sei/tmp + mkdir -p /sei/tmp /sei/shadow/output volumeMounts: - name: data mountPath: /sei - # 2. Download snapshot from S3 and extract into data directory. - name: snapshot-restore image: amazon/aws-cli:2.27.31 command: ["/bin/sh", "-c"] @@ -84,21 +86,23 @@ spec: exit 0 fi + yum install -y tar gzip >/dev/null 2>&1 + echo "resolving latest snapshot..." HEIGHT=$(aws s3 cp "s3://${BUCKET}/${PREFIX}latest.txt" - --region ${REGION} | tr -d '[:space:]') KEY="${PREFIX}snapshot_${HEIGHT}_${CHAIN_ID}_${REGION}.tar.gz" - echo "downloading s3://${BUCKET}/${KEY}" + echo "downloading and extracting s3://${BUCKET}/${KEY}" mkdir -p ${DEST} aws s3 cp "s3://${BUCKET}/${KEY}" - --region ${REGION} | tar xzf - -C ${DEST} + echo "${HEIGHT}" > /sei/shadow/snapshot-height.txt echo "snapshot restored at height ${HEIGHT}" touch /sei/.snapshot-done volumeMounts: - name: data mountPath: /sei - # 3. Download the real mainnet genesis.json (overwrites the dummy from seid init). - name: genesis image: amazon/aws-cli:2.27.31 command: ["/bin/sh", "-c"] @@ -112,26 +116,122 @@ spec: - name: data mountPath: /sei + - name: resolve-start-height + image: *seidImage + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - | + if [ -f /sei/shadow/snapshot-height.txt ]; then + HEIGHT=$(cat /sei/shadow/snapshot-height.txt | tr -d '[:space:]') + START=$((HEIGHT + 1)) + echo "${START}" > /sei/shadow/start-height.txt + echo "start height resolved: ${START}" + else + echo "ERROR: snapshot-height.txt not found" + exit 1 + fi + volumeMounts: + - name: data + mountPath: /sei + + - name: bootstrap-state + image: *seidImage + imagePullPolicy: Always + command: ["/bin/sh", "-c"] + args: + - | + if [ -f /sei/.bootstrap-done ]; then + echo "state already bootstrapped, skipping" + exit 0 + fi + + HEIGHT=$(cat /sei/shadow/snapshot-height.txt | tr -d '[:space:]') + echo "bootstrapping state from local snapshot at height ${HEIGHT}..." + + # Configure state-sync to use the local snapshot + sed -i 's/^enable = .*/enable = true/' /sei/config/config.toml + sed -i 's/^use-local-snapshot = .*/use-local-snapshot = true/' /sei/config/config.toml + sed -i "s/^trust-height = .*/trust-height = ${HEIGHT}/" /sei/config/config.toml + sed -i 's|^rpc-servers = .*|rpc-servers = "18.194.110.34:26657,63.180.172.82:26657"|' /sei/config/config.toml + sed -i 's/^backfill-blocks = .*/backfill-blocks = 0/' /sei/config/config.toml + sed -i 's/^trust-period = .*/trust-period = "9999h0m0s"/' /sei/config/config.toml + + # Fetch trust hash for the snapshot height + TRUST_HASH=$(curl -s "http://18.194.110.34:26657/block?height=${HEIGHT}" | \ + jq -r '.block_id.hash // .result.block_id.hash' 2>/dev/null) + if [ -n "${TRUST_HASH}" ] && [ "${TRUST_HASH}" != "null" ]; then + sed -i "s/^trust-hash = .*/trust-hash = \"${TRUST_HASH}\"/" /sei/config/config.toml + echo "trust hash: ${TRUST_HASH}" + else + echo "WARNING: could not fetch trust hash, using empty" + fi + + # Start seid in background + seid start --home /sei & + SEID_PID=$! + + echo "waiting for state-sync to apply snapshot (pid=${SEID_PID})..." + # 7200 iterations * 2s = 4 hours max + for i in $(seq 1 7200); do + sleep 2 + # If seid crashed, exit early + if ! kill -0 ${SEID_PID} 2>/dev/null; then + echo "ERROR: seid process exited unexpectedly" + exit 1 + fi + # Check RPC (only available after state-sync completes) + CURRENT=$(curl -s http://localhost:26657/status 2>/dev/null | \ + jq -r '.sync_info.latest_block_height // .result.sync_info.latest_block_height' 2>/dev/null) + if [ -n "${CURRENT}" ] && [ "${CURRENT}" != "null" ] && [ "${CURRENT}" -ge "${HEIGHT}" ] 2>/dev/null; then + echo "state bootstrapped at height ${CURRENT}" + kill ${SEID_PID} 2>/dev/null + wait ${SEID_PID} 2>/dev/null + sed -i 's/^enable = true/enable = false/' /sei/config/config.toml + touch /sei/.bootstrap-done + exit 0 + fi + # Progress log every 5 minutes + if [ $((i % 150)) -eq 0 ]; then + ELAPSED=$((i * 2 / 60)) + echo "still waiting... ${ELAPSED}m elapsed (current height: ${CURRENT:-unknown})" + fi + done + echo "ERROR: timed out waiting for state-sync (4h)" + kill ${SEID_PID} 2>/dev/null + exit 1 + env: + - name: TMPDIR + value: /sei/tmp + volumeMounts: + - name: data + mountPath: /sei + containers: - name: shadow-replay image: *seidImage - command: ["seid"] + imagePullPolicy: Always + command: ["/bin/sh", "-c"] args: - - "shadow-replay" - - "--home" - - "/sei" - - "--source-rpc" - # TODO: replace with the archive node's RPC endpoint - - "http://pacific-replay.sei-network.svc.cluster.local:26657" - - "--start-height" - # Must equal snapshot height + 1. Current snapshots are ~197875000. - - "197875001" - - "--end-height" - # 50k blocks for initial validation. Set to "0" for continuous replay to tip. - - "197925000" + - | + START=$(cat /sei/shadow/start-height.txt | tr -d '[:space:]') + END=$((START + 49999)) + echo "replaying blocks ${START} to ${END}" + exec seid shadow-replay \ + --home /sei \ + --source-rpc http://18.194.110.34:26657 \ + --start-height "${START}" \ + --end-height "${END}" \ + --checkpoint /sei/shadow/checkpoint.json \ + --output-dir /sei/shadow/output \ + --metrics-addr :9090 \ + --chain-id pacific-1 env: - name: TMPDIR value: /sei/tmp + ports: + - name: metrics + containerPort: 9090 volumeMounts: - name: data mountPath: /sei diff --git a/manifests/samples/pacific-1-snapshotter.yaml b/manifests/samples/pacific-1-snapshotter.yaml index 5af3016c..cb78399e 100644 --- a/manifests/samples/pacific-1-snapshotter.yaml +++ b/manifests/samples/pacific-1-snapshotter.yaml @@ -5,7 +5,7 @@ metadata: namespace: default spec: chainId: pacific-1 - mode: full + mode: archive image: 189176372795.dkr.ecr.us-east-2.amazonaws.com/sei/sei-chain:837ba922db3f5313a474fbe0c7bba4cbec466cdc sidecar: @@ -15,6 +15,17 @@ spec: command: ["seid"] args: ["start", "--home", "/sei"] + config: + overrides: + # Match sei-infra production settings + chain.concurrency_workers: "500" + chain.occ_enabled: "true" + self_remediation.blocks_behind_threshold: "300" + self_remediation.blocks_behind_check_interval_seconds: "60" + self_remediation.restart_cooldown_seconds: "300" + self_remediation.p2p_no_peers_restart_window_seconds: "120" + self_remediation.statesync_no_peers_restart_window_seconds: "90" + genesis: chainId: pacific-1 s3: @@ -27,7 +38,7 @@ spec: region: eu-central-1 tags: ChainIdentifier: pacific-1 - Component: snapshotter + Component: state-syncer stateSync: trustPeriod: "9999h0m0s"