From 2193519dd380aaeb4aa58ec68073c49626666a7d Mon Sep 17 00:00:00 2001 From: Pratapa Lakshmi Date: Mon, 8 Jun 2026 10:12:26 +0530 Subject: [PATCH] feat: cap cluster worker count via IFRAMELY_WORKERS_COUNT graceful-cluster defaults to os.cpus().length workers, which is the HOST node's vCPU count and ignores the container's CPU limit. On large nodes (e.g. 32 vCPU) this forks ~32 workers, each independently loading ~1886 domains + Redis + Secrets Manager, exhausting the pod memory limit -> OOMKilled/CrashLoopBackOff. Pass workersCount to GracefulCluster.start from CONFIG.CLUSTER_WORKERS_COUNT, settable via IFRAMELY_WORKERS_COUNT (alias IFRAMELY_WORKERS, read only when IFRAMELY_WORKERS_COUNT is unset or empty). When unset, or when the value does not parse to a positive integer, behaviour is unchanged (falls back to os.cpus().length). Co-Authored-By: Claude Opus 4.8 (1M context) --- cluster.js | 5 +++++ config.loader.js | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/cluster.js b/cluster.js index cec7f50b5..92fb13c22 100644 --- a/cluster.js +++ b/cluster.js @@ -5,6 +5,11 @@ process.title = 'iframely-cluster'; GracefulCluster.start({ log: sysUtils.log, + // When undefined, graceful-cluster falls back to os.cpus().length, which is + // the HOST node's vCPU count and ignores the container's CPU limit — forking + // far too many workers on large nodes (each loads ~1886 domains + Redis + + // Secrets Manager). Cap it via IFRAMELY_WORKERS_COUNT (see config.loader.js). 
+ workersCount: CONFIG.CLUSTER_WORKERS_COUNT, shutdownTimeout: CONFIG.SHUTDOWN_TIMEOUT, disableGraceful: CONFIG.DEBUG, restartOnTimeout: CONFIG.CLUSTER_WORKER_RESTART_ON_PERIOD, diff --git a/config.loader.js b/config.loader.js index 02569d6d7..5b2f12a92 100644 --- a/config.loader.js +++ b/config.loader.js @@ -33,6 +33,8 @@ globalConfig = globalConfig && globalConfig.default; // REDIS_PORT_KEY JSON key for port (default: REDIS_PORT) // // Cluster worker tuning +// IFRAMELY_WORKERS_COUNT number of cluster workers (default: os.cpus().length) +// alias: IFRAMELY_WORKERS // IFRAMELY_WORKER_MAX_MEMORY_MB per-worker memory before restart, MB (default: 120) // IFRAMELY_WORKER_RESTART_PERIOD_SEC periodic worker restart interval, seconds (default: 28800) // --------------------------------------------------------------------------- @@ -113,4 +115,15 @@ if (process.env.IFRAMELY_WORKER_RESTART_PERIOD_SEC) { } } +// Number of cluster workers. Without this, graceful-cluster uses os.cpus().length +// (the HOST node's vCPU count), which on large nodes forks far more workers than +// the container's CPU/memory can sustain -> OOMKilled. IFRAMELY_WORKERS is +// accepted as an alias. +if (process.env.IFRAMELY_WORKERS_COUNT || process.env.IFRAMELY_WORKERS) { + var workersCount = parseInt(process.env.IFRAMELY_WORKERS_COUNT || process.env.IFRAMELY_WORKERS, 10); + if (!isNaN(workersCount) && workersCount > 0) { + envOverrides.CLUSTER_WORKERS_COUNT = workersCount; + } +} + export default {...iframelyConfig, ...globalConfig, ...envOverrides};