From cd6093c6c455ae6e4f92a645052ede59c498f6ed Mon Sep 17 00:00:00 2001
From: Khaliq
Date: Mon, 11 May 2026 13:51:14 +0200
Subject: [PATCH] fix(loader): drain broker stdout on spawn (was wired to 'pause' which never fires)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The NODE_OPTIONS register script monkeypatches child_process.spawn so
that agent-relay-broker children have their stdout drained — preventing
the broker from blocking on write() once the OS pipe buffer fills.

The previous wiring attached a 'data' listener inside a callback that
only ran when child.stdout emitted 'pause'. Node Readable streams never
emit 'pause' on internal buffer fill — that event only fires when
something explicitly calls .pause() (which nothing in this code path
ever does), so the stream stayed in paused mode, libuv stopped draining
the kernel pipe at high-water mark, and a chatty broker would block in
write() once ~64KB of stdout queued up.

Symptom: overnight proactive-runtime runs (Ricky-driven, M1 fans out to
9 PTY workers) froze within seconds of fanout with every worker log
stuck at the same mtime, broker process parked in write() or
_pthread_cond_wait, M1's step.run awaiting a never-arriving drain
signal. Reproduced twice (~14h apart) with diagnostic bundles capturing
the same shape.

Changes
- Attach a `data` listener and call `resume()` synchronously at spawn
  time for both `init` and `pty` broker invocations. This matches what
  SDK 6.0.15's `drainBrokerStdoutAfterStartup` does for direct SDK
  consumers.
- Expand the argv guard from `argv[0]==="init"` to also include
  `"pty"`, so per-worker PTY brokers (M1's lead + impl-* fanout) are
  protected, not just the channel-multiplexer init broker.
- Update the explanatory comment block above `registerSource` to
  capture the new semantics and the prior bug.

Verification - npm run typecheck — clean - npm test — 1077 / 1077 pass, including the existing "drains broker stdout after SDK startup so event floods cannot wedge the workflow node" regression at entrypoint.test.ts:3122. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/local/entrypoint.ts | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/local/entrypoint.ts b/src/local/entrypoint.ts index 5075af08..7d693eb7 100644 --- a/src/local/entrypoint.ts +++ b/src/local/entrypoint.ts @@ -979,9 +979,15 @@ async function workflowSdkLoaderNodeOption(cwd: string): Promise{', - 'child.stdout?.off("pause",drainBrokerStdout);', - 'child.stdout?.on("data",()=>{});', - 'child.stdout?.resume();', - '};', - 'child.stdout.on("pause",drainBrokerStdout);', + 'if((argv[0]==="init"||argv[0]==="pty")&&/(?:^|[/\\\\])agent-relay-broker(?:\\.exe)?$/u.test(executable)&&child.stdout){', + // Attach the data listener immediately so the stream enters flowing mode + // and libuv keeps draining the kernel pipe. The previous `on("pause", ...)` + // hook never fired — Node `Readable` only emits `'pause'` when something + // explicitly calls `.pause()`, which nothing does in this code path, so the + // stream stayed in paused mode and the broker eventually blocked in + // `write()` when the OS pipe buffer filled. + 'child.stdout.on("data",()=>{});', + 'child.stdout.resume();', '}', 'return child;', '};',