Skip to content

Commit f3049f6

Browse files
committed
fix(cli): reject execute() immediately when child process is dead
When a child process crashes and a retry is attempted on the same TaskRunProcess, execute() would hang forever because the IPC send was silently skipped and the attempt promise could never resolve. This caused runner pods to stay up indefinitely with no heartbeats.
1 parent b221719 commit f3049f6

File tree

3 files changed

+139
-0
lines changed

3 files changed

+139
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trigger.dev": patch
3+
---
4+
5+
Fix runner getting stuck indefinitely when `execute()` is called on a dead child process.
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import { TaskRunProcess, type TaskRunProcessOptions } from "./taskRunProcess.js";
2+
import { describe, it, expect, vi } from "vitest";
3+
import { UnexpectedExitError } from "@trigger.dev/core/v3/errors";
4+
import type {
5+
TaskRunExecution,
6+
TaskRunExecutionPayload,
7+
WorkerManifest,
8+
ServerBackgroundWorker,
9+
MachinePresetResources,
10+
} from "@trigger.dev/core/v3";
11+
12+
/**
 * Builds a minimal `TaskRunProcessOptions` object for tests.
 *
 * The worker manifest and server worker are stub literals force-cast to their
 * types, so only the fields listed here are present at runtime.
 *
 * @param overrides - Fields that replace the defaults below; spread last, so
 *   any key supplied here wins.
 * @returns The default options merged with `overrides`.
 */
function createTaskRunProcessOptions(
13+
overrides: Partial<TaskRunProcessOptions> = {}
14+
): TaskRunProcessOptions {
15+
return {
16+
workerManifest: {
17+
runtime: "node",
18+
workerEntryPoint: "/dev/null",
19+
configEntryPoint: "/dev/null",
20+
otelImportHook: {},
21+
} as unknown as WorkerManifest,
22+
serverWorker: {} as unknown as ServerBackgroundWorker,
23+
env: {},
24+
machineResources: { cpu: 1, memory: 1 } as MachinePresetResources,
25+
...overrides,
26+
};
27+
}
28+
29+
/**
 * Builds a stub `TaskRunExecution` for tests.
 *
 * Only `run.id` and `attempt.number` vary; every other field is a fixed
 * placeholder value. The literal is cast through `unknown`, so it is not
 * type-checked against the full `TaskRunExecution` shape.
 *
 * @param runId - Value stored in `run.id`.
 * @param attemptNumber - Value stored in `attempt.number`.
 * @returns The stub execution object.
 */
function createExecution(runId: string, attemptNumber: number): TaskRunExecution {
30+
return {
31+
run: {
32+
id: runId,
33+
payload: "{}",
34+
payloadType: "application/json",
35+
tags: [],
36+
isTest: false,
37+
createdAt: new Date(),
38+
startedAt: new Date(),
39+
maxAttempts: 3,
40+
version: "1",
41+
durationMs: 0,
42+
costInCents: 0,
43+
baseCostInCents: 0,
44+
},
45+
attempt: {
46+
number: attemptNumber,
47+
startedAt: new Date(),
48+
id: "deprecated",
49+
backgroundWorkerId: "deprecated",
50+
backgroundWorkerTaskId: "deprecated",
51+
status: "deprecated" as any,
52+
},
53+
task: { id: "test-task", filePath: "test.ts" },
54+
queue: { id: "queue-1", name: "test-queue" },
55+
environment: { id: "env-1", slug: "test", type: "DEVELOPMENT" },
56+
organization: { id: "org-1", slug: "test-org", name: "Test Org" },
57+
project: { id: "proj-1", ref: "proj_test", slug: "test", name: "Test" },
58+
machine: { name: "small-1x", cpu: 0.5, memory: 0.5, centsPerMs: 0 },
59+
} as unknown as TaskRunExecution;
60+
}
61+
62+
// Regression test: execute() must settle (by rejecting) when the child process
// is no longer connected, instead of waiting on a promise that never settles.
describe("TaskRunProcess", () => {
63+
describe("execute() on a dead child process", () => {
64+
it("should reject when child process has already exited and IPC send is skipped", async () => {
65+
const proc = new TaskRunProcess(createTaskRunProcessOptions());
66+
67+
// Simulate a child process that has exited: _child exists but is not connected
68+
const fakeChild = {
69+
connected: false,
70+
killed: false,
71+
pid: 12345,
72+
kill: vi.fn(),
73+
on: vi.fn(),
74+
stdout: { on: vi.fn() },
75+
stderr: { on: vi.fn() },
76+
};
77+
78+
// Set internal state to mimic a process whose child has crashed.
// The `as any` casts reach into private fields because no child is actually spawned here.
79+
(proc as any)._child = fakeChild;
80+
(proc as any)._childPid = 12345;
81+
(proc as any)._isBeingKilled = false;
82+
83+
const execution = createExecution("run-1", 2);
84+
85+
// This should NOT hang forever - it should reject promptly.
86+
//
87+
// Regression guard: before the fix, execute() created a promise, skipped the IPC send because
88+
// _child.connected is false, then awaited the promise which would never
89+
// resolve because the child is dead and #handleExit already ran.
90+
//
91+
// The Promise.race with a timeout detects the hang. Both outcomes of execute() are mapped
// to a tagged object so a rejection is observed as a value rather than thrown out of the race.
// NOTE(review): the second execute() argument (`true`) is presumably isWarmStart — confirm
// against the execute() signature.
92+
const result = await Promise.race([
93+
proc
94+
.execute(
95+
{
96+
payload: { execution, traceContext: {}, metrics: [] },
97+
messageId: "run_run-1",
98+
env: {},
99+
},
100+
true
101+
)
102+
.then(
103+
(v) => ({ type: "resolved" as const, value: v }),
104+
(e) => ({ type: "rejected" as const, error: e })
105+
),
106+
new Promise<{ type: "hung" }>((resolve) =>
107+
setTimeout(() => resolve({ type: "hung" as const }), 2000)
108+
),
109+
]);
110+
111+
// A "hung" result means the 2000ms timer won the race, i.e. execute() never settled
112+
expect(result.type).not.toBe("hung");
113+
expect(result.type).toBe("rejected");
114+
115+
// The guard narrows the union type so `result.error` is accessible below.
// NOTE(review): assumes UnexpectedExitError exposes its message argument as `stderr` — confirm.
if (result.type === "rejected") {
116+
expect(result.error).toBeInstanceOf(UnexpectedExitError);
117+
expect(result.error.stderr).toContain("not connected");
118+
}
119+
});
120+
});
121+
});

packages/cli-v3/src/executions/taskRunProcess.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,19 @@ export class TaskRunProcess {
297297
env: params.env,
298298
isWarmStart: isWarmStart ?? this.options.isWarmStart,
299299
});
300+
} else {
301+
// Child process is dead or disconnected — the IPC send was skipped so the attempt
302+
// promise would hang forever. Reject it immediately to let the caller handle it.
303+
this._attemptStatuses.set(key, "REJECTED");
304+
305+
// @ts-expect-error - rejecter is assigned in the promise constructor above
306+
rejecter(
307+
new UnexpectedExitError(
308+
-1,
309+
null,
310+
"Child process is not connected, cannot execute task run"
311+
)
312+
);
300313
}
301314

302315
const result = await promise;

0 commit comments

Comments
 (0)