-
Notifications
You must be signed in to change notification settings - Fork 425
fix(sdk-driver): resolve idle-timeout as success when output collected and all tools complete #40419
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(sdk-driver): resolve idle-timeout as success when output collected and all tools complete #40419
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -119,6 +119,149 @@ describe("copilot_sdk_driver.cjs", () => { | |
| expect(stop).toHaveBeenCalledTimes(1); | ||
| }); | ||
|
|
||
| it("resolves exitCode 0 on SDK idle-timeout when output collected and all tool calls complete", async () => { | ||
| // Regression test: when sendAndWait throws an idle-timeout error but the agent | ||
| // produced output and all tool calls completed, the driver must return exitCode 0. | ||
| // This covers the case where the SDK drops the session.idle signal on long runs. | ||
| const disconnect = vi.fn().mockResolvedValue(undefined); | ||
| const stop = vi.fn().mockResolvedValue(undefined); | ||
| let onEvent = () => {}; | ||
| const session = { | ||
| sessionId: "session-idle-timeout-success", | ||
| on: handler => { | ||
| onEvent = handler; | ||
| }, | ||
| sendAndWait: vi.fn().mockImplementation(async () => { | ||
| // Simulate tool execution events before the idle-timeout | ||
| onEvent({ | ||
| type: "tool.execution_start", | ||
| ephemeral: false, | ||
| timestamp: new Date().toISOString(), | ||
| data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-1" }, | ||
| }); | ||
| onEvent({ | ||
| type: "assistant.message", | ||
| ephemeral: false, | ||
| timestamp: new Date().toISOString(), | ||
| data: { content: "I found the answer" }, | ||
| }); | ||
| onEvent({ | ||
| type: "tool.execution_complete", | ||
| ephemeral: false, | ||
| timestamp: new Date().toISOString(), | ||
| data: { toolCallId: "call-1", success: true }, | ||
| }); | ||
| throw new Error("Timeout after 870000ms waiting for session.idle"); | ||
| }), | ||
| disconnect, | ||
| }; | ||
| class FakeCopilotClient { | ||
| start = vi.fn().mockResolvedValue(undefined); | ||
| createSession = vi.fn().mockResolvedValue(session); | ||
| stop = stop; | ||
| } | ||
|
|
||
| const result = await runWithCopilotSDK({ | ||
| sdkUri: "http://127.0.0.1:3002", | ||
| prompt: "test prompt", | ||
| logger: () => {}, | ||
| sdkModule: { | ||
| CopilotClient: FakeCopilotClient, | ||
| RuntimeConnection: { forUri: vi.fn(() => ({})) }, | ||
| approveAll: () => "allow", | ||
| }, | ||
| }); | ||
|
|
||
| expect(result.exitCode).toBe(0); | ||
| expect(result.hasOutput).toBe(true); | ||
| expect(result.output).toContain("I found the answer"); | ||
| expect(disconnect).toHaveBeenCalledTimes(1); | ||
| expect(stop).toHaveBeenCalledTimes(1); | ||
| }); | ||
|
|
||
| it("returns exitCode 1 on SDK idle-timeout when tool calls are still pending", async () => { | ||
| // When the idle-timeout fires with in-flight (unmatched) tool calls, the agent did | ||
| // not finish cleanly — the driver must NOT treat it as success. | ||
| const disconnect = vi.fn().mockResolvedValue(undefined); | ||
| const stop = vi.fn().mockResolvedValue(undefined); | ||
| let onEvent = () => {}; | ||
| const session = { | ||
| sessionId: "session-idle-timeout-pending-tools", | ||
| on: handler => { | ||
| onEvent = handler; | ||
| }, | ||
| sendAndWait: vi.fn().mockImplementation(async () => { | ||
| onEvent({ | ||
| type: "tool.execution_start", | ||
| ephemeral: false, | ||
| timestamp: new Date().toISOString(), | ||
| data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-pending" }, | ||
| }); | ||
| onEvent({ | ||
| type: "assistant.message", | ||
| ephemeral: false, | ||
| timestamp: new Date().toISOString(), | ||
| data: { content: "working on it" }, | ||
| }); | ||
| // tool.execution_complete is never emitted — tool call remains pending | ||
| throw new Error("Timeout after 870000ms waiting for session.idle"); | ||
| }), | ||
| disconnect, | ||
| }; | ||
| class FakeCopilotClient { | ||
| start = vi.fn().mockResolvedValue(undefined); | ||
| createSession = vi.fn().mockResolvedValue(session); | ||
| stop = stop; | ||
| } | ||
|
|
||
| const result = await runWithCopilotSDK({ | ||
| sdkUri: "http://127.0.0.1:3002", | ||
| prompt: "test prompt", | ||
| logger: () => {}, | ||
| sdkModule: { | ||
| CopilotClient: FakeCopilotClient, | ||
| RuntimeConnection: { forUri: vi.fn(() => ({})) }, | ||
| approveAll: () => "allow", | ||
| }, | ||
| }); | ||
|
|
||
| expect(result.exitCode).toBe(1); | ||
| expect(result.hasOutput).toBe(true); | ||
| expect(result.output).toContain("working on it"); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tests 2 and 3 don't assert that 💡 Suggested additionsFor the pending-tools test (after line 230): expect(disconnect).toHaveBeenCalledTimes(1);
expect(stop).toHaveBeenCalledTimes(1);For the no-output test (after line 262), the same assertions apply, and it's also worth adding: expect(result.output).toBe('Timeout after 870000ms waiting for session.idle');to lock in the new behaviour where Without the cleanup assertions, a regression that skips |
||
| }); | ||
|
|
||
| it("returns exitCode 1 on SDK idle-timeout with no output collected", async () => { | ||
| // When the idle-timeout fires before the agent produces any output, the driver | ||
| // must return exitCode 1 — there is nothing useful to surface. | ||
| const disconnect = vi.fn().mockResolvedValue(undefined); | ||
| const stop = vi.fn().mockResolvedValue(undefined); | ||
| const session = { | ||
| sessionId: "session-idle-timeout-no-output", | ||
| on: () => {}, | ||
| sendAndWait: vi.fn().mockRejectedValue(new Error("Timeout after 870000ms waiting for session.idle")), | ||
| disconnect, | ||
| }; | ||
| class FakeCopilotClient { | ||
| start = vi.fn().mockResolvedValue(undefined); | ||
| createSession = vi.fn().mockResolvedValue(session); | ||
| stop = stop; | ||
| } | ||
|
|
||
| const result = await runWithCopilotSDK({ | ||
| sdkUri: "http://127.0.0.1:3002", | ||
| prompt: "test prompt", | ||
| logger: () => {}, | ||
| sdkModule: { | ||
| CopilotClient: FakeCopilotClient, | ||
| RuntimeConnection: { forUri: vi.fn(() => ({})) }, | ||
| approveAll: () => "allow", | ||
| }, | ||
| }); | ||
|
|
||
| expect(result.exitCode).toBe(1); | ||
| expect(result.hasOutput).toBe(false); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/tdd] This test verifies 💡 Suggested addition// Add after the existing expects:
expect(result.output).toContain("waiting for session.idle");
// confirms the error string is surfaced to the harness for diagnostics |
||
| }); | ||
|
|
||
| it("passes custom provider and model through to SDK createSession", async () => { | ||
| const disconnect = vi.fn().mockResolvedValue(undefined); | ||
| const stop = vi.fn().mockResolvedValue(undefined); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,12 @@ const { buildCopilotSDKPermissionHandler, getEnvPositiveIntOrDefault, parseMaxTo | |
| // Override via the COPILOT_SDK_SEND_TIMEOUT_MS environment variable. | ||
| const SDK_SEND_TIMEOUT_MS_DEFAULT = 10 * 60 * 1000; | ||
|
|
||
| // Pattern matching the SDK idle-timeout error emitted when sendAndWait reaches its | ||
| // deadline waiting for the session.idle event. This matches the message format | ||
| // "Timeout after <N>ms waiting for session.idle" produced by the Copilot SDK. | ||
| // Keep in sync with SDK_SESSION_IDLE_TIMEOUT_PATTERN in copilot_harness.cjs. | ||
| const SDK_IDLE_TIMEOUT_PATTERN = /Timeout after \d+ms waiting for session\.idle/; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/diagnose] 💡 Suggestion: import from here instead of duplicatingSince // copilot_harness.cjs
const { SDK_IDLE_TIMEOUT_PATTERN } = require("./copilot_sdk_session.cjs");
// ... replace SDK_SESSION_IDLE_TIMEOUT_PATTERN usages with SDK_IDLE_TIMEOUT_PATTERNThis eliminates the "keep in sync" maintenance burden and makes the single source of truth explicit.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exported pattern is not consumed by its stated peer — 💡 DetailsThe comment says "Keep in sync with // copilot_harness.cjs line 120 — still exists, still independent
const SDK_SESSION_IDLE_TIMEOUT_PATTERN = /Timeout after \d+ms waiting for session\.idle/;The export is semantically incomplete. There is also a diagnostic side-effect: for the idle-timeout + pending-tools + output path, this PR correctly returns Suggested fix: import from this module in const { SDK_IDLE_TIMEOUT_PATTERN } = require('./copilot_sdk_session.cjs');
// retain old name via alias if needed:
const SDK_SESSION_IDLE_TIMEOUT_PATTERN = SDK_IDLE_TIMEOUT_PATTERN; |
||
|
|
||
| /** | ||
| * Extract the prompt text from a resolved args array. | ||
| * Looks for the first occurrence of "-p <value>" or "--prompt <value>". | ||
|
|
@@ -141,6 +147,13 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c | |
| let toolDenialCount = 0; | ||
| let catastrophicToolDenialsError = null; | ||
| let catastrophicToolDenialsTriggered = false; | ||
| /** | ||
| * Map from toolCallId → {toolName, mcpServerName} for enriching tool.execution_complete | ||
| * events and for tracking in-flight tool calls when the idle-timeout fires. | ||
| * Declared at function scope so the catch block can check pendingToolCalls.size. | ||
| * @type {Map<string, {toolName: string, mcpServerName: string}>} | ||
| */ | ||
| const pendingToolCalls = new Map(); | ||
|
|
||
| /** | ||
| * Best-effort write of a driver-level event to events.jsonl and stderr. | ||
|
|
@@ -214,13 +227,6 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c | |
| const stream = eventsStream; | ||
| log(`serialising SDK events to ${eventsPath}`); | ||
|
|
||
| /** | ||
| * Map from toolCallId → {toolName, mcpServerName} so that tool.execution_complete | ||
| * events (which carry no mcpServerName) can be enriched from the matching start event. | ||
| * @type {Map<string, {toolName: string, mcpServerName: string}>} | ||
| */ | ||
| const pendingToolCalls = new Map(); | ||
|
|
||
| /** | ||
| * Write one JSONL entry to the events file and stderr. | ||
| * Uses the event's own ISO-8601 timestamp when available. | ||
|
|
@@ -316,10 +322,24 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c | |
| const durationMs = Date.now() - startTime; | ||
| const failure = catastrophicToolDenialsError ?? (err instanceof Error ? err : new Error(String(err))); | ||
| log(`error: ${failure.message}`); | ||
|
|
||
| // When sendAndWait times out waiting for session.idle but the agent produced | ||
| // output and all tracked tool calls have already completed, the session work is | ||
| // done — the SDK simply failed to emit the idle signal. Treat it as a successful | ||
| // run so the harness does not classify it as a failure or waste retry attempts. | ||
| const isIdleTimeout = !catastrophicToolDenialsError && SDK_IDLE_TIMEOUT_PATTERN.test(failure.message); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [/tdd] The 💡 Suggested test caseit("returns exitCode 1 when catastrophic tool denials fires even if timeout message matches", async () => {
// Simulate maxToolDenials being hit; the driver should NOT treat this as
// an idle-timeout success even if the error text happens to match.
const session = {
sessionId: "session-denial-not-idle",
on: handler => { onEvent = handler; },
sendAndWait: vi.fn().mockImplementation(async () => {
// fire enough tool-denial events to breach the threshold, then stall
throw new Error("Timeout after 870000ms waiting for session.idle");
}),
disconnect: vi.fn().mockResolvedValue(undefined),
};
// ... configure maxToolDenials:1 and fire denial events ...
expect(result.exitCode).toBe(1);
});Without this test, a future refactor that accidentally drops the |
||
| if (isIdleTimeout && hasOutput && pendingToolCalls.size === 0) { | ||
| log(`warning: SDK idle-timeout with collected output and no pending tool calls — treating as completed`); | ||
| log(`session completed: hasOutput=${hasOutput} durationMs=${durationMs}`); | ||
| return { exitCode: 0, output, hasOutput, durationMs }; | ||
| } | ||
|
|
||
| // Preserve any output collected before the error so the harness can use it | ||
| // for retry decisions and diagnostics. | ||
| return { | ||
| exitCode: 1, | ||
| output: failure.message, | ||
| hasOutput: false, | ||
| output: hasOutput ? output : failure.message, | ||
| hasOutput, | ||
| durationMs, | ||
| }; | ||
| } finally { | ||
|
|
@@ -345,4 +365,4 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c | |
| } | ||
| } | ||
|
|
||
| module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, extractPromptFromArgs, runWithCopilotSDK }; | ||
| module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, SDK_IDLE_TIMEOUT_PATTERN, extractPromptFromArgs, runWithCopilotSDK }; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[/tdd] The pending-tools and no-output failure tests don't assert that cleanup (
disconnect,stop) was called, unlike the success-case test above. Thefinallyblock should run on all exit paths — adding these assertions prevents a future regression where error handling disrupts cleanup.💡 Suggested additions (also applies to the no-output test)