Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions actions/setup/js/copilot_sdk_driver.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,149 @@ describe("copilot_sdk_driver.cjs", () => {
expect(stop).toHaveBeenCalledTimes(1);
});

it("resolves exitCode 0 on SDK idle-timeout when output collected and all tool calls complete", async () => {
// Regression test: when sendAndWait throws an idle-timeout error but the agent
// produced output and all tool calls completed, the driver must return exitCode 0.
// This covers the case where the SDK drops the session.idle signal on long runs.
const disconnect = vi.fn().mockResolvedValue(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
let onEvent = () => {};
const session = {
sessionId: "session-idle-timeout-success",
on: handler => {
onEvent = handler;
},
sendAndWait: vi.fn().mockImplementation(async () => {
// Simulate tool execution events before the idle-timeout
onEvent({
type: "tool.execution_start",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-1" },
});
onEvent({
type: "assistant.message",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { content: "I found the answer" },
});
onEvent({
type: "tool.execution_complete",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { toolCallId: "call-1", success: true },
});
throw new Error("Timeout after 870000ms waiting for session.idle");
}),
disconnect,
};
class FakeCopilotClient {
start = vi.fn().mockResolvedValue(undefined);
createSession = vi.fn().mockResolvedValue(session);
stop = stop;
}

const result = await runWithCopilotSDK({
sdkUri: "http://127.0.0.1:3002",
prompt: "test prompt",
logger: () => {},
sdkModule: {
CopilotClient: FakeCopilotClient,
RuntimeConnection: { forUri: vi.fn(() => ({})) },
approveAll: () => "allow",
},
});

expect(result.exitCode).toBe(0);
expect(result.hasOutput).toBe(true);
expect(result.output).toContain("I found the answer");
expect(disconnect).toHaveBeenCalledTimes(1);
expect(stop).toHaveBeenCalledTimes(1);
});

it("returns exitCode 1 on SDK idle-timeout when tool calls are still pending", async () => {
// When the idle-timeout fires with in-flight (unmatched) tool calls, the agent did
// not finish cleanly — the driver must NOT treat it as success.
const disconnect = vi.fn().mockResolvedValue(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
let onEvent = () => {};
const session = {
sessionId: "session-idle-timeout-pending-tools",
on: handler => {
onEvent = handler;
},
sendAndWait: vi.fn().mockImplementation(async () => {
onEvent({
type: "tool.execution_start",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { toolName: "bash", mcpServerName: "terminal", toolCallId: "call-pending" },
});
onEvent({
type: "assistant.message",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { content: "working on it" },
});
// tool.execution_complete is never emitted — tool call remains pending
throw new Error("Timeout after 870000ms waiting for session.idle");
}),
disconnect,
};
class FakeCopilotClient {
start = vi.fn().mockResolvedValue(undefined);
createSession = vi.fn().mockResolvedValue(session);
stop = stop;
}

const result = await runWithCopilotSDK({
sdkUri: "http://127.0.0.1:3002",
prompt: "test prompt",
logger: () => {},
sdkModule: {
CopilotClient: FakeCopilotClient,
RuntimeConnection: { forUri: vi.fn(() => ({})) },
approveAll: () => "allow",
},
});

expect(result.exitCode).toBe(1);
expect(result.hasOutput).toBe(true);
expect(result.output).toContain("working on it");

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The pending-tools and no-output failure tests don't assert that cleanup (disconnect, stop) was called, unlike the success-case test above. The finally block should run on all exit paths — adding these assertions prevents a future regression where error handling disrupts cleanup.

💡 Suggested additions (also applies to the no-output test)
// Add to both failure tests:
expect(disconnect).toHaveBeenCalledTimes(1);
expect(stop).toHaveBeenCalledTimes(1);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests 2 and 3 don't assert that disconnect and stop are called, leaving the error-path cleanup unverified. Test 1 (success path) asserts both; the failure paths should too.

💡 Suggested additions

For the pending-tools test (after line 230):

expect(disconnect).toHaveBeenCalledTimes(1);
expect(stop).toHaveBeenCalledTimes(1);

For the no-output test (after line 262), the same assertions apply, and it's also worth adding:

expect(result.output).toBe('Timeout after 870000ms waiting for session.idle');

to lock in the new behaviour where output = failure.message when hasOutput is false (the fix this PR introduces — currently this invariant has no test coverage).

Without the cleanup assertions, a regression that skips disconnect/stop in the catch path would not be caught by these two tests.

});

it("returns exitCode 1 on SDK idle-timeout with no output collected", async () => {
// When the idle-timeout fires before the agent produces any output, the driver
// must return exitCode 1 — there is nothing useful to surface.
const disconnect = vi.fn().mockResolvedValue(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
const session = {
sessionId: "session-idle-timeout-no-output",
on: () => {},
sendAndWait: vi.fn().mockRejectedValue(new Error("Timeout after 870000ms waiting for session.idle")),
disconnect,
};
class FakeCopilotClient {
start = vi.fn().mockResolvedValue(undefined);
createSession = vi.fn().mockResolvedValue(session);
stop = stop;
}

const result = await runWithCopilotSDK({
sdkUri: "http://127.0.0.1:3002",
prompt: "test prompt",
logger: () => {},
sdkModule: {
CopilotClient: FakeCopilotClient,
RuntimeConnection: { forUri: vi.fn(() => ({})) },
approveAll: () => "allow",
},
});

expect(result.exitCode).toBe(1);
expect(result.hasOutput).toBe(false);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] This test verifies exitCode: 1 and hasOutput: false but doesn't assert what output contains. When hasOutput is false, the implementation returns failure.message — so the harness receives the timeout error string, not an empty value. Asserting this fully specifies the behaviour and prevents a silent regression if the fallback is changed.

💡 Suggested addition
// Add after the existing expects:
expect(result.output).toContain("waiting for session.idle");
// confirms the error string is surfaced to the harness for diagnostics

});

it("passes custom provider and model through to SDK createSession", async () => {
const disconnect = vi.fn().mockResolvedValue(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
Expand Down
40 changes: 30 additions & 10 deletions actions/setup/js/copilot_sdk_session.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ const { buildCopilotSDKPermissionHandler, getEnvPositiveIntOrDefault, parseMaxTo
// Override via the COPILOT_SDK_SEND_TIMEOUT_MS environment variable.
const SDK_SEND_TIMEOUT_MS_DEFAULT = 10 * 60 * 1000;

// Pattern matching the SDK idle-timeout error emitted when sendAndWait reaches its
// deadline waiting for the session.idle event. This matches the message format
// "Timeout after <N>ms waiting for session.idle" produced by the Copilot SDK.
// Keep in sync with SDK_SESSION_IDLE_TIMEOUT_PATTERN in copilot_harness.cjs.
const SDK_IDLE_TIMEOUT_PATTERN = /Timeout after \d+ms waiting for session\.idle/;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] SDK_IDLE_TIMEOUT_PATTERN is identical to SDK_SESSION_IDLE_TIMEOUT_PATTERN already defined in copilot_harness.cjs (line 120). The "Keep in sync" comment flags a live drift risk — the two patterns can diverge silently.

💡 Suggestion: import from here instead of duplicating

Since SDK_IDLE_TIMEOUT_PATTERN is now exported, copilot_harness.cjs could import it directly:

// copilot_harness.cjs
const { SDK_IDLE_TIMEOUT_PATTERN } = require("./copilot_sdk_session.cjs");
// ... replace SDK_SESSION_IDLE_TIMEOUT_PATTERN usages with SDK_IDLE_TIMEOUT_PATTERN

This eliminates the "keep in sync" maintenance burden and makes the single source of truth explicit.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exported pattern is not consumed by its stated peer — copilot_harness.cjs still carries its own copy. Two independent regexes can silently drift.

💡 Details

The comment says "Keep in sync with SDK_SESSION_IDLE_TIMEOUT_PATTERN in copilot_harness.cjs", implying the export is meant to be a single source of truth. But copilot_harness.cjs defines its own copy at line 120 and never imports from this module:

// copilot_harness.cjs line 120 — still exists, still independent
const SDK_SESSION_IDLE_TIMEOUT_PATTERN = /Timeout after \d+ms waiting for session\.idle/;

The export is semantically incomplete.

There is also a diagnostic side-effect: for the idle-timeout + pending-tools + output path, this PR correctly returns output = <assistant content> rather than the error string. That means isSDKSessionIdleTimeoutError(result.output) in the harness will now return false for those cases, so classifyCopilotFailure logs partial_execution instead of sdk_session_idle_timeout. Retry logic is unaffected (keyed on hasOutput), but failure traces become less specific.

Suggested fix: import from this module in copilot_harness.cjs:

const { SDK_IDLE_TIMEOUT_PATTERN } = require('./copilot_sdk_session.cjs');
// retain old name via alias if needed:
const SDK_SESSION_IDLE_TIMEOUT_PATTERN = SDK_IDLE_TIMEOUT_PATTERN;


/**
* Extract the prompt text from a resolved args array.
* Looks for the first occurrence of "-p <value>" or "--prompt <value>".
Expand Down Expand Up @@ -141,6 +147,13 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c
let toolDenialCount = 0;
let catastrophicToolDenialsError = null;
let catastrophicToolDenialsTriggered = false;
/**
* Map from toolCallId → {toolName, mcpServerName} for enriching tool.execution_complete
* events and for tracking in-flight tool calls when the idle-timeout fires.
* Declared at function scope so the catch block can check pendingToolCalls.size.
* @type {Map<string, {toolName: string, mcpServerName: string}>}
*/
const pendingToolCalls = new Map();

/**
* Best-effort write of a driver-level event to events.jsonl and stderr.
Expand Down Expand Up @@ -214,13 +227,6 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c
const stream = eventsStream;
log(`serialising SDK events to ${eventsPath}`);

/**
* Map from toolCallId → {toolName, mcpServerName} so that tool.execution_complete
* events (which carry no mcpServerName) can be enriched from the matching start event.
* @type {Map<string, {toolName: string, mcpServerName: string}>}
*/
const pendingToolCalls = new Map();

/**
* Write one JSONL entry to the events file and stderr.
* Uses the event's own ISO-8601 timestamp when available.
Expand Down Expand Up @@ -316,10 +322,24 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c
const durationMs = Date.now() - startTime;
const failure = catastrophicToolDenialsError ?? (err instanceof Error ? err : new Error(String(err)));
log(`error: ${failure.message}`);

// When sendAndWait times out waiting for session.idle but the agent produced
// output and all tracked tool calls have already completed, the session work is
// done — the SDK simply failed to emit the idle signal. Treat it as a successful
// run so the harness does not classify it as a failure or waste retry attempts.
const isIdleTimeout = !catastrophicToolDenialsError && SDK_IDLE_TIMEOUT_PATTERN.test(failure.message);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The !catastrophicToolDenialsError guard is a critical safety rail — it prevents a tool-denial failure whose error message accidentally matches the idle-timeout pattern from being promoted to success. But this branch is not covered by a regression test.

💡 Suggested test case
it("returns exitCode 1 when catastrophic tool denials fires even if timeout message matches", async () => {
  // Simulate maxToolDenials being hit; the driver should NOT treat this as
  // an idle-timeout success even if the error text happens to match.
  const session = {
    sessionId: "session-denial-not-idle",
    on: handler => { onEvent = handler; },
    sendAndWait: vi.fn().mockImplementation(async () => {
      // fire enough tool-denial events to breach the threshold, then stall
      throw new Error("Timeout after 870000ms waiting for session.idle");
    }),
    disconnect: vi.fn().mockResolvedValue(undefined),
  };
  // ... configure maxToolDenials:1 and fire denial events ...
  expect(result.exitCode).toBe(1);
});

Without this test, a future refactor that accidentally drops the !catastrophicToolDenialsError condition would go undetected.

if (isIdleTimeout && hasOutput && pendingToolCalls.size === 0) {
log(`warning: SDK idle-timeout with collected output and no pending tool calls — treating as completed`);
log(`session completed: hasOutput=${hasOutput} durationMs=${durationMs}`);
return { exitCode: 0, output, hasOutput, durationMs };
}

// Preserve any output collected before the error so the harness can use it
// for retry decisions and diagnostics.
return {
exitCode: 1,
output: failure.message,
hasOutput: false,
output: hasOutput ? output : failure.message,
hasOutput,
durationMs,
};
} finally {
Expand All @@ -345,4 +365,4 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c
}
}

module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, extractPromptFromArgs, runWithCopilotSDK };
module.exports = { SDK_SEND_TIMEOUT_MS_DEFAULT, SDK_IDLE_TIMEOUT_PATTERN, extractPromptFromArgs, runWithCopilotSDK };
Loading