Skip to content
70 changes: 70 additions & 0 deletions actions/setup/js/copilot_sdk_driver.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,76 @@ describe("copilot_sdk_driver.cjs", () => {
expect(stop).toHaveBeenCalledTimes(1);
});

it("serializes tool.execution_start command details when available", async () => {
const disconnect = vi.fn().mockResolvedValue(undefined);
const stop = vi.fn().mockResolvedValue(undefined);
const stderrWriteSpy = vi.spyOn(process.stderr, "write").mockImplementation(() => true);
try {
let onEvent = () => {};
const session = {
sessionId: "session-tool-start-command",
on: handler => {
onEvent = handler;
},
sendAndWait: vi.fn().mockImplementation(async () => {
onEvent({
type: "tool.execution_start",
ephemeral: false,
timestamp: new Date().toISOString(),
data: {
toolName: "bash",
mcpServerName: "terminal",
input: { command: "git status" },
},
});
onEvent({
type: "assistant.message",
ephemeral: false,
timestamp: new Date().toISOString(),
data: { content: "ok" },
});
return { data: { content: "ok" } };
}),
disconnect,
};
class FakeCopilotClient {
start = vi.fn().mockResolvedValue(undefined);
createSession = vi.fn().mockResolvedValue(session);
stop = stop;
}

const result = await runWithCopilotSDK({
sdkUri: "http://127.0.0.1:3002",
prompt: "test prompt",
logger: () => {},
sdkModule: {
CopilotClient: FakeCopilotClient,
RuntimeConnection: { forUri: vi.fn(() => ({})) },
approveAll: () => "allow",
},
});

expect(result.exitCode).toBe(0);
const parsedEvents = stderrWriteSpy.mock.calls
.map(([message]) => {
if (typeof message !== "string" || !message.endsWith("\n")) return null;
try {
return JSON.parse(message.trimEnd());
} catch {
return null;
}
})
.filter(Boolean);
const startEvent = parsedEvents.find(event => event.type === "tool.execution_start");
expect(startEvent).toMatchObject({

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The test verifies the happy path (command present → field added), but not the absence path: when a non-shell tool fires tool.execution_start with no command data, the JSONL event must not get a command field.

The if (command) eventData.command = command conditional in copilot_sdk_session.cjs is untested for the falsy case, so a future refactor that always writes command: "" would go undetected.

💡 Suggested addition

Add a parallel it block with toolName: "read_file" and input: {}, then assert that the captured tool.execution_start event has no command key:

expect(startEvent.data).not.toHaveProperty("command");

type: "tool.execution_start",
data: { toolName: "bash", mcpServerName: "terminal", command: "git status" },
});
} finally {
stderrWriteSpy.mockRestore();
}
});

it("resolves exitCode 0 on SDK idle-timeout when output collected and all tool calls complete", async () => {
// Regression test: when sendAndWait throws an idle-timeout error but the agent
// produced output and all tool calls completed, the driver must return exitCode 0.
Expand Down
7 changes: 5 additions & 2 deletions actions/setup/js/copilot_sdk_session.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
*
* Event mapping:
* SDK "user.message" → JSONL "user.message"
* SDK "tool.execution_start" → JSONL "tool.execution_start" (toolName, mcpServerName)
* SDK "tool.execution_start" → JSONL "tool.execution_start" (toolName, mcpServerName, command?)
* SDK "tool.execution_complete" → JSONL "tool.execution_complete" (toolName, mcpServerName, success, result)
* SDK "assistant.message" → JSONL "assistant.message" (content)
*
Expand All @@ -29,6 +29,7 @@ const fs = require("fs");
const path = require("path");
const os = require("os");
const { buildCopilotSDKPermissionHandler, getEnvPositiveIntOrDefault, parseMaxToolDenialsLimit, MAX_TOOL_DENIALS_DEFAULT } = require("./copilot_sdk_permissions.cjs");
const { extractShellCommandFromToolData } = require("./tool_call_details.cjs");

// Default timeout for a single sendAndWait call: 10 minutes.
// This is intentionally generous — the headless Copilot CLI has its own internal
Expand Down Expand Up @@ -256,10 +257,12 @@ async function runWithCopilotSDK({ sdkUri, prompt, logger, attempt = 0, model, c
const toolName = event.data?.toolName ?? "unknown";
const mcpServerName = event.data?.mcpServerName ?? "";
const toolCallId = event.data?.toolCallId;
const command = extractShellCommandFromToolData(event.data);
if (toolCallId) {
pendingToolCalls.set(toolCallId, { toolName, mcpServerName });
}
writeEvent("tool.execution_start", { toolName, mcpServerName }, event.timestamp);
const eventData = command ? { toolName, mcpServerName, command } : { toolName, mcpServerName };
writeEvent("tool.execution_start", eventData, event.timestamp);
break;
}

Expand Down
48 changes: 47 additions & 1 deletion actions/setup/js/handle_agent_failure.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const { formatAICCredits } = require("./daily_aic_workflow_helpers.cjs");
const { formatAIC } = require("./model_costs.cjs");
const { parseTokenUsageJsonl, generateTokenUsageSummary } = require("./parse_mcp_gateway_log.cjs");
const { readDedupedTokenUsage, TOKEN_USAGE_PATHS } = require("./parse_token_usage.cjs");
const { extractShellCommandFromToolData } = require("./tool_call_details.cjs");
const fs = require("fs");
const os = require("os");
const path = require("path");
Expand All @@ -29,6 +30,9 @@ const DEFAULT_OTEL_JSONL_PATH = "/tmp/gh-aw/otel.jsonl";
const FAILURE_CATEGORIES_PATH = "/tmp/gh-aw/failure_categories.json";
const GITHUB_API_VERSION = "2022-11-28";
const COPILOT_SESSION_STATE_DIR = path.join(os.tmpdir(), "gh-aw", "sandbox", "agent", "logs", "copilot-session-state");
// Lowercased tool names whose recent-call entries get a shell command preview
// appended by formatRecentToolCall; every other tool is listed by name only.
const RECENT_TOOL_CALLS_WITH_COMMAND_PREVIEW = new Set(["bash", "shell"]);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/grill-with-docs] RECENT_TOOL_CALLS_WITH_COMMAND_PREVIEW hard-codes ["bash", "shell"] with no comment explaining the selection criteria. Future shell-like tools (e.g., powershell, run_in_terminal, computer) will silently fall back to bare name display.

A single-line comment stating the intent — "tools that accept a free-form command string" — would help maintainers decide whether to extend the set without guessing.

💡 Suggested comment
// Tools that accept a free-form shell command string and should
// show a command preview in denial reports. Add new shell-like
// MCP server tool names here (lowercase).
const RECENT_TOOL_CALLS_WITH_COMMAND_PREVIEW = new Set(["bash", "shell"]);

// Marker appended to a tool call preview that had to be truncated. Its length
// is subtracted from maxLen before slicing in normalizeToolCallPreview.
const ELLIPSIS = "...";
const ELLIPSIS_LENGTH = ELLIPSIS.length;
// Engine-side 429/rate-limit signatures:
// - HTTP 429 accompanied by "too many requests"/"rate limit" phrasing
// - provider error codes like rate_limit_error / rate_limit_exceeded
Expand Down Expand Up @@ -1174,6 +1178,48 @@ function normalizeDeniedPermissionCommand(command) {
return cmd;
}

/**
* Collapse tool call details to a compact single-line preview.
* @param {string} value
* @param {number} [maxLen]
* @returns {string}
*/
function normalizeToolCallPreview(value, maxLen = 120) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/grill-with-docs] The maxLen = 120 default is a behavior constant that controls issue-body quality, but it's not a named export or top-level constant — it's buried as a default parameter. If this preview is reused (e.g., in a Slack notification or OTEL span attribute with a different length limit), the 120 will be duplicated or silently wrong.

💡 Suggested change
// At the top of the file (or in tool_call_details.cjs)
const TOOL_CALL_PREVIEW_MAX_LEN = 120;

// Then:
function normalizeToolCallPreview(value, maxLen = TOOL_CALL_PREVIEW_MAX_LEN) {

This makes the constant visible in grep/audit and easy to tune in one place.

const singleLine = String(value || "")
.replace(/`/g, "'")
.replace(/\s+/g, " ")
.trim();
if (!singleLine) return "";
if (singleLine.length <= maxLen) return singleLine;
return `${singleLine.slice(0, maxLen - ELLIPSIS_LENGTH)}${ELLIPSIS}`;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Negative slice index when maxLen < ELLIPSIS_LENGTH: singleLine.slice(0, maxLen - 3) becomes slice(0, -N) for any maxLen < 3, which drops characters from the end rather than truncating to maxLen. The result exceeds the stated length contract.

💡 Suggested fix

Add a guard before the slice:

if (maxLen <= ELLIPSIS_LENGTH) return singleLine.slice(0, maxLen);
return `${singleLine.slice(0, maxLen - ELLIPSIS_LENGTH)}${ELLIPSIS}`;

Currently only called with the default maxLen = 120, so this is latent rather than live — but the parameter is part of the public function signature and a future caller could trigger it.

}
Comment on lines +1187 to +1195

/**
 * Build a compact, single-line preview of the shell command carried by a
 * tool.execution_start payload (best effort).
 * @param {Record<string, any>} data - The event's `data` object.
 * @returns {string} The normalized preview, or "" when no command text is found.
 */
function extractShellCommandPreview(data) {
  const rawCommand = extractShellCommandFromToolData(data);
  return normalizeToolCallPreview(rawCommand);
}

/**
* Format a compact display value for a recent tool call entry.
* @param {string} toolName
* @param {string} mcpServerName
* @param {Record<string, any>} data
* @returns {string}
*/
function formatRecentToolCall(toolName, mcpServerName, data) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/improve-codebase-architecture] normalizeToolCallPreview, extractShellCommandPreview, and formatRecentToolCall belong in tool_call_details.cjs, not here. The PR introduces tool_call_details.cjs specifically as the shared extraction module, but these rendering helpers live in the wrong file.

This splits "what is a tool call" logic across two files. When a third call site needs formatted previews (e.g., OTEL trace rendering), the author may re-implement rather than reuse.

💡 Suggested move

Export formatRecentToolCall and normalizeToolCallPreview from tool_call_details.cjs and import them here. The 120-char TOOL_CALL_PREVIEW_MAX_LEN constant and ELLIPSIS constant can also live there, turning the module into a complete "tool call display" utility rather than a narrow extractor.

const base = mcpServerName ? `${mcpServerName}.${toolName}` : toolName;
const normalizedToolName = typeof toolName === "string" ? toolName.toLowerCase() : "";
if (!RECENT_TOOL_CALLS_WITH_COMMAND_PREVIEW.has(normalizedToolName)) {
return base;
}
const commandPreview = extractShellCommandPreview(data);
return commandPreview ? `${base}(${commandPreview})` : base;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Backtick injection corrupts markdown rendering: commandPreview is embedded verbatim inside an inline code span at the call site (``- `${toolCall}` ``). Shell commands routinely contain backticks (e.g. ``bash -c 'echo `id`'``), which close the code span early and produce malformed GitHub Markdown in the denial report issue.

💡 Suggested fix

Escape backticks in the preview before returning, so the rendered output stays inside the code span:

function formatRecentToolCall(toolName, mcpServerName, data) {
  const base = mcpServerName ? `${mcpServerName}.${toolName}` : toolName;
  const normalizedToolName = typeof toolName === "string" ? toolName.toLowerCase() : "";
  if (!RECENT_TOOL_CALLS_WITH_COMMAND_PREVIEW.has(normalizedToolName)) return base;
  const commandPreview = extractShellCommandPreview(data).replace(/`/g, "'");
  return commandPreview ? `${base}(${commandPreview})` : base;
}

Alternatively, render recentToolCallsList in buildToolDenialsExceededContext using a fenced code block instead of inline backtick spans, which is immune to embedded backtick characters.

A test using a command like echo `hostname` would catch this regression.

}

/**
* Load missing_tool messages from agent output.
* Returns an empty array when the output file doesn't exist, cannot be parsed, or has no missing_tool items.
Expand Down Expand Up @@ -1320,7 +1366,7 @@ function loadToolDenialsExceededEvents() {
const toolName = typeof parsed.data.toolName === "string" ? parsed.data.toolName.trim() : "";
if (toolName) {
const mcpServerName = typeof parsed.data.mcpServerName === "string" ? parsed.data.mcpServerName.trim() : "";
recentToolCalls.push(mcpServerName ? `${mcpServerName}.${toolName}` : toolName);
recentToolCalls.push(formatRecentToolCall(toolName, mcpServerName, parsed.data));
if (recentToolCalls.length > 5) recentToolCalls.shift();
}
continue;
Expand Down
62 changes: 62 additions & 0 deletions actions/setup/js/handle_agent_failure.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -3272,6 +3272,68 @@ describe("handle_agent_failure", () => {
},
]);
});

it("captures shell command details for recent bash tool calls", () => {
const sessionDir = path.join(os.tmpdir(), "gh-aw", "sandbox", "agent", "logs", "copilot-session-state", "session-1");
fs.mkdirSync(sessionDir, { recursive: true });
fs.writeFileSync(
path.join(sessionDir, "events.jsonl"),
[
JSON.stringify({
type: "tool.execution_start",
timestamp: "2026-06-06T00:00:00Z",
data: { toolName: "bash", mcpServerName: "terminal", command: "cd /home/runner/work/gh-aw/gh-aw && git diff --name-only" },

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The test fixture pre-bakes command directly on the event data, which exercises only the first (data.command) branch of extractShellCommandFromToolData. The MCP-standard schema — where command text lives at data.input.command — is not covered here.

If extractShellCommandFromToolData's input.command path regressed, this test would still pass.

💡 Suggested additional fixture

Add a second it block that writes a JSONL event with data: { toolName: "bash", mcpServerName: "terminal", input: { command: "make test" } } (no top-level command field) and asserts the same terminal.bash(make test) shape in recentToolCalls. This guards the nested extraction path end-to-end.

}),
JSON.stringify({
type: "guard.tool_denials_exceeded",
timestamp: "2026-06-06T00:00:01Z",
data: { denialCount: 5, threshold: 5, reason: "permission denied: bash" },
}),
].join("\n") + "\n"
);

const events = loadToolDenialsExceededEvents();
expect(events).toEqual([
{
denialCount: 5,
threshold: 5,
reason: "permission denied: bash",
recentToolCalls: ["terminal.bash(cd /home/runner/work/gh-aw/gh-aw && git diff --name-only)"],
timestamp: "2026-06-06T00:00:01Z",
},
]);
});

it("sanitizes backticks in shell command previews", () => {
  // Write a session-state events.jsonl holding one bash tool start whose
  // command contains backticks, followed by the denial-threshold guard event.
  const stateDir = path.join(os.tmpdir(), "gh-aw", "sandbox", "agent", "logs", "copilot-session-state", "session-1");
  fs.mkdirSync(stateDir, { recursive: true });

  const jsonlRecords = [
    {
      type: "tool.execution_start",
      timestamp: "2026-06-06T00:00:00Z",
      data: { toolName: "bash", mcpServerName: "terminal", command: "echo `hostname` && echo ok" },
    },
    {
      type: "guard.tool_denials_exceeded",
      timestamp: "2026-06-06T00:00:01Z",
      data: { denialCount: 5, threshold: 5, reason: "permission denied: bash" },
    },
  ];
  // One JSON document per line, each terminated by a newline.
  const jsonlBody = jsonlRecords.map(record => `${JSON.stringify(record)}\n`).join("");
  fs.writeFileSync(path.join(stateDir, "events.jsonl"), jsonlBody);

  // Backticks in the command must come back as single quotes in the preview.
  expect(loadToolDenialsExceededEvents()).toEqual([
    {
      denialCount: 5,
      threshold: 5,
      reason: "permission denied: bash",
      recentToolCalls: ["terminal.bash(echo 'hostname' && echo ok)"],
      timestamp: "2026-06-06T00:00:01Z",
    },
  ]);
});
});

// ──────────────────────────────────────────────────────
Expand Down
37 changes: 37 additions & 0 deletions actions/setup/js/tool_call_details.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// @ts-check

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing "use strict" directive: every other .cjs file in this directory opens with `"use strict";`. Omitting it means sloppy-mode semantics — silent global variable creation on undeclared assignments, different this binding in functions, and missed V8 optimisations. Add `"use strict";` on the line after `// @ts-check`.


/**
* Best-effort extraction of shell command text from a tool.execution_start payload.
* @param {any} data
* @returns {string}
*/
function extractShellCommandFromToolData(data) {
if (!data || typeof data !== "object") return "";
// Priority order prefers top-level command-like fields emitted by tool wrappers,
// then object-shaped payloads used by MCP/SDK tool schemas.
/** @type {Array<any>} */
const commandFieldCandidates = [];
if ("command" in data) commandFieldCandidates.push(data.command);
if ("input" in data) commandFieldCandidates.push(data.input);
if ("arguments" in data) commandFieldCandidates.push(data.arguments);
if ("args" in data) commandFieldCandidates.push(data.args);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] args as an array is silently skipped — a common shape for shell-like tools that pass arguments as ["-c", "echo hello"].

If data.args is ["-c", "git status"], the candidate is an array: not a string, so the first check fails; it's an object, so continue is skipped; but .command and .cmd are both undefined on arrays, so the function falls through and returns "". No preview is shown.

💡 Suggested fix

Add an explicit array branch in the loop:

if (Array.isArray(candidate)) {
  // Some tools pass args as [flag, command_string]; try the last string element
  const last = [...candidate].reverse().find(x => typeof x === "string" && x.trim());
  if (last) return last.trim();
  continue;
}

Or document in the JSDoc which array shapes are intentionally out of scope.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Array-shaped args silently yields nothing: the PR description lists args as a supported extraction shape, but args: ["cmd", "arg"] — the most common CLI array form — is pushed as a candidate then skipped because typeof array === "object" and array.command/array.cmd are both undefined. The function returns "" without any signal.

💡 Suggested fix

Either handle the array case explicitly, or clarify in the JSDoc that only string and {command, cmd} object shapes are supported for args:

if (Array.isArray(candidate)) {
  const joined = candidate.filter(s => typeof s === "string").join(" ").trim();
  if (joined) return joined;
  continue;
}

As-is, consumers passing args: ["-c", "git status"] get no preview, making the extraction gap invisible.

if ("toolInput" in data) commandFieldCandidates.push(data.toolInput);
if ("parameters" in data) commandFieldCandidates.push(data.parameters);
for (const candidate of commandFieldCandidates) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The multi-candidate fallback chain has no dedicated unit tests — bugs in specific paths (e.g., toolInput.cmd, parameters.command, or arguments as an object with .command) would be invisible today.

Both call sites rely on this function silently returning "" on a miss; a regression produces empty previews in denial reports without any failing test.

💡 Suggested `tool_call_details.test.cjs` cases
describe("extractShellCommandFromToolData", () => {
  it("returns top-level command string", () =>
    expect(fn({ command: "ls" })).toBe("ls"));
  it("extracts from input.command (MCP schema)", () =>
    expect(fn({ input: { command: "git status" } })).toBe("git status"));
  it("extracts from toolInput.cmd", () =>
    expect(fn({ toolInput: { cmd: "make test" } })).toBe("make test"));
  it("prefers command over input when both present", () =>
    expect(fn({ command: "first", input: { command: "second" } })).toBe("first"));
  it("returns empty string when no match", () =>
    expect(fn({ workingDir: "/tmp" })).toBe(""));
  it("returns empty string for null input", () =>
    expect(fn(null)).toBe(""));
});

if (typeof candidate === "string" && candidate.trim()) {
return candidate.trim();
}
if (!candidate || typeof candidate !== "object") continue;
if (typeof candidate.command === "string" && candidate.command.trim()) {
return candidate.command.trim();
}
if (typeof candidate.cmd === "string" && candidate.cmd.trim()) {
return candidate.cmd.trim();
}
}
return "";
}

// Required by copilot_sdk_session.cjs and handle_agent_failure.cjs.
module.exports = {
extractShellCommandFromToolData,
};
Loading