Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 87 additions & 13 deletions actions/setup/js/safe_outputs_handlers.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,80 @@ function resolvePatchWorkspacePath(workspacePath) {
function createHandlers(server, appendSafeOutput, config = {}) {
const TOKEN_THRESHOLD = 16000;

/**
* Session-scoped per-type operation counters.
* Incremented on every successful appendSafeOutput call (MCE4 dual enforcement).
* @type {Map<string, number>}
*/
const operationCounts = new Map();

/**
* Return the explicitly user-configured max for a safe-output type, or null if not set / unlimited.
* Uses getSafeOutputsToolConfig for consistent key-normalisation (hyphens → underscores).
* Does NOT fall back to validation-config defaults: MCP-time enforcement is only
* applied when the user has explicitly set a limit; downstream enforcement covers defaults.
* Per Safe Outputs Specification MCE5: the same config source as the processor.
* @param {string} type - normalised safe-output type name (e.g. "add_comment")
* @returns {number | null}
*/
function getExplicitMax(type) {
const toolConfig = getSafeOutputsToolConfig(config, type);
if (!toolConfig || typeof toolConfig !== "object") return null;
if (!("max" in toolConfig)) return null;
const maxVal = toolConfig.max;
if (maxVal === -1) return null; // -1 means unlimited
if (typeof maxVal === "number" && Number.isInteger(maxVal) && maxVal > 0) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] max: 0 is silently treated as unlimited — undocumented edge case.

The maxVal > 0 guard causes getExplicitMax to return null for max: 0, which disables invocation-time enforcement for that type. This is surprising if a user sets max: 0 expecting to block a tool entirely. The downstream processor may handle it differently, creating an MCE4/MCE5 consistency gap.

💡 Suggested options

Option A — Document the intent with an inline comment:

// max: 0 is treated as unlimited here; downstream processor handles 0-limit separately
if (typeof maxVal === "number" && Number.isInteger(maxVal) && maxVal > 0) {

Option B — Add a test that explicitly asserts max: 0 is treated as unlimited:

it("treats max: 0 as unlimited (no invocation-time enforcement)", () => {
  const h = createHandlers(mockServer, mockAppendSafeOutput, { add_labels: { max: 0 } });
  // Should NOT throw — downstream processor is the only guard for max: 0
  for (let i = 0; i < 3; i++) {
    expect(h.defaultHandler("add_labels")({ labels: ["ok"] })).not.toHaveProperty("isError");
  }
});

Either way, the current silent behavior should be intentional and documented.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max: 0 silently falls through to unlimited — MCE4 dual enforcement is broken for this value. The maxVal > 0 guard means max: 0 returns null (no limit), so MCP-time allows unlimited calls while the downstream processor would block them all.

💡 Impact and fix

getExplicitMax is supposed to be the MCP-side mirror of downstream enforcement. If downstream treats max: 0 as "block all", the two layers are inconsistent: agents emit entries freely at MCP time and discover they are all truncated only after the run — the exact silent-truncation problem this PR was written to prevent.

There is also no regression test for this: the test named "rejects immediately when max is 0" (line 2770 in the test file) uses max: 1, not max: 0, so the bug would slip through CI undetected.

// current — max: 0 returns null (unlimited)
if (typeof maxVal === "number" && Number.isInteger(maxVal) && maxVal > 0) {

// fix — include 0 so max: 0 blocks all calls
if (typeof maxVal === "number" && Number.isInteger(maxVal) && maxVal >= 0) {

Also add a dedicated test:

it("blocks all calls when max is 0", () => {
  const h = createHandlers(mockServer, mockAppendSafeOutput, {
    add_labels: { max: 0 },
  });
  expect(() => h.defaultHandler("add_labels")({ labels: ["x"] })).toThrow(
    expect.objectContaining({ code: -32602, message: expect.stringContaining("E002") })
  );
  expect(mockAppendSafeOutput).not.toHaveBeenCalled();
});

return maxVal;
Comment on lines +222 to +229
}
return null;
Comment on lines +214 to +231
}

/**
* Enforce the per-type operation count limit at invocation time.
* Throws a JSON-RPC -32602 error when the configured max has already been reached.
* Per Safe Outputs Specification MCE4: Dual Enforcement — constraints MUST be
* enforced at both invocation time (MCP server) and processing time (safe output
* processor) to provide defence-in-depth.
* @param {string} type - normalised safe-output type name
*/
function enforcePerTypeMax(type) {
const maxAllowed = getExplicitMax(type);
if (maxAllowed === null) return; // no explicit limit configured
const current = operationCounts.get(type) || 0;
if (current >= maxAllowed) {
throw {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/zoom-out] Throwing a plain object instead of an Error instance — potential catch-block mismatch.

enforcePerTypeMax throws { code: -32602, message: ..., data: ... } — a plain object, not an Error. Any error handling middleware that checks err instanceof Error or accesses err.stack will silently fail to match. The existing inlineReviewCommentCount pattern (which this PR mirrors) should be checked for consistency.

💡 Context

This is likely intentional for the JSON-RPC protocol (the MCP framework converts thrown objects directly to JSON-RPC error responses). If so, a brief comment would confirm the intent:

// Throw a plain JSON-RPC error object (not Error) — the MCP framework
// catches this and returns it directly as the JSON-RPC error payload.
throw {
  code: -32602,
  ...
};

If the framework does rely on instanceof Error, a custom class (class SafeOutputLimitError extends Error) would be safer.

code: -32602,
message: `E002: ${type} limit reached — ${current} of ${maxAllowed} already used this run`,
data: {
constraint: "max",
type,
limit: maxAllowed,
guidance:
`You have used all ${maxAllowed} ${type} operations for this run. ` +
`Further ${type} calls will be ignored. Prioritize the most important items ` +

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] Guidance says "will be ignored" but calls actually throw — misleading error message.

The phrase Further ${type} calls will be ignored implies silent no-ops, but every subsequent call to a type-enforced handler also throws -32602. An agent reading this guidance and assuming its next call will be silently dropped will be surprised when it also fails.

💡 Suggested fix

Change:

`Further ${type} calls will be ignored. Prioritize the most important items ` +

To:

`Further ${type} calls will fail with this error. Prioritize the most important items ` +

This accurately describes the runtime behavior and removes the risk of the agent misunderstanding the enforcement mode.

`(e.g. consolidate multiple updates into one), or call noop. ` +
Comment on lines +254 to +257
`Note: other safe-output types have independent budgets, so applying one type ` +
`without its companion type can leave inconsistent state.`,
},
};
}
}

/**
* Append a safe-output entry after enforcing the per-type max count.
* Increments the session counter only after a successful write, mirroring the
* approach used by inlineReviewCommentCount so that write errors do not advance
* the counter.
* Per Safe Outputs Specification MCE4: invocation-time half of dual enforcement.
* @param {Record<string, any>} entry
*/
const appendSafeOutputCounted = entry => {
const type = entry?.type;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] Entries without a type field silently bypass enforcement and counting.

if (type) enforcePerTypeMax(type) means any entry where entry.type is falsy skips both the limit check and the counter increment. In the current handlers this is safe because all call sites set type, but there's no test confirming this assumption, and a future handler that forgets to set type would silently evade MCE4 enforcement.

💡 Suggested test or guard

Add a defensive assertion, or at least a test:

it("does not count or enforce when entry.type is missing", () => {
  const h = createHandlers(mockServer, mockAppendSafeOutput, { add_labels: { max: 1 } });
  // Direct call to appendSafeOutputCounted with no type (internal/unusual)
  // Verify it appends without throwing and does not consume budget
  // ... (requires exposing appendSafeOutputCounted or testing via a handler that passes a typeless entry)
});

Alternatively, throw an internal error when type is missing to catch regressions early.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

appendSafeOutputCounted silently bypasses all enforcement when entry.type is falsy — limit checks and counter increments are both skipped, making the enforcement contract invisible to callers that forget to set type.

💡 Details and suggested hardening

Currently all callers set type (either via defaultHandler spreading {type: toolName} or explicitly in named handlers). But the function signature accepts any Record<string, any> and silently no-ops when type is missing. A future handler that calls appendSafeOutputCounted without first setting entry.type will bypass all per-type limits with no error or warning — a silent regression.

The minimal fix is a debug-level warning:

const appendSafeOutputCounted = entry => {
  const type = entry?.type;
  if (!type) {
    // enforcement impossible without a type — log and fall through
    server.debug("appendSafeOutputCounted: entry.type is missing; per-type max not enforced");
    appendSafeOutput(entry);
    return;
  }
  enforcePerTypeMax(type);
  appendSafeOutput(entry);
  operationCounts.set(type, (operationCounts.get(type) || 0) + 1);
};

A stronger fix would be to throw:

if (!type) throw new Error("appendSafeOutputCounted: entry.type is required for limit enforcement");

This would surface integration bugs immediately in tests rather than silently degrading enforcement in production.

if (type) enforcePerTypeMax(type);
appendSafeOutput(entry);
if (type) operationCounts.set(type, (operationCounts.get(type) || 0) + 1);
};

/**
* Validate schema-declared explicit target parameters for wildcard-target tools.
* @param {Record<string, any>} entry
Expand Down Expand Up @@ -258,7 +332,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {

const fileInfo = writeLargeContentToFile(largeContent);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] File is written to disk before the enforcement check runs — orphaned file on limit hit.

When maybeHandleLargeContent is invoked, writeLargeContentToFile(largeContent) runs on line 333 before appendSafeOutputCounted(entry) on line 335 enforces the limit. If the per-type count is already at max, the file write succeeds, the enforcement throws, and the temporary file is left on disk with no corresponding safe-output entry.

💡 Suggested fix

Enforce the limit before writing the file:

// Check limit before committing the expensive file write
if (entry?.type) enforcePerTypeMax(entry.type);

const fileInfo = writeLargeContentToFile(largeContent);
entry[largeFieldName] = `[Content too large, saved to file: ${fileInfo.filename}]`;
appendSafeOutputCounted(entry);

Alternatively, call getExplicitMax and check the counter before entering the file-write path.

This also needs a regression test: a test that verifies the file is NOT written when the limit is already reached.

entry[largeFieldName] = `[Content too large, saved to file: ${fileInfo.filename}]`;
appendSafeOutput(entry);
appendSafeOutputCounted(entry);

return {
content: [
Expand Down Expand Up @@ -286,7 +360,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
if (largeContentResponse) return largeContentResponse;

// Normal case - no large content
appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand Down Expand Up @@ -416,7 +490,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
targetFileName: targetFileName,
};

appendSafeOutput(entry);
appendSafeOutputCounted(entry);

return {
content: [
Expand Down Expand Up @@ -620,7 +694,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
if (allowEmpty) {
server.debug(`allow-empty is enabled for create_pull_request - skipping patch generation`);
// Append the safe output entry without generating a patch
appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand Down Expand Up @@ -834,7 +908,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
entry.base_commit = bundleResult.baseCommit;
}

appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand All @@ -856,7 +930,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
};
}

appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand Down Expand Up @@ -1277,7 +1351,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
entry.base_commit = bundleResult.baseCommit;
}

appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand All @@ -1299,7 +1373,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
};
}

appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand Down Expand Up @@ -1551,7 +1625,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
};
const largeContentResponse = maybeHandleLargeContent(droppedEntry);
if (!largeContentResponse) {
appendSafeOutput(droppedEntry);
appendSafeOutputCounted(droppedEntry);
}
return {
content: [
Expand All @@ -1572,7 +1646,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
const largeContentResponse = maybeHandleLargeContent(entry);
if (largeContentResponse) return largeContentResponse;

appendSafeOutput(entry);
appendSafeOutputCounted(entry);
return {
content: [
{
Expand Down Expand Up @@ -1603,7 +1677,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
server.debug(`temporary_id for create_project: ${entry.temporary_id}`);

// Append to safe outputs
appendSafeOutput(entry);
appendSafeOutputCounted(entry);

// Return the temporary_id to the agent so it can reference this project
return {
Expand Down Expand Up @@ -1714,7 +1788,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
server.debug(`temporary_id for add_comment: ${entry.temporary_id}`);

// Append to safe outputs
appendSafeOutput(entry);
appendSafeOutputCounted(entry);

// Return the temporary_id to the agent so it can reference this comment
return {
Expand Down Expand Up @@ -1893,7 +1967,7 @@ function createHandlers(server, appendSafeOutput, config = {}) {
server.debug(`upload_artifact: staged ${filePath} as ${destName}`);
}

appendSafeOutput(entry);
appendSafeOutputCounted(entry);

const temporaryId = entry.temporary_id || null;
return {
Expand Down
181 changes: 181 additions & 0 deletions actions/setup/js/safe_outputs_handlers.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -2736,6 +2736,187 @@ describe("safe_outputs_handlers", () => {
});
});

describe("per-type max enforcement (MCE4 dual enforcement)", () => {
let mockServer;
let mockAppendSafeOutput;

beforeEach(() => {
vi.clearAllMocks();
mockServer = { debug: vi.fn() };
mockAppendSafeOutput = vi.fn();
});

it("allows calls up to the configured max and rejects the (max+1)th call via defaultHandler", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_labels: { max: 2 },
});

// First two calls succeed
expect(h.defaultHandler("add_labels")({ labels: ["approved"] })).not.toHaveProperty("isError");
expect(h.defaultHandler("add_labels")({ labels: ["approved"] })).not.toHaveProperty("isError");
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(2);

// Third call must throw E002
expect(() => h.defaultHandler("add_labels")({ labels: ["approved"] })).toThrow(
expect.objectContaining({
code: -32602,
message: expect.stringContaining("E002"),
})
);
// No additional append after limit
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(2);
});

it("rejects immediately when max is 0 and config uses hyphen-keyed type (key normalisation)", () => {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] Test description says "when max is 0" but the config uses max: 1 — misleading spec.

The test title references max: 0 which suggests a zero-limit test, but the body sets "add-labels": { max: 1 }. The test is actually verifying hyphen→underscore key normalisation with max: 1. A developer reading the description would expect this test to document max: 0 behaviour (which is currently silently unlimited — see related comment on getExplicitMax).

💡 Suggested fix

Rename the test to match what it actually asserts:

it("enforces max via hyphen-keyed config (key normalisation: hyphen→underscore)", () => {

If you also want to cover max: 0, add a separate test that documents the explicit intent (e.g., max: 0 is treated as unlimited at invocation time).

// Ensure getSafeOutputsToolConfig's hyphen→underscore lookup works for max checks
const h = createHandlers(mockServer, mockAppendSafeOutput, {
"add-labels": { max: 1 },
});
Comment on lines +2770 to +2774

h.defaultHandler("add_labels")({ labels: ["ok"] });
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(1);

expect(() => h.defaultHandler("add_labels")({ labels: ["ok"] })).toThrow(expect.objectContaining({ code: -32602, message: expect.stringContaining("E002") }));
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(1);
});

it("enforces max for addCommentHandler", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_comment: { max: 1 },
});

global.context = { repo: { owner: "o", repo: "r" }, eventName: "issues", payload: { issue: { number: 1 } } };
try {
const ok = h.addCommentHandler({ body: "first comment", item_number: 1 });
expect(ok).not.toHaveProperty("isError");
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(1);

expect(() => h.addCommentHandler({ body: "second comment", item_number: 2 })).toThrow(expect.objectContaining({ code: -32602, message: expect.stringContaining("E002") }));
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(1);
} finally {
global.context = { repo: { owner: "test-owner", repo: "test-repo" }, eventName: "push", payload: {} };
}
});

it("independent per-type budgets: exceeding add_comment limit does not affect add_labels", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_comment: { max: 1 },
add_labels: { max: 3 },
});

global.context = { repo: { owner: "o", repo: "r" }, eventName: "issues", payload: { issue: { number: 1 } } };
try {
h.addCommentHandler({ body: "first comment", item_number: 1 });
expect(() => h.addCommentHandler({ body: "second comment", item_number: 2 })).toThrow(expect.objectContaining({ code: -32602, message: expect.stringContaining("E002") }));

// add_labels budget is separate — all 3 calls should succeed
expect(h.defaultHandler("add_labels")({ labels: ["a"] })).not.toHaveProperty("isError");
expect(h.defaultHandler("add_labels")({ labels: ["b"] })).not.toHaveProperty("isError");
expect(h.defaultHandler("add_labels")({ labels: ["c"] })).not.toHaveProperty("isError");

// 4th add_labels call must fail
expect(() => h.defaultHandler("add_labels")({ labels: ["d"] })).toThrow(expect.objectContaining({ code: -32602, message: expect.stringContaining("E002") }));
} finally {
global.context = { repo: { owner: "test-owner", repo: "test-repo" }, eventName: "push", payload: {} };
}
});

it("does not enforce when max is -1 (unlimited)", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_labels: { max: -1 },
});

for (let i = 0; i < 20; i++) {
expect(h.defaultHandler("add_labels")({ labels: ["ok"] })).not.toHaveProperty("isError");
}
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(20);
});

it("does not enforce when max is not explicitly configured", () => {
// Only target is set — no max → no invocation-time limit
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_labels: { target: "*" },
});

for (let i = 0; i < 5; i++) {
expect(h.defaultHandler("add_labels")({ labels: ["ok"] })).not.toHaveProperty("isError");
}
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(5);
});

it("does not enforce when config is empty (no safe-outputs config)", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput);

for (let i = 0; i < 5; i++) {
expect(h.defaultHandler("add_labels")({ labels: ["ok"] })).not.toHaveProperty("isError");
}
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(5);
});

it("error message includes type, current count, and limit", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_labels: { max: 3 },
});

h.defaultHandler("add_labels")({ labels: ["a"] });
h.defaultHandler("add_labels")({ labels: ["b"] });
h.defaultHandler("add_labels")({ labels: ["c"] });

let thrown;
try {
h.defaultHandler("add_labels")({ labels: ["d"] });
} catch (err) {
thrown = err;
}

expect(thrown).toBeDefined();
expect(thrown.code).toBe(-32602);
expect(thrown.message).toContain("add_labels");
expect(thrown.message).toContain("3 of 3");
expect(thrown.data).toMatchObject({
constraint: "max",
type: "add_labels",
limit: 3,
});
expect(thrown.data.guidance).toContain("add_labels");
});

it("counter does not increment when append throws (write error)", () => {
const h = createHandlers(mockServer, mockAppendSafeOutput, {
add_labels: { max: 2 },
});

// First call succeeds
h.defaultHandler("add_labels")({ labels: ["ok"] });
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(1);

// Second call: append throws a write error
mockAppendSafeOutput.mockImplementationOnce(() => {
throw new Error("disk write error");
});
expect(() => h.defaultHandler("add_labels")({ labels: ["fail"] })).toThrow("disk write error");

// Counter is still 1 (not 2) because the failed write shouldn't count
// Third call should succeed (not hit limit)
expect(h.defaultHandler("add_labels")({ labels: ["ok2"] })).not.toHaveProperty("isError");
expect(mockAppendSafeOutput).toHaveBeenCalledTimes(3); // call 1 + (failed) call 2 + call 3
});

it("each createHandlers() call gets a fresh independent counter", () => {
const config = { add_labels: { max: 1 } };

const h1 = createHandlers(mockServer, mockAppendSafeOutput, config);
const h2 = createHandlers(mockServer, mockAppendSafeOutput, config);

h1.defaultHandler("add_labels")({ labels: ["a"] });
// h1's budget is now exhausted — must throw
expect(() => h1.defaultHandler("add_labels")({ labels: ["b"] })).toThrow(expect.objectContaining({ code: -32602 }));

// h2 has its own fresh counter — should still allow 1 call
expect(h2.defaultHandler("add_labels")({ labels: ["a"] })).not.toHaveProperty("isError");
});
});

describe("hasUpdatePullRequestFields", () => {
it("returns false for empty object", () => {
expect(hasUpdatePullRequestFields({})).toBe(false);
Expand Down
Loading