Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions .github/workflows/daily-compiler-quality.lock.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 22 additions & 1 deletion .github/workflows/daily-compiler-quality.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,28 @@ name: Daily Compiler Quality Check
strict: true
timeout-minutes: 30
tools:
bash: true
bash:
- set
- find
- wc
- git
- mkdir
- cat
- jq
- mv
- echo
- bc
- sed
- printf
- date
- grep
- head
- ls
- pwd
- sort
- tail
- uniq
- yq
cache-memory: true
cli-proxy: true
github:
Expand Down
68 changes: 59 additions & 9 deletions actions/setup/js/bash_command_parser.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

/**
* Split a shell command text into individual pipeline segments.
* Splits on the following shell operators: &&, ||, |, ;
* Splits on the following shell operators: &&, ||, |, ; and newlines.
*
* The split respects:
* - Single-quoted strings (no escaping inside)
Expand Down Expand Up @@ -143,6 +143,39 @@ function splitOnPipelineOperators(commandText) {
continue;
}

// Newline (sequential) — treat line breaks as command separators,
// except when escaped as a shell line continuation ("\\" + newline).
// Handles LF, CRLF, and CR forms.
if (ch === "\n" || ch === "\r") {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] Shell comment lines (# ...) are now split into their own segments by the newline handler, and extractCommandName will return "#" for them — causing any script with comment lines to be rejected because shell(#) is never in any allowlist.

💡 Root cause & fix

word = "#" passes every existing guard (not a redirection, not !/{/}, not in SHELL_KEYWORDS) so the function returns it as a command name.

Add an early exit in extractCommandName (before the keyword check):

// Shell line-comments — not an executable command
if (word.startsWith("#")) return null;

Alternatively strip trailing comments in splitOnPipelineOperators before pushing a segment. The extractCommandName approach is narrower and easier to test.

A regression test should be added for the pipeline "git status\n# a comment\necho done" expecting ["git", "echo"].

let backslashRunLength = 0;
for (let j = current.length - 1; j >= 0 && current[j] === "\\"; j--) {
backslashRunLength++;
}

// Odd number of trailing backslashes means the newline is escaped.
if (backslashRunLength % 2 === 1) {
current = current.slice(0, -1);
i++;
if (ch === "\r" && i < len && commandText[i] === "\n") {
i++;
}
while (i < len && (commandText[i] === " " || commandText[i] === "\t")) i++;
if (current && !/\s$/.test(current)) {
current += " ";
}
continue;
}

segments.push(current);
current = "";
i++;
if (ch === "\r" && i < len && commandText[i] === "\n") {
i++;
}
while (i < len && /\s/.test(commandText[i])) i++;
continue;
}

current += ch;
i++;
}
Expand All @@ -156,13 +189,26 @@ function splitOnPipelineOperators(commandText) {
}

/**
* Shell flow-control keywords that can appear as the first word of a segment
* but do not represent an executable command. They must be excluded so the
* permission checker does not attempt to look up keywords like "then" or "fi"
* as command names and incorrectly deny (or allow) a pipeline that contains
* them as part of a compound statement (e.g. `if …; then cat …; fi`).
* Clause keywords can prefix an executable command in the same segment
* (for example: "then cat file", "do git log"). These are skipped and
* scanning continues to find the command token.
*/
const SHELL_KEYWORDS = new Set(["then", "else", "elif", "fi", "do", "done", "esac", "in", "function", "time", "coproc"]);
const CLAUSE_KEYWORDS = new Set(["then", "else", "elif", "do"]);

/**
* Structural shell keywords never represent an executable command token
* for permission matching in this parser. They introduce/close control
* structures and are treated as non-command segment starts.
*/
const STRUCTURE_KEYWORDS = new Set(["if", "fi", "for", "done", "while", "until", "case", "esac", "select", "in", "function", "time", "coproc"]);

const SHELL_KEYWORDS = new Set([...CLAUSE_KEYWORDS, ...STRUCTURE_KEYWORDS]);

// IDENTIFIER=VALUE where VALUE is one of:
// - "(...)" double-quoted text (supports escapes like \")
// - '(...)' single-quoted text
// - an unquoted non-space token
const ENV_ASSIGNMENT_PREFIX_RE = /^[A-Za-z_][A-Za-z0-9_]*=(?:"(?:\\.|[^"\\])*"|'[^']*'|\S*)\s*/;

/**
* Extract the executable command name from a single shell command segment.
Expand All @@ -186,9 +232,8 @@ function extractCommandName(segment) {
if (!remaining) return null;

// Skip leading env-var assignments: IDENTIFIER=anything (repeat)
const envAssignRe = /^[A-Za-z_][A-Za-z0-9_]*=\S*\s*/;
for (;;) {
const m = remaining.match(envAssignRe);
const m = remaining.match(ENV_ASSIGNMENT_PREFIX_RE);
if (!m) break;
remaining = remaining.slice(m[0].length).trim();
}
Expand Down Expand Up @@ -216,6 +261,11 @@ function extractCommandName(segment) {

// Flow-control keywords are not executable commands
if (SHELL_KEYWORDS.has(word)) {
if (CLAUSE_KEYWORDS.has(word)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/diagnose] elif conditions extract [ as a required command, but if conditions do not — because if is a structural keyword (whole segment returns null) while elif is a clause keyword (scanning continues to the next token).

💡 Impact

For a segment like elif [ -f "$FILE" ]:

  • elif → clause keyword → skip → remaining = "[ -f \"$FILE\" ]"
  • word = [ → not a keyword → returns "["

But if [ -f "$FILE" ]:

  • if → structural keyword → returns null

This means workflows using elif branches require an explicit shell([) or shell(test) permission, while logically equivalent if branches do not. A test verifying that elif [ -f file ] extracts [ (and that shell([) is in the required set for such scripts) would make this behavior explicit and intentional.

Consider whether elif should be promoted to a structural keyword, or add a note in the spec documenting this asymmetry.

remaining = remaining.slice(word.length).trim();
if (!remaining) return null;
continue;
}
return null;
}

Expand Down
64 changes: 63 additions & 1 deletion actions/setup/js/bash_command_parser.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ describe("splitOnPipelineOperators", () => {
expect(segments).toEqual(["pwd", "ls -la", "safeoutputs --help"]);
});

it("splits on newlines as sequential separators", () => {
const segments = splitOnPipelineOperators("pwd\nls -la\nsafeoutputs --help");

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] Only \n is tested; the implementation explicitly handles \r\n (CRLF) and bare \r (old-Mac CR) at lines 152–154 of the parser, but neither path has a test. A Windows-authored script or a script fetched over HTTP with CRLF line endings would exercise untested code.

💡 Suggested additions
it("splits on CRLF as sequential separator", () => {
  const segments = splitOnPipelineOperators("pwd\r\nls -la\r\nsafeoutputs --help");
  expect(segments).toEqual(["pwd", "ls -la", "safeoutputs --help"]);
});

it("splits on bare CR as sequential separator", () => {
  const segments = splitOnPipelineOperators("pwd\rls -la\rsafeoutputs --help");
  expect(segments).toEqual(["pwd", "ls -la", "safeoutputs --help"]);
});

Corresponding conformance vectors should also be added to bash_command_parser_spec_vectors.json.

expect(segments).toEqual(["pwd", "ls -la", "safeoutputs --help"]);
});

Comment on lines +108 to +112
it("does not split on escaped newline continuations", () => {
const segments = splitOnPipelineOperators("git log \\\n --oneline \\\n --max-count=1");
expect(segments).toEqual(["git log --oneline --max-count=1"]);
});

it("trims leading/trailing whitespace from each segment", () => {
const segments = splitOnPipelineOperators(" ls /tmp && cat file ");
expect(segments[0]).toBe("ls /tmp");
Expand Down Expand Up @@ -137,6 +147,18 @@ describe("extractCommandName", () => {
expect(extractCommandName("FOO=bar BAZ=qux echo hi")).toBe("echo");
});

it("skips leading env-var assignment with quoted spaces", () => {
expect(extractCommandName("FILES='a b c' echo hi")).toBe("echo");
});

it("skips leading env-var assignment with double-quoted spaces", () => {
expect(extractCommandName('FILES="a b c" echo hi')).toBe("echo");
});

it("skips leading env-var assignment with escaped quote in double-quoted value", () => {
expect(extractCommandName('FILES="a \\"b\\" c" echo hi')).toBe("echo");
});

it("handles negation operator ! and returns next command", () => {
expect(extractCommandName("! ls /tmp")).toBe("ls");
});
Expand All @@ -153,10 +175,30 @@ describe("extractCommandName", () => {
expect(extractCommandName("else")).toBeNull();
});

it("extracts command after shell keyword 'else'", () => {
expect(extractCommandName("else cat file")).toBe("cat");
});

it("returns null for shell keyword 'fi'", () => {
expect(extractCommandName("fi")).toBeNull();
});

it("extracts command after shell keyword 'elif'", () => {
expect(extractCommandName("elif grep x file")).toBe("grep");
});

it("returns null for shell keyword 'if'", () => {
expect(extractCommandName("if [ -f file ]")).toBeNull();
});

it("returns null for shell keyword 'for'", () => {
expect(extractCommandName("for f in a b c")).toBeNull();
});

it("extracts command after shell keyword 'do'", () => {
expect(extractCommandName("do git status")).toBe("git");
});

it("returns null for a bare redirection like >file", () => {
expect(extractCommandName(">file.txt")).toBeNull();
});
Expand Down Expand Up @@ -280,6 +322,26 @@ describe("extractCommandNamesFromPipeline", () => {
it("handles date with flags", () => {
expect(extractCommandNamesFromPipeline("date +%Y-%m-%d && echo done")).toEqual(["date", "echo"]);
});

it("extracts all command names from multiline script with variables and control flow", () => {
const cmd = `set -euo pipefail
CACHE_DIR='cache/gh-aw/cache-memory/compiler-quality'
ANALYSES_DIR="$CACHE_DIR/analyses"
mkdir -p "$ANALYSES_DIR"
FILES='compiler.go compiler_activation_jobs.go compiler_orchestrator.go compiler_jobs.go compiler_safe_outputs.go compiler_safe_outputs_config.go compiler_safe_outputs_job.go compiler_yaml.go compiler_yaml_main_job.go'
for f in $FILES; do git -C /home/runner/work/gh-aw/gh-aw log -1 --format='%H' -- "pkg/workflow/$f" | sed "s|^|$f |"; done
printf '---ROTATION---\n'
if [ -f "$CACHE_DIR/rotation.json" ]; then cat "$CACHE_DIR/rotation.json"; fi
printf '\n---HASHES---\n'
if [ -f "$CACHE_DIR/file-hashes.json" ]; then cat "$CACHE_DIR/file-hashes.json"; fi
printf '\n---FILES---\n'
for f in $FILES; do wc -l "/home/runner/work/gh-aw/gh-aw/pkg/workflow/$f"; done`;
expect(extractCommandNamesFromPipeline(cmd)).toEqual(["set", "mkdir", "git", "sed", "printf", "cat", "wc"]);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The test asserts that [ does not appear in the extracted commands for if [ -f ... ] conditions — but this behavior is a side-effect of if being a structural keyword, not an explicit design decision with documented rationale. A future reader may not understand why [ is absent.

Details

💡 Suggestion

Add a comment in the test (or a dedicated it for clarity) that makes the intent explicit:

// 'if' is a structural keyword — the entire condition segment returns null,
// so '[' is intentionally NOT extracted as a required command.
// Scripts with if-conditions do NOT need shell([) permission.
it("does not extract [ from if-condition segments", () => {
  expect(extractCommandName("if [ -f file ]")).toBeNull();
});

This documents the design choice and acts as a guard against accidentally promoting if to a clause keyword.

});

it("keeps continued multiline command as one extracted command", () => {
expect(extractCommandNamesFromPipeline("git log \\\n --oneline \\\n --max-count=1")).toEqual(["git"]);
});
});

// ─────────────────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -351,7 +413,7 @@ describe("extractCommandName – extensive vectors", () => {
{ id: "BP-EC-004", segment: "2>&1", expected: null },
{ id: "BP-EC-005", segment: ">out.txt", expected: null },
{ id: "BP-EC-006", segment: "A=1 B=2 safeoutputs missing_data", expected: "safeoutputs" },
{ id: "BP-EC-007", segment: "then cat file", expected: null },
{ id: "BP-EC-007", segment: "then cat file", expected: "cat" },
{ id: "BP-EC-008", segment: "fi", expected: null },
{ id: "BP-EC-009", segment: "do", expected: null },
{ id: "BP-EC-010", segment: "done", expected: null },
Expand Down
41 changes: 39 additions & 2 deletions actions/setup/js/bash_command_parser_spec_vectors.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "1.0.0",
"version": "1.1.0",
"metadata": {
"spec": "docs/src/content/docs/specs/bash-command-parser-specification.md",
"description": "Language-agnostic conformance vectors for the bash command parser",
Expand Down Expand Up @@ -54,6 +54,18 @@
"source": "verification",
"input": " ! ls /tmp && echo done ",
"expected": ["! ls /tmp", "echo done"]
},
{
"id": "BP-SP-153",
"source": "verification",
"input": "pwd\nls -la\nsafeoutputs --help",
"expected": ["pwd", "ls -la", "safeoutputs --help"]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/tdd] The spec and implementation support \r\n and \r, but the only new conformance vector (BP-SP-153) uses \n. Conformance vectors are language-agnostic regression anchors for any future re-implementation — missing CRLF vectors leave that path uncovered for any port.

💡 Suggested vectors
{
  "id": "BP-SP-155",
  "source": "verification",
  "input": "pwd\r\nls -la\r\nsafeoutputs --help",
  "expected": ["pwd", "ls -la", "safeoutputs --help"]
},
{
  "id": "BP-SP-156",
  "source": "verification",
  "input": "pwd\rls -la\rsafeoutputs --help",
  "expected": ["pwd", "ls -la", "safeoutputs --help"]
}

},
{
"id": "BP-SP-154",
"source": "verification",
"input": "git log \\\n --oneline \\\n --max-count=1",
"expected": ["git log --oneline --max-count=1"]
}
],
"extractCommandName": [
Expand Down Expand Up @@ -85,7 +97,7 @@
"id": "BP-EC-105",
"source": "model-based",
"input": "then cat file",
"expected": null
"expected": "cat"
},
{
"id": "BP-EC-151",
Expand All @@ -104,6 +116,18 @@
"source": "verification",
"input": "coproc",
"expected": null
},
{
"id": "BP-EC-154",
"source": "verification",
"input": "FILES='a b c' echo hi",
"expected": "echo"
},
{
"id": "BP-EC-155",
"source": "verification",
"input": "FILES=\"a \\\"b\\\" c\" echo hi",
"expected": "echo"
}
],
"extractCommandNamesFromPipeline": [
Expand Down Expand Up @@ -148,6 +172,19 @@
"source": "verification",
"input": "cat $(ls /tmp)",
"expected": ["cat"]
},
{
"id": "BP-EP-154",
"source": "verification",
"note": "Multiline control-flow script fixture to verify extraction of set/mkdir/git/sed/printf/cat/wc across for/if blocks.",
"input": "set -euo pipefail\nmkdir -p \"$ANALYSES_DIR\"\nfor f in $FILES; do git -C \"$REPO\" log -1 -- \"$f\" | sed \"s|^|$f |\"; done\nprintf '---ROTATION---\\n'\nif [ -f \"$CACHE_DIR/rotation.json\" ]; then cat \"$CACHE_DIR/rotation.json\"; fi\nfor f in $FILES; do wc -l \"$f\"; done",
"expected": ["set", "mkdir", "git", "sed", "printf", "cat", "wc"]
},
{
"id": "BP-EP-155",
"source": "verification",
"input": "git log \\\n --oneline \\\n --max-count=1",
"expected": ["git"]
}
]
},
Expand Down
Loading