diff --git a/Directory.Build.props b/Directory.Build.props index 8f78ad0..ebca8ce 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -7,7 +7,7 @@ latest enable true - 0.1.2 + 0.1.3 alpha diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index cdcbd91..5ef1120 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -205,10 +205,32 @@ bulldoze priorities. - [x] `dotnet pack` validation: icon embedded in `.nupkg` confirmed - [ ] Re-pack to validate icon embeds +### 16. Bash line comments (#25) — 0.1.3-alpha + +- [x] `BashTokenKind.Comment` enum member (internal) +- [x] `BashLexer.ConsumeLineComment` helper; `#` dispatch in main scan loop +- [x] `BashCommandParser.FilterSignificant` drops Comment tokens +- [x] SPEC.md §4 BNF note + §5 "Comment handling" subsection +- [x] 10 new lexer unit tests + 8 new parser unit tests +- [x] 9 new corpus entries (123–131) including both Netclaw repros + (sanitized paths per SPEC §14) +- [x] `Directory.Build.props` `VersionPrefix` 0.1.2 → 0.1.3 +- [x] `RELEASE_NOTES.md` 0.1.3-alpha section +- [ ] Cut 0.1.3-alpha tag once branch is merged + --- ## NEXT (0.1.x — additive, post-alpha) +- Newline-as-statement-separator at the parser level (SPEC §4 gap + surfaced by #25). The lexer already emits Whitespace tokens for + newlines with the intent of acting as separators (see the lexer + comment at the newline branch), but `BashCommandParser.SplitIntoSegments` + only splits on `&&` / `||` / `;` / `|`. As a result, `cmd1\ncmd2` + currently parses to one clause `[cmd1]` with `cmd2` as an argument. + v0.1.3 corpus entry 126 works around this with an explicit `;`; + the long-term fix is to bridge newline-Whitespace tokens to the + segment splitter as synthetic `;` separators. 
- Seed 50–100 corpus entries from sanitized real-world dogfood logs (SPEC §14 workflow) - Expand verb tables as corpus surfaces real commands diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 406d123..b2ad45f 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,43 @@ +#### 0.1.3-alpha May 12th 2026 #### + +Bash line comment handling. Public API unchanged. + +**Fixed** + +- **Bash line comments are now recognized and skipped (#25).** `BashLexer` + treats `#` at a word boundary (start of input, or preceded by + whitespace, a newline, or any operator) as the start of a comment + that runs to the next newline. The comment text is emitted as a new + internal `BashTokenKind.Comment` token for source fidelity and is + filtered by the parser alongside `Whitespace` / `Continuation`, so + it contributes no verb, args, redirects, or flags to any clause. + Comment-only input parses to `Clauses = []`, `IsUnparseable = false`, + matching the existing empty-/whitespace-only path. Quoting and + escape rules are honored: `#` inside single or double quotes is + literal, `#` in the interior of an unquoted word (e.g. `abc#def`) + is literal, and `\#` outside quotes is literal. + + Before this fix, `# Extract worktree branches\ngit worktree list` + parsed to a single clause with verb chain `[#, Extract]` — the + comment text leaked into downstream approval prompts and broke + approval-state caching in consumers that did asymmetric verb-chain + extraction (persistence-time vs. retry-authorization saw different + verb sets, causing tool calls to fail after the user had already + clicked Approve). + +**Behavior notes** + +- Public API surface is unchanged (no `PublicApiSnapshotTests` delta). +- SPEC.md §4 / §5: new "Comment handling" subsection in §5 documents + the boundary rules; §4 BNF notes that comments are + whitespace-equivalent at the lexer level. 
+- Corpus: 9 new entries (123–131) pin every case from the issue + report, plus the two Netclaw repros (sanitized paths per §14). +- v0.1 still does not treat top-level newlines as statement separators + (SPEC §4 gap, tracked separately in IMPLEMENTATION_PLAN NEXT) — a + comment between two commands on separate lines requires an explicit + `;` separator to split into two clauses. + #### 0.1.2-alpha May 11th 2026 #### Three parser correctness fixes. Public API unchanged. diff --git a/SPEC.md b/SPEC.md index 91bac16..0897414 100644 --- a/SPEC.md +++ b/SPEC.md @@ -357,6 +357,11 @@ quoted_string := single-quoted | double-quoted - Whitespace between tokens is one or more spaces or tabs. - `\` followed by a newline is a line continuation (treat as whitespace). +- Bash line comments (`#` at a word boundary through end-of-line) are + whitespace-equivalent at the lexer level — they emit a Comment token + for source fidelity but are filtered alongside Whitespace by the + parser, so they do not appear in the grammar. See §5 "Comment + handling" for boundary rules. - `\` before a metachar inside a double-quoted string escapes the metachar. - Single-quoted strings preserve all bytes literally — no escape processing. - Heredocs (`<`, `>>`, `<`, + `2>`, `2>>`, `(`, `)`, `<<`, `<<-`), a quoted string, or an opaque + substitution. Equivalently: `#` is comment-start everywhere the + outer lexer dispatch loop sits, because every other lexer rule has + already consumed its territory before `#` is considered. +- `#` **inside** single or double quotes is a literal character (no + comment). +- `#` in the **interior** of an unquoted word (e.g. `abc#def`) is a + literal character. `ReadWord` consumes the whole word before the + outer loop can see the embedded `#`; there is no re-scanning. +- `\#` (backslash-escaped `#` outside quotes) is consumed by the + normal escape rule — the backslash is dropped and `#` becomes a + regular word character. 
Equivalent example: `cmd \#abc` produces + one Word token `#abc`. +- The terminating newline is **not** consumed by the Comment token. + It survives as a Whitespace token, preserving statement-boundary + semantics for the parser (see §4). +- A Comment token's `Value` is empty (matching `Whitespace` / + `Continuation`); `SourceStart` / `SourceLength` identify the slice + including the leading `#` so callers that need the literal text can + recover it from the original input span. +- **Effect on parsing**: comment-only input parses to + `Clauses = []`, `IsUnparseable = false` — mirroring empty-input + behavior. A comment leading, trailing, or interleaved with a clause + contributes no tokens to the verb chain, args, or redirects of any + clause. + --- ## 6. Verb Tables diff --git a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs index 2373c2b..827ed84 100644 --- a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs +++ b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs @@ -183,6 +183,18 @@ internal static IReadOnlyList<BashToken> Tokenize(string input) continue; } + // ---- line comment ---- + // Reaching this branch implies a word boundary (quotes, + // operators, and opaque regions are dispatched above), so `#` + // here starts a comment to EOL. Mid-word `#` is consumed by + // ReadWord, and `\#` by its escape handling — neither reaches + // this point. SPEC §5. + if (c == '#') + { + i = ConsumeLineComment(src, i, tokens); + continue; + } + // ---- word ---- i = ReadWord(src, i, tokens); } @@ -392,6 +404,26 @@ private static int ConsumeBacktickSubstitution( return start + length; } + // Consume `#` up to (but not including) the first `\n` or `\r`, or to + // end of input. The line terminator is left in the stream for the + // outer loop to tokenize, preserving SPEC §4 clause-boundary + // semantics. 
Value is "" to match Whitespace/Continuation — callers + // that need the literal text can slice the source via + // SourceStart/SourceLength. + private static int ConsumeLineComment( + ReadOnlySpan<char> src, int start, List<BashToken> tokens) + { + var i = start; + while (i < src.Length && src[i] != '\n' && src[i] != '\r') + { + i++; + } + + tokens.Add(new BashToken( + BashTokenKind.Comment, "", null, start, i - start, null)); + return i; + } + private static int ConsumeArithmetic( ReadOnlySpan<char> src, int start, List<BashToken> tokens) { diff --git a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs index 3e13ecc..da786ca 100644 --- a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs +++ b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs @@ -38,6 +38,17 @@ internal enum BashTokenKind /// whitespace by the parser. SPEC §5. Continuation, + /// <summary>A bash line comment — <c>#</c> at a word boundary + /// through end-of-line (the terminating newline is preserved as + /// a separate <see cref="Whitespace"/> token so statement + /// boundaries are unaffected). Emitted for source fidelity with + /// empty <see cref="BashToken.Value"/>; <see cref="BashToken.SourceStart"/> + /// and <see cref="BashToken.SourceLength"/> identify the slice. + /// The parser drops these in <c>FilterSignificant</c> alongside + /// <see cref="Whitespace"/> and <see cref="Continuation"/>. + /// SPEC §5.</summary> + Comment, + /// An opaque region — $(…) or backtick-quoted /// `…`. 
The parser consumes one of these as a single /// Arg{ Kind = DynamicSkip, IsPath = false } per the v0.1 diff --git a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs index c488e0f..c367987 100644 --- a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs +++ b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs @@ -496,7 +496,9 @@ private static List FilterSignificant(IReadOnlyList tokens var filtered = new List(tokens.Count); foreach (var t in tokens) { - if (t.Kind == BashTokenKind.Whitespace || t.Kind == BashTokenKind.Continuation) + if (t.Kind == BashTokenKind.Whitespace + || t.Kind == BashTokenKind.Continuation + || t.Kind == BashTokenKind.Comment) { continue; } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json new file mode 100644 index 0000000..76d81f8 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json @@ -0,0 +1,9 @@ +{ + "name": "Comment-only input produces zero clauses", + "input": "# just a note", + "expected": { + "isUnparseable": false, + "clauses": [] + }, + "notes": "Issue #25 / SPEC §5: a comment-only script is whitespace-equivalent — zero significant tokens → zero clauses, not IsUnparseable." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json new file mode 100644 index 0000000..011c549 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json @@ -0,0 +1,18 @@ +{ + "name": "Leading comment followed by a single command", + "input": "# fetch the latest\ngit pull", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["git", "pull"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25: comment-as-verb regression. The leading explanatory line must not leak into the verb chain; v0.1.3 lexer skips `#`-to-EOL." +} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json new file mode 100644 index 0000000..9364a6c --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json @@ -0,0 +1,18 @@ +{ + "name": "Inline trailing comment is dropped from the clause", + "input": "git pull # update local", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["git", "pull"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 / SPEC §5: a `#` preceded by whitespace starts a comment. The trailing text becomes a Comment token and is filtered out by the parser." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json new file mode 100644 index 0000000..b7b08b4 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json @@ -0,0 +1,26 @@ +{ + "name": "Comment between two ;-separated commands", + "input": "git pull ; # now build\ndotnet build", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["git", "pull"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "Sequence", + "verb": ["dotnet", "build"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25: comment on its own line between two clauses must not pollute either verb chain. v0.1 uses `;` (not `\\n`) as the statement separator — newline-as-separator is a separate SPEC §4 gap tracked in IMPLEMENTATION_PLAN NEXT." +} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json new file mode 100644 index 0000000..c1fc5b4 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json @@ -0,0 +1,20 @@ +{ + "name": "# inside double quotes is literal, not a comment", + "input": "echo \"hash is #1234\"", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["echo"], + "args": [ + { "raw": "\"hash is #1234\"", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 / SPEC §5: `#` is comment-start only at a word boundary. Inside double quotes it's a literal character." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json new file mode 100644 index 0000000..adbb78f --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json @@ -0,0 +1,20 @@ +{ + "name": "# inside single quotes is literal, not a comment", + "input": "echo 'use #foo'", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["echo"], + "args": [ + { "raw": "'use #foo'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 / SPEC §5: `#` inside single quotes is a literal byte (single-quote literalness rule)." +} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json new file mode 100644 index 0000000..f94cd5d --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json @@ -0,0 +1,20 @@ +{ + "name": "# in the interior of an unquoted word is literal", + "input": "echo abc#def", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["echo"], + "args": [ + { "raw": "abc#def", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 / SPEC §5: `#` is comment-start only at a word boundary (start-of-input or preceded by whitespace/operator). Mid-word `#` is consumed by ReadWord and stays literal." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json new file mode 100644 index 0000000..70ee577 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json @@ -0,0 +1,53 @@ +{ + "name": "Netclaw repro: leading comment + git ... | awk | tr | sort pipeline", + "input": "# Extract all unique branch names from worktrees\ngit -C /home/user/repos/sample-repo worktree list | awk '{print $NF}' | tr -d '[]' | sort -u", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["git", "worktree"], + "args": [ + { "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" }, + { "raw": "list", "kind": "Literal", "isPath": false, "isFlag": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "Pipe", + "verb": ["awk"], + "args": [ + { "raw": "'{print $NF}'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "Pipe", + "verb": ["tr"], + "args": [ + { "raw": "-d", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "'[]'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "Pipe", + "verb": ["sort"], + "args": [ + { "raw": "-u", "kind": "Literal", "isPath": false, "isFlag": true } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14 (user repo path → /home/user/repos/sample-repo). 
Without the v0.1.3 comment-skip fix this entry parsed Clause 0 with verb `[#, Extract]` — the failure mode that broke approval-prompt rendering for agents authoring scripts with explanatory comments." +} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json new file mode 100644 index 0000000..0dfdcfd --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json @@ -0,0 +1,44 @@ +{ + "name": "Netclaw repro: leading comment + curl|jq pipeline with ||-fallback", + "input": "# Get open PRs from the upstream repo\ncurl -s \"https://api.github.com/repos/sample-org/sample-repo/pulls?state=open\" | jq -r '.[] | \"PR\"' 2>/dev/null || echo \"API failed, trying alternative...\"", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["curl"], + "args": [ + { "raw": "-s", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "\"https://api.github.com/repos/sample-org/sample-repo/pulls?state=open\"", "kind": "Glob", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "Pipe", + "verb": ["jq"], + "args": [ + { "raw": "-r", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "'.[] | \"PR\"'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [ + { "direction": "ErrOut", "target": "/dev/null" } + ], + "isSubshell": false, + "isBashCWrapped": false + }, + { + "operator": "OrIf", + "verb": ["echo"], + "args": [ + { "raw": "\"API failed, trying alternative...\"", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #25 follow-up: leading comment + `||`-fallback that surfaced the approval-state desync cascade (verb-chain 
extracted as `# Get` at persistence time → cache miss at retry-authorization → tool fails after user clicked Approve). Paths and org names sanitized per SPEC §14." +} diff --git a/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs b/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs index 471586c..0cfb1bf 100644 --- a/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs +++ b/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs @@ -567,6 +567,141 @@ public void Operator_text_is_set_exactly_for_each_kind() ops); } + // ------------------------------------------------------------ comments (SPEC §5) + + [Fact] + public void Comment_only_input_lexes_as_single_comment_token() + { + var tokens = BashLexer.Tokenize("# just a note"); + var t = Assert.Single(tokens); + Assert.Equal(BashTokenKind.Comment, t.Kind); + Assert.Equal(0, t.SourceStart); + Assert.Equal(13, t.SourceLength); + } + + [Fact] + public void Leading_comment_followed_by_newline_and_command_emits_three_significant_tokens() + { + // `# fetch\ngit pull` → Comment, Whitespace(\n), Word(git), Whitespace, Word(pull) + var all = BashLexer.Tokenize("# fetch\ngit pull"); + Assert.Equal(BashTokenKind.Comment, all[0].Kind); + Assert.Equal(7, all[0].SourceLength); + Assert.Equal(BashTokenKind.Whitespace, all[1].Kind); + Assert.Equal(BashTokenKind.Word, all[2].Kind); + Assert.Equal("git", all[2].Value); + } + + [Fact] + public void Inline_trailing_comment_does_not_swallow_preceding_word() + { + var nonWs = LexNonWs("git pull # update local"); + // LexNonWs filters only Whitespace/Continuation, so the Comment is + // still visible here (the parser is what drops it). It starts at the + // `#` (index 9) and runs the remaining 14 characters of the input. 
+ Assert.Equal(3, nonWs.Length); + Assert.Equal(BashTokenKind.Word, nonWs[0].Kind); + Assert.Equal("git", nonWs[0].Value); + Assert.Equal(BashTokenKind.Word, nonWs[1].Kind); + Assert.Equal("pull", nonWs[1].Value); + Assert.Equal(BashTokenKind.Comment, nonWs[2].Kind); + Assert.Equal(9, nonWs[2].SourceStart); + Assert.Equal(14, nonWs[2].SourceLength); + } + + [Fact] + public void Hash_in_middle_of_unquoted_word_is_literal_not_comment() + { + // bash treats `#` as comment-start only at a "word boundary". A `#` + // already inside a word (no preceding whitespace/operator) is just + // another word character. + var tokens = LexNonWs("echo abc#def"); + Assert.Equal(2, tokens.Length); + Assert.Equal(BashTokenKind.Word, tokens[0].Kind); + Assert.Equal("echo", tokens[0].Value); + Assert.Equal(BashTokenKind.Word, tokens[1].Kind); + Assert.Equal("abc#def", tokens[1].Value); + } + + [Fact] + public void Hash_inside_double_quotes_is_literal_not_comment() + { + // The count-equality below would fail if a stray Comment token + // appeared, so a separate DoesNotContain isn't needed. + var tokens = LexNonWs("echo \"hash is #1234\""); + Assert.Equal(2, tokens.Length); + Assert.Equal(BashTokenKind.Word, tokens[0].Kind); + Assert.Equal(BashTokenKind.QuotedString, tokens[1].Kind); + Assert.Equal("hash is #1234", tokens[1].Value); + } + + [Fact] + public void Hash_inside_single_quotes_is_literal_not_comment() + { + var tokens = LexNonWs("echo 'use #foo'"); + Assert.Equal(2, tokens.Length); + Assert.Equal(BashTokenKind.Word, tokens[0].Kind); + Assert.Equal(BashTokenKind.QuotedString, tokens[1].Kind); + Assert.Equal("use #foo", tokens[1].Value); + Assert.True(tokens[1].IsSingleQuoted); + } + + [Fact] + public void Backslash_escaped_hash_is_consumed_by_ReadWord_not_comment() + { + // `\#abc` — escape processing strips the backslash; the word + // is `#abc`. 
This must NOT trip the comment branch because + // ReadWord has already consumed the backslash by the time the + // outer-loop dispatch would see `#`. + var tokens = LexNonWs("\\#abc"); + var t = Assert.Single(tokens); + Assert.Equal(BashTokenKind.Word, t.Kind); + Assert.Equal("#abc", t.Value); + } + + [Fact] + public void Comment_starts_immediately_after_operator_without_whitespace() + { + // `cmd &&# foo` — bash treats `#` as comment-start because `&&` + // ended the previous token; `#` is at a word boundary. + var all = BashLexer.Tokenize("cmd &&# foo"); + // Word(cmd), Whitespace, Operator(&&), Comment(# foo) + Assert.Equal(4, all.Count); + Assert.Equal(BashTokenKind.Word, all[0].Kind); + Assert.Equal(BashTokenKind.Whitespace, all[1].Kind); + Assert.Equal(BashTokenKind.Operator, all[2].Kind); + Assert.Equal("&&", all[2].OperatorText); + Assert.Equal(BashTokenKind.Comment, all[3].Kind); + Assert.Equal(6, all[3].SourceStart); + Assert.Equal(5, all[3].SourceLength); + } + + [Fact] + public void Comment_at_EOF_without_trailing_newline_terminates_naturally() + { + var tokens = BashLexer.Tokenize("echo hi # done"); + // Word(echo), Whitespace, Word(hi), Whitespace, Comment(# done) + Assert.Equal(5, tokens.Count); + Assert.Equal(BashTokenKind.Comment, tokens[4].Kind); + Assert.Equal(14, tokens[4].SourceStart + tokens[4].SourceLength); + } + + [Fact] + public void Comment_does_not_consume_terminating_newline() + { + // The newline must stay in the stream as its own Whitespace token + // (SPEC §5) instead of being absorbed into the Comment before `cmd`. + var tokens = BashLexer.Tokenize("# a\ncmd"); + Assert.Equal(BashTokenKind.Comment, tokens[0].Kind); + Assert.Equal(0, tokens[0].SourceStart); + Assert.Equal(3, tokens[0].SourceLength); + Assert.Equal(BashTokenKind.Whitespace, tokens[1].Kind); + // The Whitespace token covers the newline. 
+ Assert.Equal(3, tokens[1].SourceStart); + Assert.Equal(1, tokens[1].SourceLength); + Assert.Equal(BashTokenKind.Word, tokens[2].Kind); + Assert.Equal("cmd", tokens[2].Value); + } + // ------------------------------------------------------------ misc [Fact] diff --git a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs index d5ff527..02c4a01 100644 --- a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs +++ b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs @@ -569,6 +569,102 @@ public void Whitespace_only_input_returns_empty_clauses() Assert.False(result.IsUnparseable); } + // ---------------- Comments (SPEC §5, issue #25) ---------------- + + [Fact] + public void Comment_only_input_returns_empty_clauses() + { + // Mirrors the empty/whitespace-only path: zero clauses, not + // unparseable. SPEC §5 — comments are whitespace-equivalent. + var result = Parse("# just a note"); + Assert.Equal("# just a note", result.Source); + Assert.Empty(result.Clauses); + Assert.False(result.IsUnparseable); + } + + [Fact] + public void Leading_comment_does_not_pollute_verb_chain() + { + // The exact failure mode from issue #25: a leading explanatory + // comment was being parsed as the verb of the next clause, + // surfacing as `# Extract` in downstream approval prompts. + // BashArity collapses `git worktree` to a 2-token verb in v0.1 + // (the deeper `git worktree list` subcommand is not in the table, + // so `list` lands as a positional arg — see SPEC §6.1). 
+ var result = Parse("# Extract worktree branches\ngit worktree list"); + var clause = Assert.Single(result.Clauses); + Assert.Equal(new[] { "git", "worktree" }, clause.Verb.Tokens); + Assert.Equal("list", clause.Args[0].Raw); + Assert.False(result.IsUnparseable); + } + + [Fact] + public void Inline_trailing_comment_is_dropped_from_clause() + { + var result = Parse("git pull # update local"); + var clause = Assert.Single(result.Clauses); + Assert.Equal(new[] { "git", "pull" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); + } + + [Fact] + public void Comment_between_two_statements_preserves_both_clauses() + { + // v0.1 does not treat top-level newlines as statement separators + // (SPEC §4 gap — separate from #25). Use explicit `;` so the + // separator survives FilterSignificant. + var result = Parse("git pull ; # now build\ndotnet build"); + Assert.Equal(2, result.Clauses.Count); + Assert.Equal(new[] { "git", "pull" }, result.Clauses[0].Verb.Tokens); + Assert.Equal(new[] { "dotnet", "build" }, result.Clauses[1].Verb.Tokens); + Assert.False(result.IsUnparseable); + } + + [Fact] + public void Hash_inside_double_quotes_remains_literal_arg() + { + var result = Parse("echo \"hash is #1234\""); + var clause = Assert.Single(result.Clauses); + Assert.Equal(new[] { "echo" }, clause.Verb.Tokens); + var arg = Assert.Single(clause.Args); + Assert.Equal("\"hash is #1234\"", arg.Raw); + } + + [Fact] + public void Hash_inside_single_quotes_remains_literal_arg() + { + var result = Parse("echo 'use #foo'"); + var clause = Assert.Single(result.Clauses); + Assert.Equal(new[] { "echo" }, clause.Verb.Tokens); + var arg = Assert.Single(clause.Args); + Assert.Equal("'use #foo'", arg.Raw); + } + + [Fact] + public void Hash_mid_word_remains_literal_arg() + { + // Per bash: `#` is a comment-start only at a word boundary. + // `abc#def` is a single word with a literal `#`. 
+ var result = Parse("echo abc#def"); + var clause = Assert.Single(result.Clauses); + Assert.Equal(new[] { "echo" }, clause.Verb.Tokens); + var arg = Assert.Single(clause.Args); + Assert.Equal("abc#def", arg.Raw); + } + + [Fact] + public void Comment_inside_orif_compound_does_not_break_clause_split() + { + // Issue #25 follow-up: a comment line ahead of a `||`-fallback made + // one pass persist `[# Get, echo]` and another `[# Get, curl, jq]`. + // After the fix the two clauses start at `curl` and `echo`. + var result = Parse("# Get open PRs\ncurl example || echo \"failed\""); + Assert.Equal(2, result.Clauses.Count); + Assert.Equal(new[] { "curl" }, result.Clauses[0].Verb.Tokens); + Assert.Equal(new[] { "echo" }, result.Clauses[1].Verb.Tokens); + Assert.False(result.IsUnparseable); + } + // ---------------- Subshell ---------------- [Fact]