diff --git a/Directory.Build.props b/Directory.Build.props
index 8f78ad0..ebca8ce 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -7,7 +7,7 @@
latest
enable
true
- 0.1.2
+ 0.1.3
alpha
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
index cdcbd91..5ef1120 100644
--- a/IMPLEMENTATION_PLAN.md
+++ b/IMPLEMENTATION_PLAN.md
@@ -205,10 +205,32 @@ bulldoze priorities.
- [x] `dotnet pack` validation: icon embedded in `.nupkg` confirmed
- [ ] Re-pack to validate icon embeds
+### 16. Bash line comments (#25) — 0.1.3-alpha
+
+- [x] `BashTokenKind.Comment` enum member (internal)
+- [x] `BashLexer.ConsumeLineComment` helper; `#` dispatch in main scan loop
+- [x] `BashCommandParser.FilterSignificant` drops Comment tokens
+- [x] SPEC.md §4 BNF note + §5 "Comment handling" subsection
+- [x] 10 new lexer unit tests + 8 new parser unit tests
+- [x] 9 new corpus entries (123–131) including both Netclaw repros
+ (sanitized paths per SPEC §14)
+- [x] `Directory.Build.props` `VersionPrefix` 0.1.2 → 0.1.3
+- [x] `RELEASE_NOTES.md` 0.1.3-alpha section
+- [ ] Cut 0.1.3-alpha tag once branch is merged
+
---
## NEXT (0.1.x — additive, post-alpha)
+- Newline-as-statement-separator at the parser level (SPEC §4 gap
+ surfaced by #25). The lexer already emits Whitespace tokens for
+ newlines with the intent of acting as separators (see the lexer
+ comment at the newline branch), but `BashCommandParser.SplitIntoSegments`
+ only splits on `&&` / `||` / `;` / `|`. As a result, `cmd1\ncmd2`
+ currently parses to one clause `[cmd1]` with `cmd2` as an argument.
+ v0.1.3 corpus entry 126 works around this with an explicit `;`;
+ the long-term fix is to bridge newline-Whitespace tokens to the
+ segment splitter as synthetic `;` separators.
- Seed 50–100 corpus entries from sanitized real-world dogfood logs
(SPEC §14 workflow)
- Expand verb tables as corpus surfaces real commands
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 406d123..b2ad45f 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,3 +1,43 @@
+#### 0.1.3-alpha May 12th 2026 ####
+
+Bash line comment handling. Public API unchanged.
+
+**Fixed**
+
+- **Bash line comments are now recognized and skipped (#25).** `BashLexer`
+ treats `#` at a word boundary (start of input, or preceded by
+ whitespace, a newline, or any operator) as the start of a comment
+ that runs to the next newline. The comment text is emitted as a new
+ internal `BashTokenKind.Comment` token for source fidelity and is
+ filtered by the parser alongside `Whitespace` / `Continuation`, so
+ it contributes no verb, args, redirects, or flags to any clause.
+ Comment-only input parses to `Clauses = []`, `IsUnparseable = false`,
+ matching the existing empty-/whitespace-only path. Quoting and
+ escape rules are honored: `#` inside single or double quotes is
+ literal, `#` in the interior of an unquoted word (e.g. `abc#def`)
+ is literal, and `\#` outside quotes is literal.
+
+ Before this fix, `# Extract worktree branches\ngit worktree list`
+ parsed to a single clause with verb chain `[#, Extract]` — the
+ comment text leaked into downstream approval prompts and broke
+ approval-state caching in consumers that did asymmetric verb-chain
+ extraction (persistence-time vs. retry-authorization saw different
+ verb sets, causing tool calls to fail after the user had already
+ clicked Approve).
+
+**Behavior notes**
+
+- Public API surface is unchanged (no `PublicApiSnapshotTests` delta).
+- SPEC.md §4 / §5: new "Comment handling" subsection in §5 documents
+ the boundary rules; §4 BNF notes that comments are
+ whitespace-equivalent at the lexer level.
+- Corpus: 9 new entries (123–131) pin every case from the issue
+ report, including the two Netclaw repros (sanitized paths per §14).
+- v0.1 still does not treat top-level newlines as statement separators
+ (SPEC §4 gap, tracked separately in IMPLEMENTATION_PLAN NEXT) — a
+ comment between two commands on separate lines requires an explicit
+ `;` separator to split into two clauses.
+
#### 0.1.2-alpha May 11th 2026 ####
Three parser correctness fixes. Public API unchanged.
diff --git a/SPEC.md b/SPEC.md
index 91bac16..0897414 100644
--- a/SPEC.md
+++ b/SPEC.md
@@ -357,6 +357,11 @@ quoted_string := single-quoted | double-quoted
- Whitespace between tokens is one or more spaces or tabs.
- `\` followed by a newline is a line continuation (treat as whitespace).
+- Bash line comments (`#` at a word boundary through end-of-line) are
+ whitespace-equivalent at the lexer level — they emit a Comment token
+ for source fidelity but are filtered alongside Whitespace by the
+ parser, so they do not appear in the grammar. See §5 "Comment
+ handling" for boundary rules.
- `\` before a metachar inside a double-quoted string escapes the metachar.
- Single-quoted strings preserve all bytes literally — no escape processing.
- Heredocs (`<`, `>>`, `<`,
+ `2>`, `2>>`, `(`, `)`, `<<`, `<<-`), a quoted string, or an opaque
+ substitution. Equivalently: `#` is comment-start everywhere the
+ outer lexer dispatch loop sits, because every other lexer rule has
+ already consumed its territory before `#` is considered.
+- `#` **inside** single or double quotes is a literal character (no
+ comment).
+- `#` in the **interior** of an unquoted word (e.g. `abc#def`) is a
+ literal character. `ReadWord` consumes the whole word before the
+ outer loop can see the embedded `#`; there is no re-scanning.
+- `\#` (backslash-escaped `#` outside quotes) is consumed by the
+ normal escape rule — the backslash is dropped and `#` becomes a
+ regular word character. For example, in `cmd \#abc` the second word
+ lexes as the single Word token `#abc`.
+- The terminating newline is **not** consumed by the Comment token.
+ It survives as a Whitespace token, preserving statement-boundary
+ semantics for the parser (see §4).
+- A Comment token's `Value` is empty (matching `Whitespace` /
+ `Continuation`); `SourceStart` / `SourceLength` identify the slice
+ including the leading `#` so callers that need the literal text can
+ recover it from the original input span.
+- **Effect on parsing**: comment-only input parses to
+ `Clauses = []`, `IsUnparseable = false` — mirroring empty-input
+ behavior. A comment leading, trailing, or interleaved with a clause
+ contributes no tokens to the verb chain, args, or redirects of any
+ clause.
+
---
## 6. Verb Tables
diff --git a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs
index 2373c2b..827ed84 100644
--- a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs
+++ b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashLexer.cs
@@ -183,6 +183,18 @@ internal static IReadOnlyList Tokenize(string input)
continue;
}
+ // ---- line comment ----
+ // Reaching this branch implies a word boundary (quotes,
+ // operators, and opaque regions are dispatched above), so `#`
+ // here starts a comment to EOL. Mid-word `#` is consumed by
+ // ReadWord, and `\#` by its escape handling — neither reaches
+ // this point. SPEC §5.
+ if (c == '#')
+ {
+ i = ConsumeLineComment(src, i, tokens);
+ continue;
+ }
+
// ---- word ----
i = ReadWord(src, i, tokens);
}
@@ -392,6 +404,26 @@ private static int ConsumeBacktickSubstitution(
return start + length;
}
+ // Consume a line comment: from the `#` at `start` up to (but not
+ // including) the next `\n` or `\r`, or end of input. The line
+ // terminator stays in the stream so the outer loop emits it as a
+ // Whitespace token (SPEC §4/§5). Value is "" to match
+ // Whitespace/Continuation — callers that need the literal text can
+ // slice the source via SourceStart/SourceLength. Returns the end index.
+ private static int ConsumeLineComment(
+     ReadOnlySpan<char> src, int start, List<BashToken> tokens)
+ {
+     var i = start;
+     while (i < src.Length && src[i] != '\n' && src[i] != '\r')
+     {
+         i++;
+     }
+
+     tokens.Add(new BashToken(
+         BashTokenKind.Comment, "", null, start, i - start, null));
+     return i;
+ }
+
private static int ConsumeArithmetic(
ReadOnlySpan src, int start, List tokens)
{
diff --git a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs
index 3e13ecc..da786ca 100644
--- a/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs
+++ b/src/ShellSyntaxTree/Internal/Bash/Lexing/BashTokenKind.cs
@@ -38,6 +38,17 @@ internal enum BashTokenKind
/// whitespace by the parser. SPEC §5.
Continuation,
+ /// <summary>A bash line comment — <c>#</c> at a word boundary
+ /// through end-of-line (the terminating newline is preserved as
+ /// a separate <see cref="Whitespace"/> token so statement
+ /// boundaries are unaffected). Emitted for source fidelity with
+ /// empty <see cref="BashToken.Value"/>; <see cref="BashToken.SourceStart"/>
+ /// and <see cref="BashToken.SourceLength"/> identify the slice.
+ /// The parser drops these in <c>FilterSignificant</c> alongside
+ /// <see cref="Whitespace"/> and <see cref="Continuation"/>.
+ /// SPEC §5.</summary>
+ Comment,
+
/// An opaque region — $(…) or backtick-quoted
/// `…`. The parser consumes one of these as a single
/// Arg{ Kind = DynamicSkip, IsPath = false } per the v0.1
diff --git a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
index c488e0f..c367987 100644
--- a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
+++ b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
@@ -496,7 +496,9 @@ private static List FilterSignificant(IReadOnlyList tokens
var filtered = new List(tokens.Count);
foreach (var t in tokens)
{
- if (t.Kind == BashTokenKind.Whitespace || t.Kind == BashTokenKind.Continuation)
+ if (t.Kind == BashTokenKind.Whitespace
+ || t.Kind == BashTokenKind.Continuation
+ || t.Kind == BashTokenKind.Comment)
{
continue;
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json
new file mode 100644
index 0000000..76d81f8
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/123_comment_only_no_clauses.json
@@ -0,0 +1,9 @@
+{
+ "name": "Comment-only input produces zero clauses",
+ "input": "# just a note",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": []
+ },
+ "notes": "Issue #25 / SPEC §5: a comment-only script is whitespace-equivalent — zero significant tokens → zero clauses, not IsUnparseable."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json
new file mode 100644
index 0000000..011c549
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/124_leading_comment_then_command.json
@@ -0,0 +1,18 @@
+{
+ "name": "Leading comment followed by a single command",
+ "input": "# fetch the latest\ngit pull",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["git", "pull"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25: comment-as-verb regression. The leading explanatory line must not leak into the verb chain; v0.1.3 lexer skips `#`-to-EOL."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json
new file mode 100644
index 0000000..9364a6c
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/125_inline_trailing_comment.json
@@ -0,0 +1,18 @@
+{
+ "name": "Inline trailing comment is dropped from the clause",
+ "input": "git pull # update local",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["git", "pull"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 / SPEC §5: a `#` preceded by whitespace starts a comment. The trailing text becomes a Comment token and is filtered out by the parser."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json
new file mode 100644
index 0000000..b7b08b4
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/126_comment_between_two_commands.json
@@ -0,0 +1,26 @@
+{
+ "name": "Comment between two ;-separated commands",
+ "input": "git pull ; # now build\ndotnet build",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["git", "pull"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "Sequence",
+ "verb": ["dotnet", "build"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25: comment on its own line between two clauses must not pollute either verb chain. v0.1 uses `;` (not `\\n`) as the statement separator — newline-as-separator is a separate SPEC §4 gap tracked in IMPLEMENTATION_PLAN NEXT."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json
new file mode 100644
index 0000000..c1fc5b4
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/127_hash_in_double_quotes_not_comment.json
@@ -0,0 +1,20 @@
+{
+ "name": "# inside double quotes is literal, not a comment",
+ "input": "echo \"hash is #1234\"",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["echo"],
+ "args": [
+ { "raw": "\"hash is #1234\"", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 / SPEC §5: `#` is comment-start only at a word boundary. Inside double quotes it's a literal character."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json
new file mode 100644
index 0000000..adbb78f
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/128_hash_in_single_quotes_not_comment.json
@@ -0,0 +1,20 @@
+{
+ "name": "# inside single quotes is literal, not a comment",
+ "input": "echo 'use #foo'",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["echo"],
+ "args": [
+ { "raw": "'use #foo'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 / SPEC §5: `#` inside single quotes is a literal byte (single-quote literalness rule)."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json
new file mode 100644
index 0000000..f94cd5d
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/129_hash_mid_word_not_comment.json
@@ -0,0 +1,20 @@
+{
+ "name": "# in the interior of an unquoted word is literal",
+ "input": "echo abc#def",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["echo"],
+ "args": [
+ { "raw": "abc#def", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 / SPEC §5: `#` is comment-start only at a word boundary (start-of-input or preceded by whitespace/operator). Mid-word `#` is consumed by ReadWord and stays literal."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json
new file mode 100644
index 0000000..70ee577
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json
@@ -0,0 +1,53 @@
+{
+ "name": "Netclaw repro: leading comment + git ... | awk | tr | sort pipeline",
+ "input": "# Extract all unique branch names from worktrees\ngit -C /home/user/repos/sample-repo worktree list | awk '{print $NF}' | tr -d '[]' | sort -u",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["git", "worktree"],
+ "args": [
+ { "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" },
+ { "raw": "list", "kind": "Literal", "isPath": false, "isFlag": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "Pipe",
+ "verb": ["awk"],
+ "args": [
+ { "raw": "'{print $NF}'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "Pipe",
+ "verb": ["tr"],
+ "args": [
+ { "raw": "-d", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "'[]'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "Pipe",
+ "verb": ["sort"],
+ "args": [
+ { "raw": "-u", "kind": "Literal", "isPath": false, "isFlag": true }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14 (user repo path → /home/user/repos/sample-repo). Without the v0.1.3 comment-skip fix this entry parsed Clause 0 with verb `[#, Extract]` — the failure mode that broke approval-prompt rendering for agents authoring scripts with explanatory comments."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json
new file mode 100644
index 0000000..0dfdcfd
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/131_netclaw_repro_compound_with_comment.json
@@ -0,0 +1,44 @@
+{
+ "name": "Netclaw repro: leading comment + curl|jq pipeline with ||-fallback",
+ "input": "# Get open PRs from the upstream repo\ncurl -s \"https://api.github.com/repos/sample-org/sample-repo/pulls?state=open\" | jq -r '.[] | \"PR\"' 2>/dev/null || echo \"API failed, trying alternative...\"",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["curl"],
+ "args": [
+ { "raw": "-s", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "\"https://api.github.com/repos/sample-org/sample-repo/pulls?state=open\"", "kind": "Glob", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "Pipe",
+ "verb": ["jq"],
+ "args": [
+ { "raw": "-r", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "'.[] | \"PR\"'", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [
+ { "direction": "ErrOut", "target": "/dev/null" }
+ ],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ },
+ {
+ "operator": "OrIf",
+ "verb": ["echo"],
+ "args": [
+ { "raw": "\"API failed, trying alternative...\"", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #25 follow-up: leading comment + `||`-fallback that surfaced the approval-state desync cascade (verb-chain extracted as `# Get` at persistence time → cache miss at retry-authorization → tool fails after user clicked Approve). Paths and org names sanitized per SPEC §14."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs b/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs
index 471586c..0cfb1bf 100644
--- a/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs
+++ b/tests/ShellSyntaxTree.Tests/Lexing/BashLexerTests.cs
@@ -567,6 +567,141 @@ public void Operator_text_is_set_exactly_for_each_kind()
ops);
}
+ // ------------------------------------------------------------ comments (SPEC §5)
+
+ [Fact]
+ public void Comment_only_input_lexes_as_single_comment_token()
+ {
+     var comment = Assert.Single(BashLexer.Tokenize("# just a note"));
+     // The whole 13-character input is covered by one Comment token.
+     Assert.Equal(BashTokenKind.Comment, comment.Kind);
+     Assert.Equal(0, comment.SourceStart);
+     Assert.Equal(13, comment.SourceLength);
+ }
+
+ [Fact]
+ public void Leading_comment_followed_by_newline_and_command_emits_three_significant_tokens()
+ {
+     // Expected stream: Comment, Whitespace(\n), Word(git), Whitespace, Word(pull).
+     var stream = BashLexer.Tokenize("# fetch\ngit pull");
+     Assert.Equal(BashTokenKind.Comment, stream[0].Kind);
+     Assert.Equal(BashTokenKind.Whitespace, stream[1].Kind);
+     Assert.Equal(BashTokenKind.Word, stream[2].Kind);
+     Assert.Equal(7, stream[0].SourceLength);
+     Assert.Equal("git", stream[2].Value);
+ }
+
+ [Fact]
+ public void Inline_trailing_comment_does_not_swallow_preceding_word()
+ {
+     var nonWs = LexNonWs("git pull # update local");
+     // LexNonWs only filters Whitespace/Continuation, so the Comment
+     // is still in this view; the parser is what drops it. The `#`
+     // sits at offset 9 and the comment runs 14 chars to end of input.
+     Assert.Equal(3, nonWs.Length);
+     Assert.Equal(BashTokenKind.Word, nonWs[0].Kind);
+     Assert.Equal("git", nonWs[0].Value);
+     Assert.Equal(BashTokenKind.Word, nonWs[1].Kind);
+     Assert.Equal("pull", nonWs[1].Value);
+     Assert.Equal(BashTokenKind.Comment, nonWs[2].Kind);
+     Assert.Equal(9, nonWs[2].SourceStart);
+     Assert.Equal(14, nonWs[2].SourceLength);
+ }
+
+ [Fact]
+ public void Hash_in_middle_of_unquoted_word_is_literal_not_comment()
+ {
+     // A `#` that is already inside a word — nothing separating it
+     // from the preceding characters — is an ordinary word character
+     // in bash, not a comment-start.
+     var words = LexNonWs("echo abc#def");
+     Assert.Equal(2, words.Length);
+     Assert.Equal("echo", words[0].Value);
+     Assert.Equal("abc#def", words[1].Value);
+     Assert.Equal(BashTokenKind.Word, words[0].Kind);
+     Assert.Equal(BashTokenKind.Word, words[1].Kind);
+ }
+
+ [Fact]
+ public void Hash_inside_double_quotes_is_literal_not_comment()
+ {
+     // Exactly two tokens are expected; a stray Comment token would
+     // change the count, so no separate DoesNotContain check is needed.
+     var lexed = LexNonWs("echo \"hash is #1234\"");
+     Assert.Equal(2, lexed.Length);
+     Assert.Equal(BashTokenKind.QuotedString, lexed[1].Kind);
+     Assert.Equal("hash is #1234", lexed[1].Value);
+     Assert.Equal(BashTokenKind.Word, lexed[0].Kind);
+ }
+
+ [Fact]
+ public void Hash_inside_single_quotes_is_literal_not_comment()
+ {
+     var lexed = LexNonWs("echo 'use #foo'");
+     Assert.Equal(2, lexed.Length);
+     Assert.True(lexed[1].IsSingleQuoted);
+     Assert.Equal("use #foo", lexed[1].Value);
+     Assert.Equal(BashTokenKind.QuotedString, lexed[1].Kind);
+     Assert.Equal(BashTokenKind.Word, lexed[0].Kind);
+ }
+
+ [Fact]
+ public void Backslash_escaped_hash_is_consumed_by_ReadWord_not_comment()
+ {
+     // `\#abc` — the escape rule drops the backslash, leaving the
+     // single word `#abc`. The comment branch must NOT fire: ReadWord
+     // has already consumed the backslash before the outer-loop
+     // dispatch could ever see the `#`.
+     var lexed = LexNonWs("\\#abc");
+     var word = Assert.Single(lexed);
+     Assert.Equal("#abc", word.Value);
+     Assert.Equal(BashTokenKind.Word, word.Kind);
+ }
+
+ [Fact]
+ public void Comment_starts_immediately_after_operator_without_whitespace()
+ {
+     // `cmd &&# foo` — bash treats `#` as comment-start because `&&`
+     // ended the previous token; `#` is at a word boundary.
+     var all = BashLexer.Tokenize("cmd &&# foo");
+     // Word(cmd), Whitespace, Operator(&&), Comment(# foo)
+     Assert.Equal(4, all.Count);
+     Assert.Equal(BashTokenKind.Word, all[0].Kind);
+     Assert.Equal(BashTokenKind.Whitespace, all[1].Kind);
+     Assert.Equal(BashTokenKind.Operator, all[2].Kind);
+     Assert.Equal("&&", all[2].OperatorText);
+     Assert.Equal(BashTokenKind.Comment, all[3].Kind);
+     Assert.Equal(6, all[3].SourceStart);
+     Assert.Equal(5, all[3].SourceLength);
+ }
+
+ [Fact]
+ public void Comment_at_EOF_without_trailing_newline_terminates_naturally()
+ {
+     // Expect: Word(echo), Whitespace, Word(hi), Whitespace, Comment(# done).
+     var stream = BashLexer.Tokenize("echo hi # done");
+     Assert.Equal(5, stream.Count);
+     Assert.Equal(BashTokenKind.Comment, stream[4].Kind);
+     Assert.Equal(14, stream[4].SourceStart + stream[4].SourceLength);
+ }
+
+ [Fact]
+ public void Comment_does_not_consume_terminating_newline()
+ {
+     // The comment must stop before `\n`, leaving the newline in the
+     // stream as its own Whitespace token between `# a` and `cmd`.
+     var stream = BashLexer.Tokenize("# a\ncmd");
+     Assert.Equal(BashTokenKind.Comment, stream[0].Kind);
+     Assert.Equal(BashTokenKind.Whitespace, stream[1].Kind);
+     Assert.Equal(BashTokenKind.Word, stream[2].Kind);
+     // Comment spans offsets 0..2; the newline token is offset 3, length 1.
+     Assert.Equal(0, stream[0].SourceStart);
+     Assert.Equal(3, stream[0].SourceLength);
+     Assert.Equal(3, stream[1].SourceStart);
+     Assert.Equal(1, stream[1].SourceLength);
+     Assert.Equal("cmd", stream[2].Value);
+ }
+
// ------------------------------------------------------------ misc
[Fact]
diff --git a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
index d5ff527..02c4a01 100644
--- a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
+++ b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
@@ -569,6 +569,102 @@ public void Whitespace_only_input_returns_empty_clauses()
Assert.False(result.IsUnparseable);
}
+ // ---------------- Comments (SPEC §5, issue #25) ----------------
+
+ [Fact]
+ public void Comment_only_input_returns_empty_clauses()
+ {
+     // SPEC §5: same shape as empty/whitespace-only input — no clauses, not unparseable.
+     const string script = "# just a note";
+     var parsed = Parse(script);
+     Assert.Equal(script, parsed.Source);
+     Assert.False(parsed.IsUnparseable);
+     Assert.Empty(parsed.Clauses);
+ }
+
+ [Fact]
+ public void Leading_comment_does_not_pollute_verb_chain()
+ {
+     // Issue #25's exact failure: the leading explanatory comment
+     // became the verb of the following clause and showed up as
+     // `# Extract` in downstream approval prompts. In v0.1 BashArity
+     // collapses `git worktree` to a 2-token verb; the deeper
+     // `git worktree list` subcommand is not in the table, so `list`
+     // lands as a positional arg (SPEC §6.1).
+     var parsed = Parse("# Extract worktree branches\ngit worktree list");
+     Assert.False(parsed.IsUnparseable);
+     var only = Assert.Single(parsed.Clauses);
+     Assert.Equal(new[] { "git", "worktree" }, only.Verb.Tokens);
+     Assert.Equal("list", only.Args[0].Raw);
+ }
+
+ [Fact]
+ public void Inline_trailing_comment_is_dropped_from_clause()
+ {
+     var only = Assert.Single(Parse("git pull # update local").Clauses);
+     // The trailing comment contributes neither verb tokens nor args.
+     Assert.Empty(only.Args);
+     Assert.Equal(new[] { "git", "pull" }, only.Verb.Tokens);
+ }
+
+ [Fact]
+ public void Comment_between_two_statements_preserves_both_clauses()
+ {
+     // Top-level newlines are not statement separators in v0.1 (a
+     // SPEC §4 gap, separate from #25), so an explicit `;` supplies
+     // the separator that survives FilterSignificant.
+     var parsed = Parse("git pull ; # now build\ndotnet build");
+     Assert.False(parsed.IsUnparseable);
+     Assert.Equal(2, parsed.Clauses.Count);
+     Assert.Equal(new[] { "git", "pull" }, parsed.Clauses[0].Verb.Tokens);
+     Assert.Equal(new[] { "dotnet", "build" }, parsed.Clauses[1].Verb.Tokens);
+ }
+
+ [Fact]
+ public void Hash_inside_double_quotes_remains_literal_arg()
+ {
+     var only = Assert.Single(Parse("echo \"hash is #1234\"").Clauses);
+     // The quoted `#` stays inside the single literal argument.
+     var operand = Assert.Single(only.Args);
+     Assert.Equal("\"hash is #1234\"", operand.Raw);
+     Assert.Equal(new[] { "echo" }, only.Verb.Tokens);
+ }
+
+ [Fact]
+ public void Hash_inside_single_quotes_remains_literal_arg()
+ {
+     var only = Assert.Single(Parse("echo 'use #foo'").Clauses);
+     // The quoted `#` stays inside the single literal argument.
+     var operand = Assert.Single(only.Args);
+     Assert.Equal("'use #foo'", operand.Raw);
+     Assert.Equal(new[] { "echo" }, only.Verb.Tokens);
+ }
+
+ [Fact]
+ public void Hash_mid_word_remains_literal_arg()
+ {
+     // bash only starts a comment at a word boundary, so `abc#def`
+     // is one word containing a literal `#`.
+     var parsed = Parse("echo abc#def");
+     var only = Assert.Single(parsed.Clauses);
+     var operand = Assert.Single(only.Args);
+     Assert.Equal("abc#def", operand.Raw);
+     Assert.Equal(new[] { "echo" }, only.Verb.Tokens);
+ }
+
+ [Fact]
+ public void Comment_inside_orif_compound_does_not_break_clause_split()
+ {
+     // Issue #25 follow-up: with a leading comment and a ||-fallback,
+     // one pass persisted `[# Get, echo]` and another `[# Get, curl, jq]`.
+     // With the fix both passes see `curl`/`echo`.
+     var parsed = Parse("# Get open PRs\ncurl example || echo \"failed\"");
+     Assert.False(parsed.IsUnparseable);
+     Assert.Equal(2, parsed.Clauses.Count);
+     Assert.Equal(new[] { "echo" }, parsed.Clauses[1].Verb.Tokens);
+     Assert.Equal(new[] { "curl" }, parsed.Clauses[0].Verb.Tokens);
+ }
+
// ---------------- Subshell ----------------
[Fact]