diff --git a/Directory.Build.props b/Directory.Build.props index ebca8ce..0717dbc 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -7,7 +7,7 @@ latest enable true - 0.1.3 + 0.1.4 alpha diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md index 5ef1120..26d4b4d 100644 --- a/IMPLEMENTATION_PLAN.md +++ b/IMPLEMENTATION_PLAN.md @@ -216,7 +216,36 @@ bulldoze priorities. (sanitized paths per SPEC §14) - [x] `Directory.Build.props` `VersionPrefix` 0.1.2 → 0.1.3 - [x] `RELEASE_NOTES.md` 0.1.3-alpha section -- [ ] Cut 0.1.3-alpha tag once branch is merged +- [x] Cut 0.1.3-alpha tag once branch is merged + +### 17. Greedy verb-chain extraction (#27) — 0.1.4-alpha + +- [x] Remove `BashArity` static table and `ProbeArity()` method from + `BashVerbs.cs` +- [x] Add `BashVerbs.IsVerbLikeToken` predicate (strict allow-list: + Word kind, length 1–64, leading `[a-z]`, body `[a-z0-9._-]`) +- [x] Rewrite verb-extraction loop in + `BashCommandParser.ParseClauseSegment` (greedy walk + FileVerb + 1-token carveout + flag-with-value consumption) +- [x] 7 new corpus entries (132–138) for the issue #27 headline cases: + `freshdesk ticket list`, `git -C /repo worktree list --porcelain`, + `kubectl get pods`, `kubectl get pods my-pod`, `aws s3 cp src dst`, + `dotnet ef migrations add InitialCreate`, `cat README` + (FileVerb-carveout proof) +- [x] 11 existing corpus entries flipped to new shape: `04_echo_hello`, + `11_git_push_origin_main`, `13_git_checkout_dev`, + `17_docker_run_nginx`, `27_make_install`, `45_echo_append_log`, + `84_subshell_nested`, `91_bash_c_simple`, + `96_bash_c_nested_depth_2`, `100_bash_c_nested_depth_3`, + `130_netclaw_repro_leading_comment_pipeline` +- [x] 8 unit-test cases updated in `BashCommandParserTests.cs` to match + the new expected verb chains +- [x] SPEC.md updates: §3 `VerbChain`, §4 grammar, §6.1 rewritten end-to-end, + new §6.1.1 consumer pattern-matching guidance, §7 flag-with-value + note, §12 worked examples, §15 versioning, §16 
sequencing +- [x] `Directory.Build.props` `VersionPrefix` 0.1.3 → 0.1.4 +- [x] `RELEASE_NOTES.md` 0.1.4-alpha section +- [ ] Cut 0.1.4-alpha tag once branch is merged --- diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index b2ad45f..fb58774 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,71 @@ +#### 0.1.4-alpha May 12th 2026 #### + +Greedy verb-chain extraction. Public API surface (`VerbChain`, `Clause`) +unchanged; the *content* of `Clause.Verb.Tokens` changes for many inputs. + +**BEHAVIOR CHANGE: verb-chain length is no longer table-driven (#27)** + +- The `BashArity` static lookup table and `ProbeArity()` method have been + **removed**. The parser walks consecutive verb-like Word tokens from + the start of each clause, transparently consuming flag-with-value + pairs (e.g. `git -C /repo`), and stops at the first non-verb-like + token, the first plain flag, or the first non-Word token. +- A token is "verb-like" when its kind is `Word`, length 1–64, first + character is an ASCII lowercase letter, and remaining characters are + in `[a-z0-9._-]`. The strict allow-list naturally excludes flags, + paths (`/`, `\`, `~`), env-var refs (`$VAR`), URLs (`://`), globs, + numeric tokens, and uppercase user-named identifiers like migration + names — without requiring per-case predicate logic. See SPEC §6.1. +- For known FILE verbs (`cat`, `ls`, `bash`, `cd`, `chmod`, `grep`, + `find`, …) the verb chain stops at exactly one token to preserve + per-verb positional-arg classification. The flag-with-value + consumption still runs so `tar -C /path` and `curl -o file` style + values still pick up `IsPath=true` via `FlagValueIsPath`. + +Examples that change: + +- `git push origin main` → verb `[git, push, origin, main]` (was + `[git, push]`). +- `git worktree list` (and arbitrary CLI subcommand chains) → fully + extracted as `[git, worktree, list]` (was `[git, worktree]`). 
+- `freshdesk ticket list --status open` → `[freshdesk, ticket, list]` + (was `[freshdesk]` because freshdesk wasn't in the BashArity table). +- `kubectl get pods my-pod` → `[kubectl, get, pods, my-pod]` (was + `[kubectl, get]`). +- `aws s3 cp src dst` → `[aws, s3, cp, src, dst]` (was `[aws, s3]`). +- `dotnet ef migrations add InitialCreate` → `[dotnet, ef, migrations, + add]` (was `[dotnet, ef]`). `InitialCreate` stays in args because the + predicate rejects uppercase first character. +- `cat README` → still `[cat]` (FileVerb carveout preserves `IsPath` on + bare-name targets). +- `echo hello` → `[echo, hello]` (echo is not a FILE verb). + +`Clause.Verb` is now documented as a **convenience hint, not a security +contract** (SPEC §6.1.1). Consumers needing security-grade verb +identification should pattern-prefix match against the raw token +stream: a command matches an approval pattern *P* iff the first +`len(P.verb_prefix)` command tokens equal `P.verb_prefix`. This punts +depth choice to the consumer and accommodates the parser's deliberate +over-extraction on bare-word args. Auto-proposed patterns should default +to the full extracted verb chain (greedy match): a subsequent variation +re-prompts rather than silently auto-grants. + +**Behavior notes** + +- Public API surface is unchanged (no `PublicApiSnapshotTests` delta). +- SPEC.md updates: §3 `VerbChain`, §4 grammar, §6.1 verb-chain + extraction (rewritten end-to-end), new §6.1.1 consumer + pattern-matching guidance, §7 flag-with-value note, §12 worked + examples, §15 versioning, §16 implementation sequencing. +- Corpus: 7 new entries (132–138) pin the issue #27 headline cases; + 11 existing entries flipped to the new shape (`04_echo_hello`, + `11_git_push_origin_main`, `13_git_checkout_dev`, `17_docker_run_nginx`, + `27_make_install`, `45_echo_append_log`, `84_subshell_nested`, + `91_bash_c_simple`, `96_bash_c_nested_depth_2`, + `100_bash_c_nested_depth_3`, `130_netclaw_repro_leading_comment_pipeline`). 
+- Unit tests: 8 pinned `BashCommandParserTests` cases updated to the new + expected verb chains. + #### 0.1.3-alpha May 12th 2026 #### Bash line comment handling. Public API unchanged. diff --git a/SPEC.md b/SPEC.md index 0897414..79b5b8a 100644 --- a/SPEC.md +++ b/SPEC.md @@ -202,8 +202,10 @@ public sealed record Clause ### `VerbChain` The verb of a clause. Multi-token to handle commands like `git push`, -`docker compose up`, `bun run`, `dotnet test`. Length determined by the -`BashArity` table (see §6). +`docker compose up`, `dotnet ef migrations add`. Length determined by the +greedy verb-chain heuristic in §6.1 — consecutive verb-like Word tokens +from the start of the clause, transparently consuming flag-with-value +pairs, with a 1-token carveout for FILE verbs. ```csharp public sealed record VerbChain @@ -343,7 +345,12 @@ clause := subshell | bash_c_wrapper | simple_clause subshell := "(" command ")" bash_c_wrapper := ("bash" | "sh") "-c" QUOTED_STRING simple_clause := verb_chain arg* redirect* -verb_chain := word{1..N} // N = BashArity[word_0] +verb_chain := verb_like_word (FW_pair? verb_like_word)* + // greedy walk per §6.1; FW_pair is a + // flag-with-value pair owned by word_0 + // (transparent to the walk); stops at + // the first non-verb-like token. For + // word_0 ∈ FileVerbs, exactly 1 token. arg := word | flag | quoted_string flag := "-" letter+ | "--" word redirect := redirect_op target @@ -479,57 +486,118 @@ must handle this. These are **data**, not logic. Implement as `static readonly` collections. -### 6.1 BashArity - -How many tokens form the verb chain for known commands. Defaults to 1 -when not in the table. 
- -```csharp -internal static readonly IReadOnlyDictionary BashArity = - new Dictionary(StringComparer.OrdinalIgnoreCase) -{ - // Two-token verbs - ["git"] = 2, // git push, git log, git checkout - ["dotnet"] = 2, // dotnet test, dotnet build - ["npm"] = 2, // npm install, npm run - ["yarn"] = 2, // yarn add, yarn install - ["pnpm"] = 2, // pnpm add, pnpm install - ["cargo"] = 2, // cargo build, cargo test - ["go"] = 2, // go build, go test, go run - ["kubectl"] = 2, // kubectl apply, kubectl get - ["helm"] = 2, // helm install, helm upgrade - ["systemctl"] = 2, // systemctl start, systemctl status - ["service"] = 2, // service nginx - ["pip"] = 2, // pip install, pip uninstall - ["pip3"] = 2, // pip3 install - ["brew"] = 2, // brew install, brew upgrade - ["apt"] = 2, // apt install, apt update - ["apt-get"] = 2, // apt-get install - ["yum"] = 2, // yum install - ["dnf"] = 2, // dnf install - ["pacman"] = 2, // pacman -S, pacman -Syu (the -S is the verb) - ["aws"] = 2, // aws s3, aws ec2 (top-level) - ["gcloud"] = 2, // gcloud compute, gcloud auth - ["az"] = 2, // az vm, az storage - - // Three-token verbs - ["docker"] = 2, // docker run, docker ps; "docker compose up" handled below - ["docker compose"] = 3, // docker compose up, docker compose down - ["docker-compose"] = 2, // legacy single-token form - ["bun"] = 2, // bun run, bun install - ["bun run"] = 3, // bun run my-script (treat as 3-tuple verb) - ["nuget"] = 2, // nuget push, nuget pack -}; -``` - -The table is not exhaustive. Missing verbs default to 1 token. Add entries -as the corpus surfaces real commands. - -**Implementation note:** The parser must look up multi-token verbs by -joining the first 1, 2, then 3 tokens and probing the table from longest -to shortest. So `docker compose up nginx` first probes `docker compose up` -(not in table; arity defaults wouldn't fire), then probes `docker compose` -(in table → arity 3) → verb chain is the first 3 tokens. 
+### 6.1 Verb-chain extraction (greedy heuristic) + +Per issue #27 (locked in v0.1.4-alpha), the parser does not consult a +static arity table. Instead, it walks consecutive verb-like Word tokens +from the start of the clause and stops at the first token that doesn't +look like a subcommand. This naturally scales to unknown CLIs +(`freshdesk ticket list`, `kubectl get pods`, `dotnet ef migrations add`) +without curated table entries. + +#### IsVerbLikeToken predicate + +A token is "verb-like" when **all** of these hold: + +- `Kind == BashTokenKind.Word` (quoted strings are values, never verbs at + index ≥ 1). +- Length is in `[1, 64]` characters. +- First character is an ASCII lowercase letter `[a-z]`. +- Remaining characters are drawn from `[a-z0-9._-]` only. + +The predicate is implemented in `BashVerbs.IsVerbLikeToken`. The leading +lowercase requirement mirrors real CLI subcommand convention; the +character allow-list naturally excludes flags (`-x` starts with `-`), +paths (`/`, `\`, `~`), env-var refs (`$VAR`), URLs (`://`), globs +(`* ? [`), and user-named identifiers (uppercase first char like +`InitialCreate`). + +#### Walk algorithm + +For a clause whose first token is a Word `firstVerb`: + +1. Append `firstVerb` to the verb chain (it does not need to satisfy + `IsVerbLikeToken` — bare commands like `Curl` or `_init` are still + commands). +2. Iterate the remaining tokens in order. For each token `t`: + - If `t.Kind != Word`: **stop**. + - If `t` is a flag (`IsFlagWord`): + - If `firstVerb` has a `FlagsWithValue` entry containing + `StripEqualsValue(t.Value)` AND the next token is `Word` or + `QuotedString` AND `t.Value` has no inline `=`: consume both as a + flag-value pair, mark their indices for `consumedFlagValueIndices`, + and continue walking. + - Otherwise: **stop**. + - If `firstVerb ∈ FileVerbs`: **stop** (1-token carveout — see below). + - If `!IsVerbLikeToken(t)`: **stop**. + - Otherwise: append `t.Value` to the verb chain and continue. 
+ +If the first token is a `QuotedString` (e.g. `"git" push origin main`), +emit a 1-token verb chain `[firstVerb]` and skip the walk entirely. Bash +treats the quoted form as a verb-identity carrier; remaining tokens are +arg-list material. + +#### FileVerb 1-token carveout + +For verbs in §6.3 `FileVerbs` (file-mutation, file-read, editors, +compression, shell loaders, etc.), the verb chain stops at exactly one +token. The flag-with-value consumption still runs so the value of +`curl -o file`, `tar -C /path`, `git -C /repo` style flags picks up +`IsPath=true` via the `FlagValueIsPath` mechanism. + +The carveout exists because FileVerbs use SPEC §7 per-verb positional +rules to classify args as paths. Without it, a bare-name target like +`cat README` would over-extract — `README` is shape-wise verb-like — +and lose the `IsPath=true` classification downstream consumers depend +on for zone-gate evaluation. + +#### Examples + +| Input | Verb chain | Args | +|---|---|---| +| `git push origin main` | `[git, push, origin, main]` | `[]` (over-extracts; see §6.1.1) | +| `git -C /repo worktree list --porcelain` | `[git, worktree, list]` | `[-C, /repo, --porcelain]` | +| `freshdesk ticket list --status open` | `[freshdesk, ticket, list]` | `[--status, open]` | +| `kubectl get pods my-pod` | `[kubectl, get, pods, my-pod]` | `[]` | +| `aws s3 cp src dst` | `[aws, s3, cp, src, dst]` | `[]` (bare-word path args over-extract) | +| `dotnet ef migrations add InitialCreate` | `[dotnet, ef, migrations, add]` | `[InitialCreate]` (stops at uppercase) | +| `cat /etc/passwd` | `[cat]` | `[/etc/passwd]` (FileVerb carveout) | +| `cat README` | `[cat]` | `[README]` (FileVerb carveout preserves IsPath) | +| `ls -la /tmp` | `[ls]` | `[-la, /tmp]` (FileVerb carveout) | +| `chmod 755 file` | `[chmod]` | `[755, file]` (digit-start kills walk; FileVerb anyway) | +| `echo hello` | `[echo, hello]` | `[]` (echo is not a FileVerb; over-extracts) | + +### 6.1.1 Consumer pattern-matching guidance + 
+`Clause.Verb` is a **convenience hint, not a security contract**. +The parser deliberately over-extracts on bare-word args because no +syntactic rule disambiguates `origin` (a branch name) from `worktree` +(a subcommand verb) without per-CLI semantic knowledge — and we will +not bake per-CLI knowledge into the parser. + +Consumers needing security-grade verb identification should pattern-prefix +match against the raw token stream: + +> A command matches an approval pattern `P` if and only if the first +> `len(P.verb_prefix)` tokens of the command equal `P.verb_prefix`. + +This punts depth choice to the consumer (via the pattern they author) +and accommodates the parser's over-extraction transparently: + +- Pattern `git push *` (verb-prefix length 2) matches `git push origin + main` because the first two command tokens are `[git, push]`. +- Pattern `kubectl get pods *` (verb-prefix length 3) matches + `kubectl get pods my-pod` because the first three tokens are + `[kubectl, get, pods]`. +- Auto-proposed patterns for unknown commands should default to + the **full** extracted verb chain (greedy match), which is the + security-correct default: a subsequent variation re-prompts rather + than silently auto-grants. Operators wanting broader grants opt in + explicitly. + +False-negative (re-prompt) is recoverable. False-positive (silent +destructive grant) is not. Narrow-by-default favors the recoverable +failure mode. ### 6.2 CWD verbs @@ -643,11 +711,12 @@ internal static readonly IReadOnlyDictionary> > because `IReadOnlySet` is .NET 5+ only and the library > multi-targets `netstandard2.0`. Internal-only — no public-API impact. -> **Note (PR 3 → PR 4 follow-up):** the verb-chain probe must run *after* -> the flag-with-value pair is consumed for invocations like -> `git -C /repo log`. 
PR 3 ships the probe at token zero (the simpler -> shape); PR 4 lands the flag-with-value-aware probe so `git -C /repo log` -> produces `Verb.Tokens = ["git", "log"]` per SPEC §12's example. +> **Note:** the verb-chain walk consumes flag-with-value pairs +> transparently. For `git -C /repo log`, the walk consumes `-C /repo` +> before evaluating the next token; `log` is then verb-like and extends +> the chain, producing `Verb.Tokens = ["git", "log"]` per §12's example. +> The same mechanic lets `git -C /repo worktree list` extract the full +> 3-token chain per §6.1. When a flag-with-value consumes the next token, the consumed token's `IsPath` flag is set if the value is path-shaped (per the resolver in §8). @@ -959,22 +1028,42 @@ ParsedCommand { } ``` -### Multi-token verb +### Multi-token verb (greedy over-extraction) Input: `git push origin main` ``` Clauses = [ Clause { - Verb = VerbChain { Tokens = ["git", "push"] }, + Verb = VerbChain { Tokens = ["git", "push", "origin", "main"] }, + Args = [] + } +] +``` + +The greedy heuristic absorbs `origin` and `main` because they're +syntactically indistinguishable from subcommand verbs (lowercase +identifiers, no path-shape). Consumers gating on `git push *` use +pattern-prefix length 2 — see §6.1.1. + +Input: `freshdesk ticket list --status open` + +``` +Clauses = [ + Clause { + Verb = VerbChain { Tokens = ["freshdesk", "ticket", "list"] }, Args = [ - Arg { Raw = "origin", Kind = Literal, IsPath = false }, - Arg { Raw = "main", Kind = Literal, IsPath = false } + Arg { Raw = "--status", Kind = Literal, IsFlag = true }, + Arg { Raw = "open", Kind = Literal, IsPath = false } ] } ] ``` +The walk stops at `--status` (a flag with no `FlagsWithValue` entry for +`freshdesk`). The full subcommand stack is captured without requiring a +curated table entry — the canonical benefit motivating the change. 
+ ### Compound with cd attribution Input: `cd /target && cmd1 && cmd2 file.txt` @@ -1235,10 +1324,14 @@ Adapt for ShellSyntaxTree: ### Versioning -- **v0.1.0-alpha** — first publishable cut. Bash-only. Public API surface - per §2 is locked; internal changes are free. -- **v0.1.x** — additive changes (more verb table entries, more corpus, - bug fixes). +- **v0.1.x-alpha** — pre-release alpha cycle. Public API surface per §2 is + locked; internal data and behavior are subject to course-correction + while real-world feedback lands (e.g. v0.1.4-alpha replaces the + `BashArity` static table with the greedy verb-chain heuristic per + issue #27). +- **v0.1.0** — first publishable non-alpha cut. Bash-only. +- **v0.1.x** (post-0.1.0) — additive changes only (more verb table + entries, more corpus, bug fixes that don't shift parsed-AST shape). - **v0.2.0** — first PowerShell parser implementation. - **v1.0.0** — ready when at least one external consumer beyond Netclaw ships against it without finding API gaps. @@ -1275,7 +1368,7 @@ A natural order for the implementer: but throw `NotImplementedException` on `Parse()`. Lock the surface first. 4. **Implement BashLexer** (§5). Heavy unit tests on tokenization. -5. **Implement BashArity / FILE / CWD verb tables** (§6) as static data. +5. **Implement FILE / CWD verb tables and IsVerbLikeToken predicate** (§6) as static data + helper. 6. **Implement BashParser** (§4). One production at a time; unit-test each. 7. **Implement Resolver** (§8). Unit-test each resolution rule. 
diff --git a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs index c367987..1cd3306 100644 --- a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs +++ b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs @@ -769,89 +769,96 @@ private static ClauseResult ParseClauseSegment( return ClauseResult.Empty(); } - // ---- Verb-chain extraction with flag-with-value awareness ---- - // - // PR 4 follow-up to the PR 3 probe: a token like `git -C /repo log` - // shouldn't truncate the verb chain at `-C`. We greedily consume - // any flag-with-value pair owned by the tentative first verb - // (`tokens[0]`) before probing arity. The consumed flag + value - // pair stays in the segment for arg-extraction; only the verb - // probe sees a "compressed" view of the segment. - // - // Locked interpretation #8 / SPEC §12: `git -C /repo log` → - // Verb=["git", "log"], Args=[-C, /repo]. The flag and its value - // appear in source order in the Args list, with /repo carrying - // IsPath=true via the FlagValueIsPath table. - var verbCandidateValues = new List(3); - var verbCandidateIndices = new List(3); + // Verb-chain extraction per SPEC §6.1. The FileVerb carveout is + // load-bearing: downstream per-verb positional-arg classification + // depends on the verb chain staying 1 token for FILE verbs so + // bare-name targets like `cat README` still surface as Args with + // IsPath=true. Flag-with-value consumption must run *before* the + // carveout gate so `tar -C /repo` still attributes IsPath to /repo. var consumedFlagValueIndices = new HashSet(); + var verbTokens = new List(4); + var verbPositions = new HashSet(); - // Look up the would-be verb so we know which flags are - // "owned" by it. We only honor the flag-with-value skip when the - // first token is a known Word verb — quoted strings and opaque - // substitutions don't carry verb identity. - string? 
tentativeVerb = null; - if (segment.Tokens.Count > 0 - && segment.Tokens[0].Kind == BashTokenKind.Word - && !IsFlagWord(segment.Tokens[0])) + var firstToken = segment.Tokens[0]; + string? firstVerb = null; + if (firstToken.Kind == BashTokenKind.Word && !IsFlagWord(firstToken)) { - tentativeVerb = segment.Tokens[0].Value; + firstVerb = firstToken.Value; + verbTokens.Add(firstVerb); + verbPositions.Add(0); + } + else if (firstToken.Kind == BashTokenKind.QuotedString) + { + // Quoted command (`"git" push`): emit a 1-token chain and skip + // the walk. Bash semantics treat the quoted form as a verb + // identity carrier; remaining tokens are arg-list material. + verbTokens.Add(firstToken.Value); + verbPositions.Add(0); } - var hasFlagsTable = tentativeVerb is not null - && BashVerbs.FlagsWithValue.TryGetValue(tentativeVerb, out _); + BashVerbs.FlagsWithValue.TryGetValue(firstVerb ?? string.Empty, out var flagsForVerb); + var fileVerbCarveout = firstVerb is not null + && BashVerbs.FileVerbs.Contains(firstVerb); - for (var i = 0; i < segment.Tokens.Count && verbCandidateValues.Count < 3; i++) + if (firstVerb is not null) { - var t = segment.Tokens[i]; - if (t.Kind == BashTokenKind.Word || t.Kind == BashTokenKind.QuotedString) + for (var i = 1; i < segment.Tokens.Count; i++) { + var t = segment.Tokens[i]; + if (t.Kind != BashTokenKind.Word) + { + break; + } + if (IsFlagWord(t)) { - // Skip-through case: this is a flag-with-value pair owned - // by the tentative verb. Skip both the flag and its - // immediate value and keep probing arity. Only Word - // tokens qualify as flags (quoted "-x" stays literal). - if (hasFlagsTable - && tentativeVerb is not null - && BashVerbs.FlagsWithValue[tentativeVerb].Contains(StripEqualsValue(t.Value)) - && i + 1 < segment.Tokens.Count - && (segment.Tokens[i + 1].Kind == BashTokenKind.Word - || segment.Tokens[i + 1].Kind == BashTokenKind.QuotedString)) + if (flagsForVerb is null) { - // The two-token `-C /repo` form. 
Equals-form - // `--git-dir=/repo` is a single token and never enters - // this branch — but the verb-probe still ends at it - // (next iteration sees IsFlagWord and breaks below). - if (HasInlineEqualsValue(t.Value)) - { - // `--flag=value` — single token. Don't consume - // the next, and let the normal arg-extraction - // path split on `=`. End the verb-probe here. - break; - } + break; + } + + var eq = t.Value.IndexOf('='); + var flagKey = eq > 0 ? t.Value.Substring(0, eq) : t.Value; + if (!flagsForVerb.Contains(flagKey)) + { + break; + } - consumedFlagValueIndices.Add(i); - consumedFlagValueIndices.Add(i + 1); - i++; // skip the value too on the next loop step - continue; + if (eq > 0) + { + // `--flag=value` is a single token; arg-extraction + // splits on `=`. Stop the walk here. + break; + } + + if (i + 1 >= segment.Tokens.Count + || (segment.Tokens[i + 1].Kind != BashTokenKind.Word + && segment.Tokens[i + 1].Kind != BashTokenKind.QuotedString)) + { + break; } - // Plain flag with no path-value to skip → stops the verb probe. + consumedFlagValueIndices.Add(i); + consumedFlagValueIndices.Add(i + 1); + i++; + continue; + } + + if (fileVerbCarveout || !BashVerbs.IsVerbLikeToken(t)) + { break; } - verbCandidateValues.Add(t.Value); - verbCandidateIndices.Add(i); - continue; + verbTokens.Add(t.Value); + verbPositions.Add(i); } - - break; } - if (verbCandidateValues.Count == 0) + if (verbTokens.Count == 0) { - // Redirect-only clause. + // Redirect-only clause: no verb identified (e.g. clause starts + // with an operator, opaque substitution, or a leading flag with + // no Word firstVerb). 
ExtractRedirectsAndArgs( segment.Tokens, 0, @@ -879,40 +886,17 @@ private static ClauseResult ParseClauseSegment( }); } - var firstVerb = verbCandidateValues[0]; - if (BashVerbs.ControlFlowKeywords.Contains(firstVerb)) + if (BashVerbs.ControlFlowKeywords.Contains(verbTokens[0])) { return ClauseResult.Fail( - $"control-flow keyword '{firstVerb}' is not supported in v0.1"); - } - - var arity = BashVerbs.ProbeArity(verbCandidateValues, 0); - if (arity <= 0) - { - arity = 1; - } - - var verbTokens = new List(arity); - for (var k = 0; k < arity; k++) - { - verbTokens.Add(verbCandidateValues[k]); + $"control-flow keyword '{verbTokens[0]}' is not supported in v0.1"); } var verbChain = new VerbChain { Tokens = verbTokens }; - // The arg-extraction starts immediately after the last verb-chain - // *position* in the original segment, so the consumed flag-value - // pair (which sits *before* that position when it precedes the - // verb-chain extension) still gets emitted as Args in source order. - // Concretely: for `git -C /repo log`, the verb-chain positions are - // 0 and 3; we walk all of segment.Tokens from position 0 and emit - // -C, /repo as args while skipping the verb-position tokens. - var argStart = 0; - var verbPositions = new HashSet(verbCandidateIndices.GetRange(0, arity)); - ExtractRedirectsAndArgs( segment.Tokens, - argStart, + 0, source, options, verb: verbChain, @@ -951,20 +935,6 @@ private static bool IsFlagWord(BashToken token) return token.Value.Length > 0 && token.Value[0] == '-'; } - /// - /// For an equals-form flag like --output=file.txt, return the - /// flag portion (--output) so the FlagsWithValue table lookup - /// matches. For plain flags returns the input unchanged. - /// - private static string StripEqualsValue(string flag) - { - var eq = flag.IndexOf('='); - return eq > 0 ? 
flag.Substring(0, eq) : flag; - } - - private static bool HasInlineEqualsValue(string flag) => - flag.IndexOf('=') > 0; - // ---------------------------------------------------------------- args + redirects /// diff --git a/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs b/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs index 6d2ea26..d8c4ccb 100644 --- a/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs +++ b/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs @@ -1,10 +1,11 @@ -// ----------------------------------------------------------------------- +// ----------------------------------------------------------------------- // // Copyright (C) 2026 - 2026 Aaron Stannard // // ----------------------------------------------------------------------- using System; using System.Collections.Generic; +using ShellSyntaxTree.Internal.Bash.Lexing; namespace ShellSyntaxTree.Internal.Bash.Verbs; @@ -16,62 +17,6 @@ namespace ShellSyntaxTree.Internal.Bash.Verbs; /// internal static class BashVerbs { - /// - /// How many tokens form the verb chain for known commands. Defaults to - /// 1 when not in the table. SPEC §6.1. - /// - /// - /// - /// Per the SPEC §6.1 implementation note, the parser must look up - /// multi-token verbs by joining the first 1, 2, then 3 tokens and - /// probing the table from longest to shortest. So - /// docker compose up nginx first probes docker compose up - /// (not in table), then docker compose (arity 3 — match). The - /// matched key's value is the verb chain length, so e.g. - /// docker compose's value of 3 means "consume three source - /// tokens as the verb chain." - /// - /// - /// The table is non-exhaustive. Verbs not listed default to a 1-token - /// chain. Add entries as the corpus surfaces real commands. - /// - /// - internal static readonly IReadOnlyDictionary BashArity = - new Dictionary(StringComparer.OrdinalIgnoreCase) - { - // Two-token verbs. 
- ["git"] = 2, - ["dotnet"] = 2, - ["npm"] = 2, - ["yarn"] = 2, - ["pnpm"] = 2, - ["cargo"] = 2, - ["go"] = 2, - ["kubectl"] = 2, - ["helm"] = 2, - ["systemctl"] = 2, - ["service"] = 2, - ["pip"] = 2, - ["pip3"] = 2, - ["brew"] = 2, - ["apt"] = 2, - ["apt-get"] = 2, - ["yum"] = 2, - ["dnf"] = 2, - ["pacman"] = 2, - ["aws"] = 2, - ["gcloud"] = 2, - ["az"] = 2, - ["docker"] = 2, - ["docker-compose"] = 2, - ["bun"] = 2, - ["nuget"] = 2, - - // Three-token verbs. - ["docker compose"] = 3, - ["bun run"] = 3, - }; - /// /// Verbs whose first non-flag positional arg becomes the cwd for /// subsequent clauses in the same compound. SPEC §6.2. @@ -175,52 +120,56 @@ internal static readonly IReadOnlyDictionary> }; /// - /// Resolve the verb-chain length for a token sequence at . - /// Implements the longest-prefix probe from SPEC §6.1: try the first 3 - /// tokens, then 2, then 1, returning the matching arity. + /// SPEC §6.1: returns true when has the + /// shape of a CLI subcommand verb — a bare lowercase identifier + /// containing only ASCII letters, digits, hyphens, dots, and underscores. + /// Used to terminate the greedy verb-chain walk at the first token that + /// looks like a value rather than another subcommand. /// - /// Source-order list of verb-candidate tokens. - /// Index into where the verb chain begins. - /// - /// 1, 2, or 3 — the number of tokens that form the verb chain. - /// Defaults to 1 when no match is found (per SPEC §6.1) or when fewer - /// than the probed-prefix length tokens remain. 
- /// - internal static int ProbeArity(IReadOnlyList tokens, int start) + /// + /// Strict allow-list (leading [a-z], body [a-z0-9._-]) + /// over the more obvious negation-of-LooksLikePath because it stays + /// conservative for unknown shapes: a token like readme.md + /// satisfies the allow-list and would extend an unknown CLI's verb + /// chain, but the FileVerb carveout in BashCommandParser + /// short-circuits the common case (cat readme.md) before the + /// allow-list ever runs. Quoted strings are excluded so the user's + /// intent to treat bytes literally is preserved. The 64-char bound + /// is a defensive cap against pathological inputs. + /// + internal static bool IsVerbLikeToken(in BashToken token) { - var available = tokens.Count - start; - if (available <= 0) + if (token.Kind != BashTokenKind.Word) { - return 0; + return false; } - // Three-token probe. - if (available >= 3) + var v = token.Value; + if (v.Length == 0 || v.Length > 64) { - var key3 = tokens[start] + " " + tokens[start + 1] + " " + tokens[start + 2]; - if (BashArity.TryGetValue(key3, out var arity3) && arity3 == 3) - { - return 3; - } + return false; } - // Two-token probe. - if (available >= 2) + var first = v[0]; + if (!(first >= 'a' && first <= 'z')) { - var key2 = tokens[start] + " " + tokens[start + 1]; - if (BashArity.TryGetValue(key2, out var arity2)) - { - return Math.Min(arity2, available); - } + return false; } - // Single-token probe (default arity 1). - if (BashArity.TryGetValue(tokens[start], out var arity1)) + for (var i = 1; i < v.Length; i++) { - return Math.Min(arity1, available); + var c = v[i]; + var ok = + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '-' || c == '.' 
|| c == '_'; + if (!ok) + { + return false; + } } - return 1; + return true; } /// diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json index 8dae71c..42868b2 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json @@ -1,19 +1,18 @@ { - "name": "Simple verb: echo hello", + "name": "Simple verb: echo hello (greedy walk absorbs bare-word arg)", "input": "echo hello", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "hello", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "hello"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": false } ] - } + }, + "notes": "Issue #27: `echo` is not a FILE verb, so the greedy heuristic walks verb-like tokens past it. `hello` matches the verb-like predicate (lowercase identifier, no path-shape) and is absorbed into the verb chain. Over-extraction is documented and acceptable; consumers wanting to gate on `echo *` use pattern-prefix matching against the first token." 
} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json index 7240c59..962bcbe 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json @@ -1,20 +1,18 @@ { - "name": "bash -c nested depth 3: three wrappers consumed", + "name": "bash -c nested depth 3 with greedy verb chain: three wrappers consumed", "input": "bash -c \"bash -c \\\"bash -c \\\\\\\"echo deep\\\\\\\"\\\"\"", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "deep", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "deep"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": true } ] }, - "notes": "Three nested bash -c wrappers all consumed; innermost echo surfaces flat. Depth 3 is under the cap of 5; the BashCommandParserTests cover the depth-6 overflow case." + "notes": "Three nested bash -c wrappers all consumed; innermost echo surfaces flat. Depth 3 is under the cap of 5; the BashCommandParserTests cover the depth-6 overflow case. Issue #27: `deep` absorbed into the verb chain." 
} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json index faf08a7..90d2148 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json @@ -1,21 +1,18 @@ { - "name": "Multi-token verb: git push origin main", + "name": "Greedy verb chain: git push origin main (over-extracts on bare-word remote+branch)", "input": "git push origin main", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["git", "push"], - "args": [ - { "raw": "origin", "kind": "Literal", "isPath": false }, - { "raw": "main", "kind": "Literal", "isPath": false } - ], + "verb": ["git", "push", "origin", "main"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": false } ] }, - "notes": "git arity=2 → first two source tokens become the verb chain." + "notes": "Issue #27: documented over-extraction. `origin` and `main` are syntactically indistinguishable from subcommand verbs without per-CLI knowledge. Consumers gating on `git push *` use pattern-prefix length (2) rather than relying on the parser to guess depth." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json index 70ee577..f1a0747 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json @@ -1,16 +1,15 @@ { - "name": "Netclaw repro: leading comment + git ... 
| awk | tr | sort pipeline", + "name": "Netclaw repro: leading comment + git worktree list | awk | tr | sort pipeline", "input": "# Extract all unique branch names from worktrees\ngit -C /home/user/repos/sample-repo worktree list | awk '{print $NF}' | tr -d '[]' | sort -u", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["git", "worktree"], + "verb": ["git", "worktree", "list"], "args": [ { "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true }, - { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" }, - { "raw": "list", "kind": "Literal", "isPath": false, "isFlag": false, "resolved": "__NULL__" } + { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" } ], "redirects": [], "isSubshell": false, @@ -49,5 +48,5 @@ } ] }, - "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14 (user repo path → /home/user/repos/sample-repo). Without the v0.1.3 comment-skip fix this entry parsed Clause 0 with verb `[#, Extract]` — the failure mode that broke approval-prompt rendering for agents authoring scripts with explanatory comments." + "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14. Without the v0.1.3 comment-skip fix, Clause 0's verb parsed as `[#, Extract]`. Updated for issue #27 / v0.1.4-alpha: the greedy heuristic now walks past the consumed `-C /repo` flag-value pair and captures `worktree list` as part of the verb chain instead of leaving `list` as a stranded positional arg." 
} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json new file mode 100644 index 0000000..de85b9f --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json @@ -0,0 +1,21 @@ +{ + "name": "Greedy verb chain: freshdesk ticket list --status open (unknown CLI)", + "input": "freshdesk ticket list --status open", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["freshdesk", "ticket", "list"], + "args": [ + { "raw": "--status", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "open", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27 headline: unknown/private CLIs (freshdesk, internal tools) used to truncate to a 1-token verb chain. The greedy heuristic walks verb-like Word tokens (lowercase[a-z0-9._-]) and stops at the first flag, so the canonical subcommand stack `freshdesk ticket list` extracts fully without requiring a curated table entry." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json new file mode 100644 index 0000000..aaa9899 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json @@ -0,0 +1,22 @@ +{ + "name": "Greedy verb chain past consumed flag-value: git -C /repo worktree list --porcelain", + "input": "git -C /repo worktree list --porcelain", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["git", "worktree", "list"], + "args": [ + { "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true }, + { "raw": "/repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/repo" }, + { "raw": "--porcelain", "kind": "Literal", "isPath": false, "isFlag": true } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27: the heuristic must walk PAST a consumed flag-with-value pair to extract trailing verb-like tokens. `-C /repo` is consumed via FlagsWithValue[git]; `worktree` and `list` are then both verb-like; `--porcelain` stops the walk. /repo carries IsPath=true via the FlagValueIsPath mechanism." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json new file mode 100644 index 0000000..41f38aa --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json @@ -0,0 +1,18 @@ +{ + "name": "Greedy verb chain: kubectl get pods", + "input": "kubectl get pods", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["kubectl", "get", "pods"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27: kubectl resource-type pluralizations (`pods`, `services`, `deployments`) used to be silently truncated under the BashArity=2 default for kubectl. The greedy heuristic captures the full subcommand stack." +} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json new file mode 100644 index 0000000..3175589 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json @@ -0,0 +1,18 @@ +{ + "name": "Greedy verb chain over-extracts on bare-word arg: kubectl get pods my-pod", + "input": "kubectl get pods my-pod", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["kubectl", "get", "pods", "my-pod"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27: documented over-extraction. `my-pod` is syntactically indistinguishable from a subcommand verb (lowercase + hyphen, no path-shape). Consumers needing security-grade matching use pattern-prefix length, not parser-level depth guessing — see SPEC §6.1.1." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json new file mode 100644 index 0000000..a6dcb23 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json @@ -0,0 +1,18 @@ +{ + "name": "Greedy verb chain over-extracts on bare-word path args: aws s3 cp src dst", + "input": "aws s3 cp src dst", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["aws", "s3", "cp", "src", "dst"], + "args": [], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27: documented over-extraction. `src` and `dst` are bare-word path-like args but lack path-shape (no slash, no extension, no tilde). The parser cannot syntactically distinguish them from subcommand verbs. Consumers that need to recover the path semantics walk the token stream directly; for verb-pattern gates, prefix-length matching against the consumer's pattern is the correct approach." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json new file mode 100644 index 0000000..e46e76e --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json @@ -0,0 +1,20 @@ +{ + "name": "Greedy verb chain stops at uppercase: dotnet ef migrations add InitialCreate", + "input": "dotnet ef migrations add InitialCreate", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["dotnet", "ef", "migrations", "add"], + "args": [ + { "raw": "InitialCreate", "kind": "Literal", "isPath": false, "resolved": "__NULL__" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27: the IsVerbLikeToken predicate requires a lowercase ASCII letter as the first character (mirroring real CLI subcommand convention). User-named identifiers like migration names (`InitialCreate`) naturally fall out of the verb chain because they start with an uppercase letter. This is an example of the heuristic happening to do the right thing — though it's incidental, not a guarantee." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_filevervb_carveout.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_filevervb_carveout.json new file mode 100644 index 0000000..f321354 --- /dev/null +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_filevervb_carveout.json @@ -0,0 +1,20 @@ +{ + "name": "FileVerb 1-token carveout: cat README (bare-name preserves IsPath)", + "input": "cat README", + "expected": { + "isUnparseable": false, + "clauses": [ + { + "operator": "None", + "verb": ["cat"], + "args": [ + { "raw": "README", "kind": "Literal", "isPath": true, "resolved": "/work/README" } + ], + "redirects": [], + "isSubshell": false, + "isBashCWrapped": false + } + ] + }, + "notes": "Issue #27 / FileVerb carveout: for known FILE verbs (`cat`, `ls`, `bash`, `cd`, `chmod`, `grep`, `find`, …) the heuristic stops after the first verb token so per-verb positional-arg classification still fires. `README` itself would also stop the walk under the strict predicate (uppercase first letter), but consider `cat hello` or `bash myscript` — bare-name targets whose shape *would* be verb-like. The carveout is what guarantees those targets remain as Args with IsPath=true, preserving the zone-gate semantics downstream consumers depend on." 
+} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json index 80d0a3a..12b6d3c 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json @@ -1,19 +1,18 @@ { - "name": "Multi-token verb: git checkout dev", + "name": "Greedy verb chain: git checkout dev (over-extracts on branch name)", "input": "git checkout dev", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["git", "checkout"], - "args": [ - { "raw": "dev", "kind": "Literal", "isPath": false } - ], + "verb": ["git", "checkout", "dev"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": false } ] - } + }, + "notes": "Issue #27: documented over-extraction. `dev` is a branch name but syntactically matches the verb-like predicate (lowercase identifier). Consumers gating on `git checkout *` use pattern-prefix length (2)." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json index 75cc9c8..bd16602 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json @@ -1,19 +1,18 @@ { - "name": "Multi-token verb: docker run nginx", + "name": "Greedy verb chain: docker run nginx (over-extracts on image name)", "input": "docker run nginx", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["docker", "run"], - "args": [ - { "raw": "nginx", "kind": "Literal", "isPath": false } - ], + "verb": ["docker", "run", "nginx"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": false } ] - } + }, + "notes": "Issue #27: documented over-extraction. `nginx` is a Docker image name but syntactically matches the verb-like predicate. A registry-qualified image (e.g. 
`registry.example.com/ns/nginx:1.25`) would stop the walk because `/` and `:` are rejected by the verb-like allow-list. Consumers gating on `docker run *` use pattern-prefix length (2)." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json index 50a12dd..ad6b646 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json @@ -1,5 +1,5 @@ { - "name": "Compound: make && make install", + "name": "Compound with greedy verb chain: make && make install", "input": "make && make install", "expected": { "isUnparseable": false, @@ -14,14 +14,13 @@ }, { "operator": "AndIf", - "verb": ["make"], - "args": [ - { "raw": "install", "kind": "Literal", "isPath": false } - ], + "verb": ["make", "install"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": false } ] - } + }, + "notes": "Issue #27: `install` is a verb-like lowercase identifier so the greedy heuristic absorbs it into the second clause's verb chain. `make` is not a FILE verb." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json index f114505..00b5730 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json @@ -1,15 +1,13 @@ { - "name": "Append redirect: echo hi >> log.txt", + "name": "Append redirect with greedy verb chain: echo hi >> log.txt", "input": "echo hi >> log.txt", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "hi", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "hi"], + "args": [], "redirects": [ { "direction": "Append", "target": "/work/log.txt" } ], @@ -18,5 +16,5 @@ } ] }, - "notes": "PR 4: redirect target resolves; `echo` is not a FileVerb so `hi` stays IsPath=false."
+ "notes": "Issue #27: `echo` is not a FILE verb so the greedy heuristic absorbs `hi` into the verb chain. Redirect operators (`>>`) stop the walk regardless of the absorbed-arg shape; the redirect target still resolves to /work/log.txt via the existing resolver path." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json index 0abdba0..6183698 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json @@ -1,20 +1,18 @@ { - "name": "Nested subshells: ((echo deep))", + "name": "Nested subshells with greedy verb chain: ((echo deep))", "input": "((echo deep))", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "deep", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "deep"], + "args": [], "redirects": [], "isSubshell": true, "isBashCWrapped": false } ] }, - "notes": "Two levels of parens; the inner clause is still IsSubshell=true." + "notes": "Two levels of parens; the inner clause is still IsSubshell=true. Issue #27: `deep` is verb-like and absorbed into the verb chain because `echo` is not a FILE verb." 
} diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json index 17da9e1..5d2da93 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json @@ -1,20 +1,18 @@ { - "name": "bash -c simple: bash -c \"echo hi\"", + "name": "bash -c simple with greedy verb chain: bash -c \"echo hi\"", "input": "bash -c \"echo hi\"", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "hi", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "hi"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": true } ] }, - "notes": "bash -c wrapper consumed; inner echo surfaces with IsBashCWrapped=true." + "notes": "bash -c wrapper consumed; inner echo surfaces with IsBashCWrapped=true. Issue #27: `hi` is verb-like and absorbed into the inner clause's verb chain." } diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json index caaace3..40d2ce7 100644 --- a/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json +++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json @@ -1,20 +1,18 @@ { - "name": "bash -c nested depth 2: bash -c \"bash -c \\\"echo hi\\\"\"", + "name": "bash -c nested depth 2 with greedy verb chain: bash -c \"bash -c \\\"echo hi\\\"\"", "input": "bash -c \"bash -c \\\"echo hi\\\"\"", "expected": { "isUnparseable": false, "clauses": [ { "operator": "None", - "verb": ["echo"], - "args": [ - { "raw": "hi", "kind": "Literal", "isPath": false } - ], + "verb": ["echo", "hi"], + "args": [], "redirects": [], "isSubshell": false, "isBashCWrapped": true } ] }, - "notes": "Two wrappers consumed; the innermost echo surfaces flat. Depth 2 well under the cap of 5." 
+ "notes": "Two wrappers consumed; the innermost echo surfaces flat. Depth 2 well under the cap of 5. Issue #27: `hi` absorbed into the verb chain." } diff --git a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs index 02c4a01..f4e16ec 100644 --- a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs +++ b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs @@ -95,13 +95,17 @@ public void Single_quoted_arg_preserves_quotes_in_raw() // ---------------- Multi-token verb chains ---------------- [Fact] - public void Two_token_verb_git_push() + public void Greedy_verb_chain_absorbs_bare_word_args_git_push_origin_main() { + // Issue #27: documented over-extraction. `origin` and `main` are + // syntactically indistinguishable from subcommand verbs (lowercase + // identifiers, no path-shape). Consumers gating on `git push *` + // use pattern-prefix length (2) — see SPEC §6.1.1. var result = Parse("git push origin main"); Assert.False(result.IsUnparseable); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "git", "push" }, clause.Verb.Tokens); - Assert.Equal(new[] { "origin", "main" }, clause.Args.Select(a => a.Raw).ToArray()); + Assert.Equal(new[] { "git", "push", "origin", "main" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); } [Fact] @@ -114,13 +118,18 @@ public void Two_token_verb_dotnet_test() } [Fact] - public void Three_token_verb_docker_compose_up() + public void Greedy_verb_chain_walks_through_docker_compose_up_nginx() { + // Issue #27: `docker compose up nginx` over-extracts because `nginx` + // is a verb-like lowercase identifier. The previous BashArity + // approach capped the chain at 3 tokens for `docker compose`; the + // greedy heuristic walks until a non-verb-like token. Consumers + // gating on `docker compose up *` use pattern-prefix length (3). 
var result = Parse("docker compose up nginx"); Assert.False(result.IsUnparseable); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "docker", "compose", "up" }, clause.Verb.Tokens); - Assert.Equal(new[] { "nginx" }, clause.Args.Select(a => a.Raw).ToArray()); + Assert.Equal(new[] { "docker", "compose", "up", "nginx" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); } [Fact] @@ -134,13 +143,17 @@ public void Three_token_verb_bun_run_my_script() } [Fact] - public void Two_token_verb_docker_run() + public void Greedy_verb_chain_absorbs_docker_run_image_name() { + // Issue #27: `nginx` is a docker image name but syntactically a + // verb-like identifier. A registry-qualified image like + // `registry.example.com/ns/nginx:1.25` would stop the walk via + // path-shape rejection. var result = Parse("docker run nginx"); Assert.False(result.IsUnparseable); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "docker", "run" }, clause.Verb.Tokens); - Assert.Equal(new[] { "nginx" }, clause.Args.Select(a => a.Raw).ToArray()); + Assert.Equal(new[] { "docker", "run", "nginx" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); } [Fact] @@ -153,12 +166,17 @@ public void Verb_chain_capped_by_available_tokens() } [Fact] - public void Default_arity_when_verb_unknown() + public void Unknown_verb_with_bare_word_args_extracts_full_chain() { + // Issue #27: under the greedy heuristic, unknown verbs (not in any + // table) extract the full chain of consecutive verb-like tokens. + // This is strictly better than the old default arity of 1 and is + // what the issue asks for — `freshdesk ticket list` and similar private + // CLIs surface their full subcommand stack without curation.
var result = Parse("totally-unknown-verb foo bar"); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "totally-unknown-verb" }, clause.Verb.Tokens); - Assert.Equal(new[] { "foo", "bar" }, clause.Args.Select(a => a.Raw).ToArray()); + Assert.Equal(new[] { "totally-unknown-verb", "foo", "bar" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); } [Fact] @@ -481,7 +499,9 @@ public void Sh_c_recurses_same_as_bash_c() var result = Parse("sh -c \"echo hi\""); Assert.False(result.IsUnparseable); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "echo" }, clause.Verb.Tokens); + // Issue #27: `hi` is verb-like and absorbed into the inner clause's + // verb chain (echo is not a FILE verb). + Assert.Equal(new[] { "echo", "hi" }, clause.Verb.Tokens); Assert.True(clause.IsBashCWrapped); } @@ -526,7 +546,8 @@ public void Bash_c_nested_depth_2_parses() var result = Parse("bash -c \"bash -c \\\"echo hi\\\"\""); Assert.False(result.IsUnparseable); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "echo" }, clause.Verb.Tokens); + // Issue #27: `hi` absorbed into the verb chain. + Assert.Equal(new[] { "echo", "hi" }, clause.Verb.Tokens); Assert.True(clause.IsBashCWrapped); } @@ -587,14 +608,14 @@ public void Leading_comment_does_not_pollute_verb_chain() { // The exact failure mode from issue #25: a leading explanatory // comment was being parsed as the verb of the next clause, - // surfacing as `# Extract` in downstream approval prompts. - // BashArity collapses `git worktree` to a 2-token verb in v0.1 - // (the deeper `git worktree list` subcommand is not in the table, - // so `list` lands as a positional arg — see SPEC §6.1). + // surfacing as `# Extract` in downstream approval prompts. Issue + // #27 follow-up: the greedy verb-chain heuristic now captures + // `worktree` and `list` together — the parser used to truncate + // the chain at the BashArity=2 default for git. 
var result = Parse("# Extract worktree branches\ngit worktree list"); var clause = Assert.Single(result.Clauses); - Assert.Equal(new[] { "git", "worktree" }, clause.Verb.Tokens); - Assert.Equal("list", clause.Args[0].Raw); + Assert.Equal(new[] { "git", "worktree", "list" }, clause.Verb.Tokens); + Assert.Empty(clause.Args); Assert.False(result.IsUnparseable); } @@ -1051,11 +1072,15 @@ public void Docker_volume_value_is_not_a_path_per_locked_interpretation_8() { var result = Parse("docker run -v /host:/container nginx"); var clause = Assert.Single(result.Clauses); - // verb chain probe: -v /host:/container should be consumed; verb = ["docker", "run"] - Assert.Equal(new[] { "docker", "run" }, clause.Verb.Tokens); - Assert.Equal(3, clause.Args.Count); + // Issue #27 follow-up: the verb-chain walker consumes `-v + // /host:/container` as a flag-with-value pair, then `nginx` + // (verb-like) extends the chain. Verb = ["docker", "run", "nginx"]; + // the volume-mount value still surfaces as an arg with IsPath=false + // per locked interpretation #8 (colon-joined target is not a path). + Assert.Equal(new[] { "docker", "run", "nginx" }, clause.Verb.Tokens); + Assert.Equal(2, clause.Args.Count); Assert.Equal("-v", clause.Args[0].Raw); - Assert.False(clause.Args[1].IsPath); // colon-joined volume mount, NOT a path + Assert.False(clause.Args[1].IsPath); Assert.Equal("/host:/container", clause.Args[1].Raw); } }