diff --git a/Directory.Build.props b/Directory.Build.props
index ebca8ce..0717dbc 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -7,7 +7,7 @@
latest
enable
true
- 0.1.3
+ 0.1.4
alpha
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
index 5ef1120..26d4b4d 100644
--- a/IMPLEMENTATION_PLAN.md
+++ b/IMPLEMENTATION_PLAN.md
@@ -216,7 +216,36 @@ bulldoze priorities.
(sanitized paths per SPEC §14)
- [x] `Directory.Build.props` `VersionPrefix` 0.1.2 → 0.1.3
- [x] `RELEASE_NOTES.md` 0.1.3-alpha section
-- [ ] Cut 0.1.3-alpha tag once branch is merged
+- [x] Cut 0.1.3-alpha tag once branch is merged
+
+### 17. Greedy verb-chain extraction (#27) — 0.1.4-alpha
+
+- [x] Remove `BashArity` static table and `ProbeArity()` method from
+ `BashVerbs.cs`
+- [x] Add `BashVerbs.IsVerbLikeToken` predicate (strict allow-list:
+ Word kind, length 1–64, leading `[a-z]`, body `[a-z0-9._-]`)
+- [x] Rewrite verb-extraction loop in
+ `BashCommandParser.ParseClauseSegment` (greedy walk + FileVerb
+ 1-token carveout + flag-with-value consumption)
+- [x] 7 new corpus entries (132–138) for the issue #27 headline cases:
+ `freshdesk ticket list`, `git -C /repo worktree list --porcelain`,
+ `kubectl get pods`, `kubectl get pods my-pod`, `aws s3 cp src dst`,
+ `dotnet ef migrations add InitialCreate`, `cat README`
+ (FileVerb-carveout proof)
+- [x] 11 existing corpus entries flipped to new shape: `04_echo_hello`,
+ `11_git_push_origin_main`, `13_git_checkout_dev`,
+ `17_docker_run_nginx`, `27_make_install`, `45_echo_append_log`,
+ `84_subshell_nested`, `91_bash_c_simple`,
+ `96_bash_c_nested_depth_2`, `100_bash_c_nested_depth_3`,
+ `130_netclaw_repro_leading_comment_pipeline`
+- [x] 8 unit-test cases updated in `BashCommandParserTests.cs` to match
+ the new expected verb chains
+- [x] SPEC.md updates: §3 `VerbChain`, §4 grammar, §6.1 rewritten end-to-end,
+ new §6.1.1 consumer pattern-matching guidance, §7 flag-with-value
+ note, §12 worked examples, §15 versioning, §16 sequencing
+- [x] `Directory.Build.props` `VersionPrefix` 0.1.3 → 0.1.4
+- [x] `RELEASE_NOTES.md` 0.1.4-alpha section
+- [ ] Cut 0.1.4-alpha tag once branch is merged
---
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index b2ad45f..fb58774 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,3 +1,71 @@
+#### 0.1.4-alpha May 12th 2026 ####
+
+Greedy verb-chain extraction. Public API surface (`VerbChain`, `Clause`)
+unchanged; the *content* of `Clause.Verb.Tokens` changes for many inputs.
+
+**BEHAVIOR CHANGE: verb-chain length is no longer table-driven (#27)**
+
+- The `BashArity` static lookup table and `ProbeArity()` method have been
+ **removed**. The parser walks consecutive verb-like Word tokens from
+ the start of each clause, transparently consuming flag-with-value
+ pairs (e.g. `git -C /repo`), and stops at the first non-verb-like
+ token, the first plain flag, or the first non-Word token.
+- A token is "verb-like" when its kind is `Word`, length 1–64, first
+ character is an ASCII lowercase letter, and remaining characters are
+ in `[a-z0-9._-]`. The strict allow-list naturally excludes flags,
+ paths (`/`, `\`, `~`), env-var refs (`$VAR`), URLs (`://`), globs,
+ numeric tokens, and uppercase user-named identifiers like migration
+ names — without requiring per-case predicate logic. See SPEC §6.1.
+- For known FILE verbs (`cat`, `ls`, `bash`, `cd`, `chmod`, `grep`,
+ `find`, …) the verb chain stops at exactly one token to preserve
+ per-verb positional-arg classification. The flag-with-value
+ consumption still runs so `tar -C /path` and `curl -o file` style
+ values still pick up `IsPath=true` via `FlagValueIsPath`.
+
+Examples that change:
+
+- `git push origin main` → verb `[git, push, origin, main]` (was
+ `[git, push]`).
+- `git worktree list` (and arbitrary CLI subcommand chains) → fully
+ extracted as `[git, worktree, list]` (was `[git, worktree]`).
+- `freshdesk ticket list --status open` → `[freshdesk, ticket, list]`
+ (was `[freshdesk]` because freshdesk wasn't in the BashArity table).
+- `kubectl get pods my-pod` → `[kubectl, get, pods, my-pod]` (was
+ `[kubectl, get]`).
+- `aws s3 cp src dst` → `[aws, s3, cp, src, dst]` (was `[aws, s3]`).
+- `dotnet ef migrations add InitialCreate` → `[dotnet, ef, migrations,
+ add]` (was `[dotnet, ef]`). `InitialCreate` stays in args because the
+ predicate rejects uppercase first character.
+- `cat README` → still `[cat]` (FileVerb carveout preserves `IsPath` on
+ bare-name targets).
+- `echo hello` → `[echo, hello]` (echo is not a FILE verb).
+
+`Clause.Verb` is now documented as a **convenience hint, not a security
+contract** (SPEC §6.1.1). Consumers needing security-grade verb
+identification should pattern-prefix match against the raw token
+stream: a command matches an approval pattern *P* iff the first
+`len(P.verb_prefix)` command tokens equal `P.verb_prefix`. This punts
+depth choice to the consumer and accommodates the parser's deliberate
+over-extraction on bare-word args. Auto-proposed patterns should default
+to the full extracted verb chain (greedy match): a subsequent variation
+re-prompts rather than silently auto-grants.
+
+**Behavior notes**
+
+- Public API surface is unchanged (no `PublicApiSnapshotTests` delta).
+- SPEC.md updates: §3 `VerbChain`, §4 grammar, §6.1 verb-chain
+ extraction (rewritten end-to-end), new §6.1.1 consumer
+ pattern-matching guidance, §7 flag-with-value note, §12 worked
+ examples, §15 versioning, §16 implementation sequencing.
+- Corpus: 7 new entries (132–138) pin the issue #27 headline cases;
+ 11 existing entries flipped to the new shape (`04_echo_hello`,
+ `11_git_push_origin_main`, `13_git_checkout_dev`, `17_docker_run_nginx`,
+ `27_make_install`, `45_echo_append_log`, `84_subshell_nested`,
+ `91_bash_c_simple`, `96_bash_c_nested_depth_2`,
+ `100_bash_c_nested_depth_3`, `130_netclaw_repro_leading_comment_pipeline`).
+- Unit tests: 8 pinned `BashCommandParserTests` cases updated to the new
+ expected verb chains.
+
#### 0.1.3-alpha May 12th 2026 ####
Bash line comment handling. Public API unchanged.
diff --git a/SPEC.md b/SPEC.md
index 0897414..79b5b8a 100644
--- a/SPEC.md
+++ b/SPEC.md
@@ -202,8 +202,10 @@ public sealed record Clause
### `VerbChain`
The verb of a clause. Multi-token to handle commands like `git push`,
-`docker compose up`, `bun run`, `dotnet test`. Length determined by the
-`BashArity` table (see §6).
+`docker compose up`, `dotnet ef migrations add`. Length determined by the
+greedy verb-chain heuristic in §6.1 — consecutive verb-like Word tokens
+from the start of the clause, transparently consuming flag-with-value
+pairs, with a 1-token carveout for FILE verbs.
```csharp
public sealed record VerbChain
@@ -343,7 +345,12 @@ clause := subshell | bash_c_wrapper | simple_clause
subshell := "(" command ")"
bash_c_wrapper := ("bash" | "sh") "-c" QUOTED_STRING
simple_clause := verb_chain arg* redirect*
-verb_chain := word{1..N} // N = BashArity[word_0]
+verb_chain := word (FW_pair* verb_like_word)*
+ // greedy walk per §6.1; FW_pair is a
+ // flag-with-value pair owned by word_0
+ // (transparent to the walk); stops at
+ // the first non-verb-like token. For
+ // word_0 ∈ FileVerbs, exactly 1 token.
arg := word | flag | quoted_string
flag := "-" letter+ | "--" word
redirect := redirect_op target
@@ -479,57 +486,118 @@ must handle this.
These are **data**, not logic. Implement as `static readonly` collections.
-### 6.1 BashArity
-
-How many tokens form the verb chain for known commands. Defaults to 1
-when not in the table.
-
-```csharp
-internal static readonly IReadOnlyDictionary BashArity =
- new Dictionary(StringComparer.OrdinalIgnoreCase)
-{
- // Two-token verbs
- ["git"] = 2, // git push, git log, git checkout
- ["dotnet"] = 2, // dotnet test, dotnet build
- ["npm"] = 2, // npm install, npm run
- ["yarn"] = 2, // yarn add, yarn install
- ["pnpm"] = 2, // pnpm add, pnpm install
- ["cargo"] = 2, // cargo build, cargo test
- ["go"] = 2, // go build, go test, go run
- ["kubectl"] = 2, // kubectl apply, kubectl get
- ["helm"] = 2, // helm install, helm upgrade
- ["systemctl"] = 2, // systemctl start, systemctl status
- ["service"] = 2, // service nginx
- ["pip"] = 2, // pip install, pip uninstall
- ["pip3"] = 2, // pip3 install
- ["brew"] = 2, // brew install, brew upgrade
- ["apt"] = 2, // apt install, apt update
- ["apt-get"] = 2, // apt-get install
- ["yum"] = 2, // yum install
- ["dnf"] = 2, // dnf install
- ["pacman"] = 2, // pacman -S, pacman -Syu (the -S is the verb)
- ["aws"] = 2, // aws s3, aws ec2 (top-level)
- ["gcloud"] = 2, // gcloud compute, gcloud auth
- ["az"] = 2, // az vm, az storage
-
- // Three-token verbs
- ["docker"] = 2, // docker run, docker ps; "docker compose up" handled below
- ["docker compose"] = 3, // docker compose up, docker compose down
- ["docker-compose"] = 2, // legacy single-token form
- ["bun"] = 2, // bun run, bun install
- ["bun run"] = 3, // bun run my-script (treat as 3-tuple verb)
- ["nuget"] = 2, // nuget push, nuget pack
-};
-```
-
-The table is not exhaustive. Missing verbs default to 1 token. Add entries
-as the corpus surfaces real commands.
-
-**Implementation note:** The parser must look up multi-token verbs by
-joining the first 1, 2, then 3 tokens and probing the table from longest
-to shortest. So `docker compose up nginx` first probes `docker compose up`
-(not in table; arity defaults wouldn't fire), then probes `docker compose`
-(in table → arity 3) → verb chain is the first 3 tokens.
+### 6.1 Verb-chain extraction (greedy heuristic)
+
+Per issue #27 (locked in v0.1.4-alpha), the parser does not consult a
+static arity table. Instead, it walks consecutive verb-like Word tokens
+from the start of the clause and stops at the first token that doesn't
+look like a subcommand. This naturally scales to unknown CLIs
+(`freshdesk ticket list`, `kubectl get pods`, `dotnet ef migrations add`)
+without curated table entries.
+
+#### IsVerbLikeToken predicate
+
+A token is "verb-like" when **all** of these hold:
+
+- `Kind == BashTokenKind.Word` (quoted strings are values, never verbs at
+ index ≥ 1).
+- Length is in `[1, 64]` characters.
+- First character is an ASCII lowercase letter `[a-z]`.
+- Remaining characters are drawn from `[a-z0-9._-]` only.
+
+The predicate is implemented in `BashVerbs.IsVerbLikeToken`. The leading
+lowercase requirement mirrors real CLI subcommand convention; the
+character allow-list naturally excludes flags (`-x` starts with `-`),
+paths (`/`, `\`, `~`), env-var refs (`$VAR`), URLs (`://`), globs
+(`* ? [`), and user-named identifiers (uppercase first char like
+`InitialCreate`).
+
+#### Walk algorithm
+
+For a clause whose first token is a Word `firstVerb`:
+
+1. Append `firstVerb` to the verb chain (it does not need to satisfy
+ `IsVerbLikeToken` — bare commands like `Curl` or `_init` are still
+ commands).
+2. Iterate the remaining tokens in order. For each token `t`:
+ - If `t.Kind != Word`: **stop**.
+ - If `t` is a flag (`IsFlagWord`):
+ - If `firstVerb` has a `FlagsWithValue` entry containing the flag
+ name (`t.Value` truncated at the first `=`, if any) AND the next
+ token is `Word` or `QuotedString` AND `t.Value` has no inline `=`:
+ consume both as a flag-value pair, mark their indices for
+ `consumedFlagValueIndices`, and continue walking.
+ - Otherwise: **stop**.
+ - If `firstVerb ∈ FileVerbs`: **stop** (1-token carveout — see below).
+ - If `!IsVerbLikeToken(t)`: **stop**.
+ - Otherwise: append `t.Value` to the verb chain and continue.
+
+If the first token is a `QuotedString` (e.g. `"git" push origin main`),
+emit a 1-token verb chain `[firstVerb]` and skip the walk entirely. Bash
+treats the quoted form as a verb-identity carrier; remaining tokens are
+arg-list material.
+
+#### FileVerb 1-token carveout
+
+For verbs in §6.3 `FileVerbs` (file-mutation, file-read, editors,
+compression, shell loaders, etc.), the verb chain stops at exactly one
+token. The flag-with-value consumption still runs so the value of
+`curl -o file`, `tar -C /path`, `git -C /repo` style flags picks up
+`IsPath=true` via the `FlagValueIsPath` mechanism.
+
+The carveout exists because FileVerbs use SPEC §7 per-verb positional
+rules to classify args as paths. Without it, a lowercase bare-name
+target like `cat readme.md` would over-extract — `readme.md` satisfies
+`IsVerbLikeToken` — and lose the `IsPath=true` classification
+downstream consumers depend on for zone-gate evaluation. (An uppercase
+target such as `cat README` is already rejected by the predicate's
+leading-`[a-z]` rule.)
+
+#### Examples
+
+| Input | Verb chain | Args |
+|---|---|---|
+| `git push origin main` | `[git, push, origin, main]` | `[]` (over-extracts; see §6.1.1) |
+| `git -C /repo worktree list --porcelain` | `[git, worktree, list]` | `[-C, /repo, --porcelain]` |
+| `freshdesk ticket list --status open` | `[freshdesk, ticket, list]` | `[--status, open]` |
+| `kubectl get pods my-pod` | `[kubectl, get, pods, my-pod]` | `[]` |
+| `aws s3 cp src dst` | `[aws, s3, cp, src, dst]` | `[]` (bare-word path args over-extract) |
+| `dotnet ef migrations add InitialCreate` | `[dotnet, ef, migrations, add]` | `[InitialCreate]` (stops at uppercase) |
+| `cat /etc/passwd` | `[cat]` | `[/etc/passwd]` (FileVerb carveout) |
+| `cat README` | `[cat]` | `[README]` (FileVerb carveout preserves IsPath) |
+| `ls -la /tmp` | `[ls]` | `[-la, /tmp]` (FileVerb carveout) |
+| `chmod 755 file` | `[chmod]` | `[755, file]` (digit-start kills walk; FileVerb anyway) |
+| `echo hello` | `[echo, hello]` | `[]` (echo is not a FileVerb; over-extracts) |
+
+### 6.1.1 Consumer pattern-matching guidance
+
+`Clause.Verb` is a **convenience hint, not a security contract**.
+The parser deliberately over-extracts on bare-word args because no
+syntactic rule disambiguates `origin` (a branch name) from `worktree`
+(a subcommand verb) without per-CLI semantic knowledge — and we will
+not bake per-CLI knowledge into the parser.
+
+Consumers needing security-grade verb identification should pattern-prefix
+match against the raw token stream:
+
+> A command matches an approval pattern `P` if and only if the first
+> `len(P.verb_prefix)` tokens of the command equal `P.verb_prefix`.
+
+This punts depth choice to the consumer (via the pattern they author)
+and accommodates the parser's over-extraction transparently:
+
+- Pattern `git push *` (verb-prefix length 2) matches `git push origin
+ main` because the first two command tokens are `[git, push]`.
+- Pattern `kubectl get pods *` (verb-prefix length 3) matches
+ `kubectl get pods my-pod` because the first three tokens are
+ `[kubectl, get, pods]`.
+- Auto-proposed patterns for unknown commands should default to
+ the **full** extracted verb chain (greedy match), which is the
+ security-correct default: a subsequent variation re-prompts rather
+ than silently auto-grants. Operators wanting broader grants opt in
+ explicitly.
+
+False-negative (re-prompt) is recoverable. False-positive (silent
+destructive grant) is not. Narrow-by-default favors the recoverable
+failure mode.
### 6.2 CWD verbs
@@ -643,11 +711,12 @@ internal static readonly IReadOnlyDictionary>
> because `IReadOnlySet` is .NET 5+ only and the library
> multi-targets `netstandard2.0`. Internal-only — no public-API impact.
-> **Note (PR 3 → PR 4 follow-up):** the verb-chain probe must run *after*
-> the flag-with-value pair is consumed for invocations like
-> `git -C /repo log`. PR 3 ships the probe at token zero (the simpler
-> shape); PR 4 lands the flag-with-value-aware probe so `git -C /repo log`
-> produces `Verb.Tokens = ["git", "log"]` per SPEC §12's example.
+> **Note:** the verb-chain walk consumes flag-with-value pairs
+> transparently. For `git -C /repo log`, the walk consumes `-C /repo`
+> before evaluating the next token; `log` is then verb-like and extends
+> the chain, producing `Verb.Tokens = ["git", "log"]` per §12's example.
+> The same mechanic lets `git -C /repo worktree list` extract the full
+> 3-token chain per §6.1.
When a flag-with-value consumes the next token, the consumed token's
`IsPath` flag is set if the value is path-shaped (per the resolver in §8).
@@ -959,22 +1028,42 @@ ParsedCommand {
}
```
-### Multi-token verb
+### Multi-token verb (greedy over-extraction)
Input: `git push origin main`
```
Clauses = [
Clause {
- Verb = VerbChain { Tokens = ["git", "push"] },
+ Verb = VerbChain { Tokens = ["git", "push", "origin", "main"] },
+ Args = []
+ }
+]
+```
+
+The greedy heuristic absorbs `origin` and `main` because they're
+syntactically indistinguishable from subcommand verbs (lowercase
+identifiers, no path-shape). Consumers gating on `git push *` use
+pattern-prefix length 2 — see §6.1.1.
+
+Input: `freshdesk ticket list --status open`
+
+```
+Clauses = [
+ Clause {
+ Verb = VerbChain { Tokens = ["freshdesk", "ticket", "list"] },
Args = [
- Arg { Raw = "origin", Kind = Literal, IsPath = false },
- Arg { Raw = "main", Kind = Literal, IsPath = false }
+ Arg { Raw = "--status", Kind = Literal, IsFlag = true },
+ Arg { Raw = "open", Kind = Literal, IsPath = false }
]
}
]
```
+The walk stops at `--status` (a flag with no `FlagsWithValue` entry for
+`freshdesk`). The full subcommand stack is captured without requiring a
+curated table entry — the canonical benefit motivating the change.
+
### Compound with cd attribution
Input: `cd /target && cmd1 && cmd2 file.txt`
@@ -1235,10 +1324,14 @@ Adapt for ShellSyntaxTree:
### Versioning
-- **v0.1.0-alpha** — first publishable cut. Bash-only. Public API surface
- per §2 is locked; internal changes are free.
-- **v0.1.x** — additive changes (more verb table entries, more corpus,
- bug fixes).
+- **v0.1.x-alpha** — pre-release alpha cycle. Public API surface per §2 is
+ locked; internal data and behavior are subject to course-correction
+ while real-world feedback lands (e.g. v0.1.4-alpha replaces the
+ `BashArity` static table with the greedy verb-chain heuristic per
+ issue #27).
+- **v0.1.0** — first publishable non-alpha cut. Bash-only.
+- **v0.1.x** (post-0.1.0) — additive changes only (more verb table
+ entries, more corpus, bug fixes that don't shift parsed-AST shape).
- **v0.2.0** — first PowerShell parser implementation.
- **v1.0.0** — ready when at least one external consumer beyond Netclaw
ships against it without finding API gaps.
@@ -1275,7 +1368,7 @@ A natural order for the implementer:
but throw `NotImplementedException` on `Parse()`. Lock the surface
first.
4. **Implement BashLexer** (§5). Heavy unit tests on tokenization.
-5. **Implement BashArity / FILE / CWD verb tables** (§6) as static data.
+5. **Implement FILE / CWD verb tables and IsVerbLikeToken predicate** (§6) as static data + helper.
6. **Implement BashParser** (§4). One production at a time; unit-test
each.
7. **Implement Resolver** (§8). Unit-test each resolution rule.
diff --git a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
index c367987..1cd3306 100644
--- a/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
+++ b/src/ShellSyntaxTree/Internal/Bash/Parsing/BashCommandParser.cs
@@ -769,89 +769,96 @@ private static ClauseResult ParseClauseSegment(
return ClauseResult.Empty();
}
- // ---- Verb-chain extraction with flag-with-value awareness ----
- //
- // PR 4 follow-up to the PR 3 probe: a token like `git -C /repo log`
- // shouldn't truncate the verb chain at `-C`. We greedily consume
- // any flag-with-value pair owned by the tentative first verb
- // (`tokens[0]`) before probing arity. The consumed flag + value
- // pair stays in the segment for arg-extraction; only the verb
- // probe sees a "compressed" view of the segment.
- //
- // Locked interpretation #8 / SPEC §12: `git -C /repo log` →
- // Verb=["git", "log"], Args=[-C, /repo]. The flag and its value
- // appear in source order in the Args list, with /repo carrying
- // IsPath=true via the FlagValueIsPath table.
- var verbCandidateValues = new List(3);
- var verbCandidateIndices = new List(3);
+ // Verb-chain extraction per SPEC §6.1. The FileVerb carveout is
+ // load-bearing: downstream per-verb positional-arg classification
+ // depends on the verb chain staying 1 token for FILE verbs so
+ // bare-name targets like `cat README` still surface as Args with
+ // IsPath=true. Flag-with-value consumption must run *before* the
+ // carveout gate so `tar -C /repo` still attributes IsPath to /repo.
var consumedFlagValueIndices = new HashSet();
+ var verbTokens = new List(4);
+ var verbPositions = new HashSet();
- // Look up the would-be verb so we know which flags are
- // "owned" by it. We only honor the flag-with-value skip when the
- // first token is a known Word verb — quoted strings and opaque
- // substitutions don't carry verb identity.
- string? tentativeVerb = null;
- if (segment.Tokens.Count > 0
- && segment.Tokens[0].Kind == BashTokenKind.Word
- && !IsFlagWord(segment.Tokens[0]))
+ var firstToken = segment.Tokens[0];
+ string? firstVerb = null;
+ if (firstToken.Kind == BashTokenKind.Word && !IsFlagWord(firstToken))
{
- tentativeVerb = segment.Tokens[0].Value;
+ firstVerb = firstToken.Value;
+ verbTokens.Add(firstVerb);
+ verbPositions.Add(0);
+ }
+ else if (firstToken.Kind == BashTokenKind.QuotedString)
+ {
+ // Quoted command (`"git" push`): emit a 1-token chain and skip
+ // the walk. Bash semantics treat the quoted form as a verb
+ // identity carrier; remaining tokens are arg-list material.
+ verbTokens.Add(firstToken.Value);
+ verbPositions.Add(0);
}
- var hasFlagsTable = tentativeVerb is not null
- && BashVerbs.FlagsWithValue.TryGetValue(tentativeVerb, out _);
+ BashVerbs.FlagsWithValue.TryGetValue(firstVerb ?? string.Empty, out var flagsForVerb);
+ var fileVerbCarveout = firstVerb is not null
+ && BashVerbs.FileVerbs.Contains(firstVerb);
- for (var i = 0; i < segment.Tokens.Count && verbCandidateValues.Count < 3; i++)
+ if (firstVerb is not null)
{
- var t = segment.Tokens[i];
- if (t.Kind == BashTokenKind.Word || t.Kind == BashTokenKind.QuotedString)
+ for (var i = 1; i < segment.Tokens.Count; i++)
{
+ var t = segment.Tokens[i];
+ if (t.Kind != BashTokenKind.Word)
+ {
+ break;
+ }
+
if (IsFlagWord(t))
{
- // Skip-through case: this is a flag-with-value pair owned
- // by the tentative verb. Skip both the flag and its
- // immediate value and keep probing arity. Only Word
- // tokens qualify as flags (quoted "-x" stays literal).
- if (hasFlagsTable
- && tentativeVerb is not null
- && BashVerbs.FlagsWithValue[tentativeVerb].Contains(StripEqualsValue(t.Value))
- && i + 1 < segment.Tokens.Count
- && (segment.Tokens[i + 1].Kind == BashTokenKind.Word
- || segment.Tokens[i + 1].Kind == BashTokenKind.QuotedString))
+ if (flagsForVerb is null)
{
- // The two-token `-C /repo` form. Equals-form
- // `--git-dir=/repo` is a single token and never enters
- // this branch — but the verb-probe still ends at it
- // (next iteration sees IsFlagWord and breaks below).
- if (HasInlineEqualsValue(t.Value))
- {
- // `--flag=value` — single token. Don't consume
- // the next, and let the normal arg-extraction
- // path split on `=`. End the verb-probe here.
- break;
- }
+ break;
+ }
+
+ var eq = t.Value.IndexOf('=');
+ var flagKey = eq > 0 ? t.Value.Substring(0, eq) : t.Value;
+ if (!flagsForVerb.Contains(flagKey))
+ {
+ break;
+ }
- consumedFlagValueIndices.Add(i);
- consumedFlagValueIndices.Add(i + 1);
- i++; // skip the value too on the next loop step
- continue;
+ if (eq > 0)
+ {
+ // `--flag=value` is a single token; arg-extraction
+ // splits on `=`. Stop the walk here.
+ break;
+ }
+
+ if (i + 1 >= segment.Tokens.Count
+ || (segment.Tokens[i + 1].Kind != BashTokenKind.Word
+ && segment.Tokens[i + 1].Kind != BashTokenKind.QuotedString))
+ {
+ break;
}
- // Plain flag with no path-value to skip → stops the verb probe.
+ consumedFlagValueIndices.Add(i);
+ consumedFlagValueIndices.Add(i + 1);
+ i++;
+ continue;
+ }
+
+ if (fileVerbCarveout || !BashVerbs.IsVerbLikeToken(t))
+ {
break;
}
- verbCandidateValues.Add(t.Value);
- verbCandidateIndices.Add(i);
- continue;
+ verbTokens.Add(t.Value);
+ verbPositions.Add(i);
}
-
- break;
}
- if (verbCandidateValues.Count == 0)
+ if (verbTokens.Count == 0)
{
- // Redirect-only clause.
+ // Redirect-only clause: no verb identified (e.g. clause starts
+ // with an operator, opaque substitution, or a leading flag with
+ // no Word firstVerb).
ExtractRedirectsAndArgs(
segment.Tokens,
0,
@@ -879,40 +886,17 @@ private static ClauseResult ParseClauseSegment(
});
}
- var firstVerb = verbCandidateValues[0];
- if (BashVerbs.ControlFlowKeywords.Contains(firstVerb))
+ if (BashVerbs.ControlFlowKeywords.Contains(verbTokens[0]))
{
return ClauseResult.Fail(
- $"control-flow keyword '{firstVerb}' is not supported in v0.1");
- }
-
- var arity = BashVerbs.ProbeArity(verbCandidateValues, 0);
- if (arity <= 0)
- {
- arity = 1;
- }
-
- var verbTokens = new List(arity);
- for (var k = 0; k < arity; k++)
- {
- verbTokens.Add(verbCandidateValues[k]);
+ $"control-flow keyword '{verbTokens[0]}' is not supported in v0.1");
}
var verbChain = new VerbChain { Tokens = verbTokens };
- // The arg-extraction starts immediately after the last verb-chain
- // *position* in the original segment, so the consumed flag-value
- // pair (which sits *before* that position when it precedes the
- // verb-chain extension) still gets emitted as Args in source order.
- // Concretely: for `git -C /repo log`, the verb-chain positions are
- // 0 and 3; we walk all of segment.Tokens from position 0 and emit
- // -C, /repo as args while skipping the verb-position tokens.
- var argStart = 0;
- var verbPositions = new HashSet(verbCandidateIndices.GetRange(0, arity));
-
ExtractRedirectsAndArgs(
segment.Tokens,
- argStart,
+ 0,
source,
options,
verb: verbChain,
@@ -951,20 +935,6 @@ private static bool IsFlagWord(BashToken token)
return token.Value.Length > 0 && token.Value[0] == '-';
}
- ///
- /// For an equals-form flag like --output=file.txt, return the
- /// flag portion (--output) so the FlagsWithValue table lookup
- /// matches. For plain flags returns the input unchanged.
- ///
- private static string StripEqualsValue(string flag)
- {
- var eq = flag.IndexOf('=');
- return eq > 0 ? flag.Substring(0, eq) : flag;
- }
-
- private static bool HasInlineEqualsValue(string flag) =>
- flag.IndexOf('=') > 0;
-
// ---------------------------------------------------------------- args + redirects
///
diff --git a/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs b/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs
index 6d2ea26..d8c4ccb 100644
--- a/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs
+++ b/src/ShellSyntaxTree/Internal/Bash/Verbs/BashVerbs.cs
@@ -1,10 +1,11 @@
-// -----------------------------------------------------------------------
+// -----------------------------------------------------------------------
//
// Copyright (C) 2026 - 2026 Aaron Stannard
//
// -----------------------------------------------------------------------
using System;
using System.Collections.Generic;
+using ShellSyntaxTree.Internal.Bash.Lexing;
namespace ShellSyntaxTree.Internal.Bash.Verbs;
@@ -16,62 +17,6 @@ namespace ShellSyntaxTree.Internal.Bash.Verbs;
///
internal static class BashVerbs
{
- ///
- /// How many tokens form the verb chain for known commands. Defaults to
- /// 1 when not in the table. SPEC §6.1.
- ///
- ///
- ///
- /// Per the SPEC §6.1 implementation note, the parser must look up
- /// multi-token verbs by joining the first 1, 2, then 3 tokens and
- /// probing the table from longest to shortest. So
- /// docker compose up nginx first probes docker compose up
- /// (not in table), then docker compose (arity 3 — match). The
- /// matched key's value is the verb chain length, so e.g.
- /// docker compose's value of 3 means "consume three source
- /// tokens as the verb chain."
- ///
- ///
- /// The table is non-exhaustive. Verbs not listed default to a 1-token
- /// chain. Add entries as the corpus surfaces real commands.
- ///
- ///
- internal static readonly IReadOnlyDictionary BashArity =
- new Dictionary(StringComparer.OrdinalIgnoreCase)
- {
- // Two-token verbs.
- ["git"] = 2,
- ["dotnet"] = 2,
- ["npm"] = 2,
- ["yarn"] = 2,
- ["pnpm"] = 2,
- ["cargo"] = 2,
- ["go"] = 2,
- ["kubectl"] = 2,
- ["helm"] = 2,
- ["systemctl"] = 2,
- ["service"] = 2,
- ["pip"] = 2,
- ["pip3"] = 2,
- ["brew"] = 2,
- ["apt"] = 2,
- ["apt-get"] = 2,
- ["yum"] = 2,
- ["dnf"] = 2,
- ["pacman"] = 2,
- ["aws"] = 2,
- ["gcloud"] = 2,
- ["az"] = 2,
- ["docker"] = 2,
- ["docker-compose"] = 2,
- ["bun"] = 2,
- ["nuget"] = 2,
-
- // Three-token verbs.
- ["docker compose"] = 3,
- ["bun run"] = 3,
- };
-
///
/// Verbs whose first non-flag positional arg becomes the cwd for
/// subsequent clauses in the same compound. SPEC §6.2.
@@ -175,52 +120,56 @@ internal static readonly IReadOnlyDictionary>
};
///
- /// Resolve the verb-chain length for a token sequence at .
- /// Implements the longest-prefix probe from SPEC §6.1: try the first 3
- /// tokens, then 2, then 1, returning the matching arity.
+ /// SPEC §6.1: returns true when <paramref name="token"/> has the
+ /// shape of a CLI subcommand verb — a bare lowercase identifier
+ /// containing only ASCII letters, digits, hyphens, dots, and underscores.
+ /// Used to terminate the greedy verb-chain walk at the first token that
+ /// looks like a value rather than another subcommand.
///
- /// Source-order list of verb-candidate tokens.
- /// Index into where the verb chain begins.
- ///
- /// 1, 2, or 3 — the number of tokens that form the verb chain.
- /// Defaults to 1 when no match is found (per SPEC §6.1) or when fewer
- /// than the probed-prefix length tokens remain.
- ///
- internal static int ProbeArity(IReadOnlyList tokens, int start)
+ ///
+ /// A strict allow-list (leading [a-z], body [a-z0-9._-]) is used
+ /// instead of the more obvious negation-of-LooksLikePath because it stays
+ /// conservative for unknown shapes: a token like readme.md
+ /// satisfies the allow-list and would extend an unknown CLI's verb
+ /// chain, but the FileVerb carveout in BashCommandParser
+ /// short-circuits the common case (cat readme.md) before the
+ /// allow-list ever runs. Quoted strings are excluded so the user's
+ /// intent to treat bytes literally is preserved. The 64-char bound
+ /// is a defensive cap against pathological inputs.
+ ///
+ internal static bool IsVerbLikeToken(in BashToken token)
{
- var available = tokens.Count - start;
- if (available <= 0)
+ if (token.Kind != BashTokenKind.Word)
{
- return 0;
+ return false;
}
- // Three-token probe.
- if (available >= 3)
+ var v = token.Value;
+ if (v.Length == 0 || v.Length > 64)
{
- var key3 = tokens[start] + " " + tokens[start + 1] + " " + tokens[start + 2];
- if (BashArity.TryGetValue(key3, out var arity3) && arity3 == 3)
- {
- return 3;
- }
+ return false;
}
- // Two-token probe.
- if (available >= 2)
+ var first = v[0];
+ if (!(first >= 'a' && first <= 'z'))
{
- var key2 = tokens[start] + " " + tokens[start + 1];
- if (BashArity.TryGetValue(key2, out var arity2))
- {
- return Math.Min(arity2, available);
- }
+ return false;
}
- // Single-token probe (default arity 1).
- if (BashArity.TryGetValue(tokens[start], out var arity1))
+ for (var i = 1; i < v.Length; i++)
{
- return Math.Min(arity1, available);
+ var c = v[i];
+ var ok =
+ (c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '.' || c == '_';
+ if (!ok)
+ {
+ return false;
+ }
}
- return 1;
+ return true;
}
///
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json
index 8dae71c..42868b2 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/04_echo_hello.json
@@ -1,19 +1,18 @@
{
- "name": "Simple verb: echo hello",
+ "name": "Simple verb: echo hello (greedy walk absorbs bare-word arg)",
"input": "echo hello",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "hello", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "hello"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": false
}
]
- }
+ },
+ "notes": "Issue #27: `echo` is not a FILE verb, so the greedy heuristic walks verb-like tokens past it. `hello` matches the verb-like predicate (lowercase identifier, no path-shape) and is absorbed into the verb chain. Over-extraction is documented and acceptable; consumers wanting to gate on `echo *` use pattern-prefix matching against the first token."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json
index 7240c59..962bcbe 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/100_bash_c_nested_depth_3.json
@@ -1,20 +1,18 @@
{
- "name": "bash -c nested depth 3: three wrappers consumed",
+ "name": "bash -c nested depth 3 with greedy verb chain: three wrappers consumed",
"input": "bash -c \"bash -c \\\"bash -c \\\\\\\"echo deep\\\\\\\"\\\"\"",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "deep", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "deep"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": true
}
]
},
- "notes": "Three nested bash -c wrappers all consumed; innermost echo surfaces flat. Depth 3 is under the cap of 5; the BashCommandParserTests cover the depth-6 overflow case."
+ "notes": "Three nested bash -c wrappers all consumed; innermost echo surfaces flat. Depth 3 is under the cap of 5; the BashCommandParserTests cover the depth-6 overflow case. Issue #27: `deep` absorbed into the verb chain."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json
index faf08a7..90d2148 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/11_git_push_origin_main.json
@@ -1,21 +1,18 @@
{
- "name": "Multi-token verb: git push origin main",
+ "name": "Greedy verb chain: git push origin main (over-extracts on bare-word remote+branch)",
"input": "git push origin main",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["git", "push"],
- "args": [
- { "raw": "origin", "kind": "Literal", "isPath": false },
- { "raw": "main", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["git", "push", "origin", "main"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": false
}
]
},
- "notes": "git arity=2 → first two source tokens become the verb chain."
+ "notes": "Issue #27: documented over-extraction. `origin` and `main` are syntactically indistinguishable from subcommand verbs without per-CLI knowledge. Consumers gating on `git push *` use pattern-prefix length (2) rather than relying on the parser to guess depth."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json
index 70ee577..f1a0747 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/130_netclaw_repro_leading_comment_pipeline.json
@@ -1,16 +1,15 @@
{
- "name": "Netclaw repro: leading comment + git ... | awk | tr | sort pipeline",
+ "name": "Netclaw repro: leading comment + git worktree list | awk | tr | sort pipeline",
"input": "# Extract all unique branch names from worktrees\ngit -C /home/user/repos/sample-repo worktree list | awk '{print $NF}' | tr -d '[]' | sort -u",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["git", "worktree"],
+ "verb": ["git", "worktree", "list"],
"args": [
{ "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true },
- { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" },
- { "raw": "list", "kind": "Literal", "isPath": false, "isFlag": false, "resolved": "__NULL__" }
+ { "raw": "/home/user/repos/sample-repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/home/user/repos/sample-repo" }
],
"redirects": [],
"isSubshell": false,
@@ -49,5 +48,5 @@
}
]
},
- "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14 (user repo path → /home/user/repos/sample-repo). Without the v0.1.3 comment-skip fix this entry parsed Clause 0 with verb `[#, Extract]` — the failure mode that broke approval-prompt rendering for agents authoring scripts with explanatory comments."
+ "notes": "Issue #25 — original Netclaw repro, paths sanitized per SPEC §14. Without the v0.1.3 comment-skip fix, Clause 0's verb parsed as `[#, Extract]`. Updated for issue #27 / v0.1.4-alpha: the greedy heuristic now walks past the consumed `-C /home/user/repos/sample-repo` flag-value pair and captures `worktree list` as part of the verb chain instead of leaving `list` as a stranded positional arg."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json
new file mode 100644
index 0000000..de85b9f
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/132_freshdesk_ticket_list_status_open.json
@@ -0,0 +1,21 @@
+{
+ "name": "Greedy verb chain: freshdesk ticket list --status open (unknown CLI)",
+ "input": "freshdesk ticket list --status open",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["freshdesk", "ticket", "list"],
+ "args": [
+ { "raw": "--status", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "open", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27 headline: unknown/private CLIs (freshdesk, internal tools) used to truncate to a 1-token verb chain. The greedy heuristic walks verb-like Word tokens (leading `[a-z]`, body `[a-z0-9._-]`) and stops at the `--status` flag, so the canonical subcommand stack `freshdesk ticket list` extracts fully without requiring a curated table entry."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json
new file mode 100644
index 0000000..aaa9899
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/133_git_C_repo_worktree_list_porcelain.json
@@ -0,0 +1,22 @@
+{
+ "name": "Greedy verb chain past consumed flag-value: git -C /repo worktree list --porcelain",
+ "input": "git -C /repo worktree list --porcelain",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["git", "worktree", "list"],
+ "args": [
+ { "raw": "-C", "kind": "Literal", "isPath": false, "isFlag": true },
+ { "raw": "/repo", "kind": "Literal", "isPath": true, "isFlag": false, "resolved": "/repo" },
+ { "raw": "--porcelain", "kind": "Literal", "isPath": false, "isFlag": true }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27: the heuristic must walk PAST a consumed flag-with-value pair to extract trailing verb-like tokens. `-C /repo` is consumed via FlagsWithValue[git]; `worktree` and `list` are then both verb-like; `--porcelain` stops the walk. /repo carries IsPath=true via the FlagValueIsPath mechanism."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json
new file mode 100644
index 0000000..41f38aa
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/134_kubectl_get_pods.json
@@ -0,0 +1,18 @@
+{
+ "name": "Greedy verb chain: kubectl get pods",
+ "input": "kubectl get pods",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["kubectl", "get", "pods"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27: kubectl resource-type pluralizations (`pods`, `services`, `deployments`) used to be silently truncated under the BashArity=2 default for kubectl. The greedy heuristic captures the full subcommand stack."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json
new file mode 100644
index 0000000..3175589
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/135_kubectl_get_pods_my_pod.json
@@ -0,0 +1,18 @@
+{
+ "name": "Greedy verb chain over-extracts on bare-word arg: kubectl get pods my-pod",
+ "input": "kubectl get pods my-pod",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["kubectl", "get", "pods", "my-pod"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27: documented over-extraction. `my-pod` is syntactically indistinguishable from a subcommand verb (lowercase + hyphen, no path-shape). Consumers needing security-grade matching use pattern-prefix length, not parser-level depth guessing — see SPEC §6.1.1."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json
new file mode 100644
index 0000000..a6dcb23
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/136_aws_s3_cp_src_dst.json
@@ -0,0 +1,18 @@
+{
+ "name": "Greedy verb chain over-extracts on bare-word path args: aws s3 cp src dst",
+ "input": "aws s3 cp src dst",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["aws", "s3", "cp", "src", "dst"],
+ "args": [],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27: documented over-extraction. `src` and `dst` are bare-word path-like args but lack path-shape (no slash, no extension, no tilde). The parser cannot syntactically distinguish them from subcommand verbs. Consumers that need to recover the path semantics walk the token stream directly; for verb-pattern gates, prefix-length matching against the consumer's pattern is the correct approach."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json
new file mode 100644
index 0000000..e46e76e
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/137_dotnet_ef_migrations_add_initial.json
@@ -0,0 +1,20 @@
+{
+ "name": "Greedy verb chain stops at uppercase: dotnet ef migrations add InitialCreate",
+ "input": "dotnet ef migrations add InitialCreate",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["dotnet", "ef", "migrations", "add"],
+ "args": [
+ { "raw": "InitialCreate", "kind": "Literal", "isPath": false, "resolved": "__NULL__" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27: the IsVerbLikeToken predicate requires a lowercase ASCII letter as the first character (mirroring real CLI subcommand convention). User-named identifiers like migration names (`InitialCreate`) naturally fall out of the verb chain because they start with an uppercase letter. This is an example of the heuristic happening to do the right thing — though it's incidental, not a guarantee."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_fileverb_carveout.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_fileverb_carveout.json
new file mode 100644
index 0000000..f321354
--- /dev/null
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/138_cat_bare_name_fileverb_carveout.json
@@ -0,0 +1,20 @@
+{
+ "name": "FileVerb 1-token carveout: cat README (bare-name preserves IsPath)",
+ "input": "cat README",
+ "expected": {
+ "isUnparseable": false,
+ "clauses": [
+ {
+ "operator": "None",
+ "verb": ["cat"],
+ "args": [
+ { "raw": "README", "kind": "Literal", "isPath": true, "resolved": "/work/README" }
+ ],
+ "redirects": [],
+ "isSubshell": false,
+ "isBashCWrapped": false
+ }
+ ]
+ },
+ "notes": "Issue #27 / FileVerb carveout: for known FILE verbs (`cat`, `ls`, `bash`, `cd`, `chmod`, `grep`, `find`, …) the heuristic stops after the first verb token so per-verb positional-arg classification still fires. `README` itself would also stop the walk under the strict predicate (uppercase first letter), but consider `cat hello` or `bash myscript` — bare-name targets whose shape *would* be verb-like. The carveout is what guarantees those targets remain as Args with IsPath=true, preserving the zone-gate semantics downstream consumers depend on."
+}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json
index 80d0a3a..12b6d3c 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/13_git_checkout_dev.json
@@ -1,19 +1,18 @@
{
- "name": "Multi-token verb: git checkout dev",
+ "name": "Greedy verb chain: git checkout dev (over-extracts on branch name)",
"input": "git checkout dev",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["git", "checkout"],
- "args": [
- { "raw": "dev", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["git", "checkout", "dev"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": false
}
]
- }
+ },
+ "notes": "Issue #27: documented over-extraction. `dev` is a branch name but syntactically matches the verb-like predicate (lowercase identifier). Consumers gating on `git checkout *` use pattern-prefix length (2)."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json
index 75cc9c8..bd16602 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/17_docker_run_nginx.json
@@ -1,19 +1,18 @@
{
- "name": "Multi-token verb: docker run nginx",
+ "name": "Greedy verb chain: docker run nginx (over-extracts on image name)",
"input": "docker run nginx",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["docker", "run"],
- "args": [
- { "raw": "nginx", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["docker", "run", "nginx"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": false
}
]
- }
+ },
+ "notes": "Issue #27: documented over-extraction. `nginx` is a Docker image name but syntactically matches the verb-like predicate. A registry-qualified image (e.g. `registry.example.com/ns/nginx:1.25`) would stop the walk because `/` and `:` are outside the verb-like character allow-list. Consumers gating on `docker run *` use pattern-prefix length (2)."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json
index 50a12dd..ad6b646 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/27_make_install.json
@@ -1,5 +1,5 @@
{
- "name": "Compound: make && make install",
+ "name": "Compound with greedy verb chain: make && make install",
"input": "make && make install",
"expected": {
"isUnparseable": false,
@@ -14,14 +14,13 @@
},
{
"operator": "AndIf",
- "verb": ["make"],
- "args": [
- { "raw": "install", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["make", "install"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": false
}
]
- }
+ },
+ "notes": "Issue #27: `install` is a verb-like lowercase identifier so the greedy heuristic absorbs it into the second clause's verb chain. `make` is not a FILE verb."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json
index f114505..00b5730 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/45_echo_append_log.json
@@ -1,15 +1,13 @@
{
- "name": "Append redirect: echo hi >> log.txt",
+ "name": "Append redirect with greedy verb chain: echo hi >> log.txt",
"input": "echo hi >> log.txt",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "hi", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "hi"],
+ "args": [],
"redirects": [
{ "direction": "Append", "target": "/work/log.txt" }
],
@@ -18,5 +16,5 @@
}
]
},
- "notes": "PR 4: redirect target resolves; `echo` is not a FileVerb so `hi` stays IsPath=false."
+ "notes": "Issue #27: `echo` is not a FILE verb so the greedy heuristic absorbs `hi` into the verb chain. Redirect operators (`>>`) stop the walk regardless of the absorbed-arg shape; the redirect target still resolves to /work/log.txt via the existing resolver path."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json
index 0abdba0..6183698 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/84_subshell_nested.json
@@ -1,20 +1,18 @@
{
- "name": "Nested subshells: ((echo deep))",
+ "name": "Nested subshells with greedy verb chain: ((echo deep))",
"input": "((echo deep))",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "deep", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "deep"],
+ "args": [],
"redirects": [],
"isSubshell": true,
"isBashCWrapped": false
}
]
},
- "notes": "Two levels of parens; the inner clause is still IsSubshell=true."
+ "notes": "Two levels of parens; the inner clause is still IsSubshell=true. Issue #27: `deep` is verb-like and absorbed into the verb chain because `echo` is not a FILE verb."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json
index 17da9e1..5d2da93 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/91_bash_c_simple.json
@@ -1,20 +1,18 @@
{
- "name": "bash -c simple: bash -c \"echo hi\"",
+ "name": "bash -c simple with greedy verb chain: bash -c \"echo hi\"",
"input": "bash -c \"echo hi\"",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "hi", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "hi"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": true
}
]
},
- "notes": "bash -c wrapper consumed; inner echo surfaces with IsBashCWrapped=true."
+ "notes": "bash -c wrapper consumed; inner echo surfaces with IsBashCWrapped=true. Issue #27: `hi` is verb-like and absorbed into the inner clause's verb chain."
}
diff --git a/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json b/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json
index caaace3..40d2ce7 100644
--- a/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json
+++ b/tests/ShellSyntaxTree.Tests/Corpus/bash/96_bash_c_nested_depth_2.json
@@ -1,20 +1,18 @@
{
- "name": "bash -c nested depth 2: bash -c \"bash -c \\\"echo hi\\\"\"",
+ "name": "bash -c nested depth 2 with greedy verb chain: bash -c \"bash -c \\\"echo hi\\\"\"",
"input": "bash -c \"bash -c \\\"echo hi\\\"\"",
"expected": {
"isUnparseable": false,
"clauses": [
{
"operator": "None",
- "verb": ["echo"],
- "args": [
- { "raw": "hi", "kind": "Literal", "isPath": false }
- ],
+ "verb": ["echo", "hi"],
+ "args": [],
"redirects": [],
"isSubshell": false,
"isBashCWrapped": true
}
]
},
- "notes": "Two wrappers consumed; the innermost echo surfaces flat. Depth 2 well under the cap of 5."
+ "notes": "Two wrappers consumed; the innermost echo surfaces flat. Depth 2 well under the cap of 5. Issue #27: `hi` absorbed into the verb chain."
}
diff --git a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
index 02c4a01..f4e16ec 100644
--- a/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
+++ b/tests/ShellSyntaxTree.Tests/Parsing/BashCommandParserTests.cs
@@ -95,13 +95,17 @@ public void Single_quoted_arg_preserves_quotes_in_raw()
// ---------------- Multi-token verb chains ----------------
[Fact]
- public void Two_token_verb_git_push()
+ public void Greedy_verb_chain_absorbs_bare_word_args_git_push_origin_main()
{
+ // Issue #27: documented over-extraction. `origin` and `main` are
+ // syntactically indistinguishable from subcommand verbs (lowercase
+ // identifiers, no path-shape). Consumers gating on `git push *`
+ // use pattern-prefix length (2) — see SPEC §6.1.1.
var result = Parse("git push origin main");
Assert.False(result.IsUnparseable);
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "git", "push" }, clause.Verb.Tokens);
- Assert.Equal(new[] { "origin", "main" }, clause.Args.Select(a => a.Raw).ToArray());
+ Assert.Equal(new[] { "git", "push", "origin", "main" }, clause.Verb.Tokens);
+ Assert.Empty(clause.Args);
}
[Fact]
@@ -114,13 +118,18 @@ public void Two_token_verb_dotnet_test()
}
[Fact]
- public void Three_token_verb_docker_compose_up()
+ public void Greedy_verb_chain_walks_through_docker_compose_up_nginx()
{
+ // Issue #27: `docker compose up nginx` over-extracts because `nginx`
+ // is a verb-like lowercase identifier. The previous BashArity
+ // approach capped the chain at 3 tokens for `docker compose`; the
+ // greedy heuristic walks until a non-verb-like token. Consumers
+ // gating on `docker compose up *` use pattern-prefix length (3).
var result = Parse("docker compose up nginx");
Assert.False(result.IsUnparseable);
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "docker", "compose", "up" }, clause.Verb.Tokens);
- Assert.Equal(new[] { "nginx" }, clause.Args.Select(a => a.Raw).ToArray());
+ Assert.Equal(new[] { "docker", "compose", "up", "nginx" }, clause.Verb.Tokens);
+ Assert.Empty(clause.Args);
}
[Fact]
@@ -134,13 +143,17 @@ public void Three_token_verb_bun_run_my_script()
}
[Fact]
- public void Two_token_verb_docker_run()
+ public void Greedy_verb_chain_absorbs_docker_run_image_name()
{
+ // Issue #27: `nginx` is a docker image name but syntactically a
+ // verb-like identifier. A registry-qualified image like
+ // `registry.example.com/ns/nginx:1.25` would stop the walk via
+ // path-shape rejection.
var result = Parse("docker run nginx");
Assert.False(result.IsUnparseable);
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "docker", "run" }, clause.Verb.Tokens);
- Assert.Equal(new[] { "nginx" }, clause.Args.Select(a => a.Raw).ToArray());
+ Assert.Equal(new[] { "docker", "run", "nginx" }, clause.Verb.Tokens);
+ Assert.Empty(clause.Args);
}
[Fact]
@@ -153,12 +166,17 @@ public void Verb_chain_capped_by_available_tokens()
}
[Fact]
- public void Default_arity_when_verb_unknown()
+ public void Unknown_verb_with_bare_word_args_extracts_full_chain()
{
+ // Issue #27: under the greedy heuristic, unknown verbs (not in any
+ // table) extract the full chain of consecutive verb-like tokens.
+ // This is strictly better than the old default arity of 1, which is
+ // what the issue asks for — `freshdesk ticket list` and similar
+ // private CLIs surface their full subcommand stack without curation.
var result = Parse("totally-unknown-verb foo bar");
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "totally-unknown-verb" }, clause.Verb.Tokens);
- Assert.Equal(new[] { "foo", "bar" }, clause.Args.Select(a => a.Raw).ToArray());
+ Assert.Equal(new[] { "totally-unknown-verb", "foo", "bar" }, clause.Verb.Tokens);
+ Assert.Empty(clause.Args);
}
[Fact]
@@ -481,7 +499,9 @@ public void Sh_c_recurses_same_as_bash_c()
var result = Parse("sh -c \"echo hi\"");
Assert.False(result.IsUnparseable);
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "echo" }, clause.Verb.Tokens);
+ // Issue #27: `hi` is verb-like and absorbed into the inner clause's
+ // verb chain (echo is not a FILE verb).
+ Assert.Equal(new[] { "echo", "hi" }, clause.Verb.Tokens);
Assert.True(clause.IsBashCWrapped);
}
@@ -526,7 +546,8 @@ public void Bash_c_nested_depth_2_parses()
var result = Parse("bash -c \"bash -c \\\"echo hi\\\"\"");
Assert.False(result.IsUnparseable);
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "echo" }, clause.Verb.Tokens);
+ // Issue #27: `hi` absorbed into the verb chain.
+ Assert.Equal(new[] { "echo", "hi" }, clause.Verb.Tokens);
Assert.True(clause.IsBashCWrapped);
}
@@ -587,14 +608,14 @@ public void Leading_comment_does_not_pollute_verb_chain()
{
// The exact failure mode from issue #25: a leading explanatory
// comment was being parsed as the verb of the next clause,
- // surfacing as `# Extract` in downstream approval prompts.
- // BashArity collapses `git worktree` to a 2-token verb in v0.1
- // (the deeper `git worktree list` subcommand is not in the table,
- // so `list` lands as a positional arg — see SPEC §6.1).
+ // surfacing as `# Extract` in downstream approval prompts. Issue
+ // #27 follow-up: the greedy verb-chain heuristic now captures
+ // `worktree` and `list` together — the parser used to truncate
+ // the chain at the BashArity=2 default for git.
var result = Parse("# Extract worktree branches\ngit worktree list");
var clause = Assert.Single(result.Clauses);
- Assert.Equal(new[] { "git", "worktree" }, clause.Verb.Tokens);
- Assert.Equal("list", clause.Args[0].Raw);
+ Assert.Equal(new[] { "git", "worktree", "list" }, clause.Verb.Tokens);
+ Assert.Empty(clause.Args);
Assert.False(result.IsUnparseable);
}
@@ -1051,11 +1072,15 @@ public void Docker_volume_value_is_not_a_path_per_locked_interpretation_8()
{
var result = Parse("docker run -v /host:/container nginx");
var clause = Assert.Single(result.Clauses);
- // verb chain probe: -v /host:/container should be consumed; verb = ["docker", "run"]
- Assert.Equal(new[] { "docker", "run" }, clause.Verb.Tokens);
- Assert.Equal(3, clause.Args.Count);
+ // Issue #27 follow-up: the verb-chain walker consumes `-v
+ // /host:/container` as a flag-with-value pair, then `nginx`
+ // (verb-like) extends the chain. Verb = ["docker", "run", "nginx"];
+ // the volume-mount value still surfaces as an arg with IsPath=false
+ // per locked interpretation #8 (colon-joined target is not a path).
+ Assert.Equal(new[] { "docker", "run", "nginx" }, clause.Verb.Tokens);
+ Assert.Equal(2, clause.Args.Count);
Assert.Equal("-v", clause.Args[0].Raw);
- Assert.False(clause.Args[1].IsPath); // colon-joined volume mount, NOT a path
+ Assert.False(clause.Args[1].IsPath);
Assert.Equal("/host:/container", clause.Args[1].Raw);
}
}