diff --git a/.gitignore b/.gitignore index 989f27be..9d0fcbe9 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,6 @@ coverage/ .agent-relay.stale.*/ .relay/ .ricky/ -.trajectories/ .workflow-artifacts/ .factory/ .mcp.json diff --git a/.trajectories/completed/2026-05/traj_2f8sfg3akql2.json b/.trajectories/completed/2026-05/traj_2f8sfg3akql2.json new file mode 100644 index 00000000..05c42988 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_2f8sfg3akql2.json @@ -0,0 +1,65 @@ +{ + "id": "traj_2f8sfg3akql2", + "version": 1, + "task": { + "title": "Harden Ricky workflow never-fail coverage" + }, + "status": "completed", + "startedAt": "2026-05-08T14:46:04.652Z", + "completedAt": "2026-05-08T14:54:06.222Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-08T14:48:29.461Z" + } + ], + "chapters": [ + { + "id": "chap_e5f3y6nkmd4t", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-08T14:48:29.461Z", + "endedAt": "2026-05-08T14:54:06.222Z", + "events": [ + { + "ts": 1778251709462, + "type": "decision", + "content": "Harden generated workflows with repair-aware retry instead of fail-fast: Harden generated workflows with repair-aware retry instead of fail-fast", + "raw": { + "question": "Harden generated workflows with repair-aware retry instead of fail-fast", + "chosen": "Harden generated workflows with repair-aware retry instead of fail-fast", + "alternatives": [], + "reasoning": "Relay now supports deterministic gate repair agents; Ricky should emit workflows that opt into that behavior for ordinary, master, and child workflows so generated workflows do not terminate on repairable checks." 
+ }, + "significance": "high" + }, + { + "ts": 1778252041693, + "type": "reflection", + "content": "Ricky generator now emits repair-aware retry for ordinary, master, and child workflows; pipeline tests cover code, doc, low-risk, and master shapes; local auto-fix and full suite are green after syncing package proof script allowlist.", + "raw": { + "confidence": 0.9 + }, + "significance": "high", + "tags": [ + "confidence:0.9" + ] + } + ] + } + ], + "retrospective": { + "summary": "Hardened Ricky workflow generation so generated workflows opt into repair-aware retry with repairAgent/repairRetries, added validation and generation tests across code, doc, low-risk, master, and child workflow shapes, kept local auto-fix coverage green, and synced package proof/docs for existing eval scripts so the full suite passes.", + "approach": "Standard approach", + "confidence": 0.92 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/khaliqgant/Projects/AgentWorkforce/ricky-workflow-never-fail", + "tags": [], + "_trace": { + "startRef": "33549385e357061346be25e7be0c64d49abdfef9", + "endRef": "33549385e357061346be25e7be0c64d49abdfef9" + } +} \ No newline at end of file diff --git a/.trajectories/completed/2026-05/traj_2f8sfg3akql2.md b/.trajectories/completed/2026-05/traj_2f8sfg3akql2.md new file mode 100644 index 00000000..9ee619c6 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_2f8sfg3akql2.md @@ -0,0 +1,32 @@ +# Trajectory: Harden Ricky workflow never-fail coverage + +> **Status:** ✅ Completed +> **Confidence:** 92% +> **Started:** May 8, 2026 at 04:46 PM +> **Completed:** May 8, 2026 at 04:54 PM + +--- + +## Summary + +Hardened Ricky workflow generation so generated workflows opt into repair-aware retry with repairAgent/repairRetries, added validation and generation tests across code, doc, low-risk, master, and child workflow shapes, kept local auto-fix coverage green, and synced package proof/docs for existing eval scripts so the full suite passes. 
+ +**Approach:** Standard approach + +--- + +## Key Decisions + +### Harden generated workflows with repair-aware retry instead of fail-fast +- **Chose:** Harden generated workflows with repair-aware retry instead of fail-fast +- **Reasoning:** Relay now supports deterministic gate repair agents; Ricky should emit workflows that opt into that behavior for ordinary, master, and child workflows so generated workflows do not terminate on repairable checks. + +--- + +## Chapters + +### 1. Work +*Agent: default* + +- Harden generated workflows with repair-aware retry instead of fail-fast: Harden generated workflows with repair-aware retry instead of fail-fast +- Ricky generator now emits repair-aware retry for ordinary, master, and child workflows; pipeline tests cover code, doc, low-risk, and master shapes; local auto-fix and full suite are green after syncing package proof script allowlist. diff --git a/.trajectories/completed/2026-05/traj_b77ch3xz99r8.json b/.trajectories/completed/2026-05/traj_b77ch3xz99r8.json new file mode 100644 index 00000000..546a281e --- /dev/null +++ b/.trajectories/completed/2026-05/traj_b77ch3xz99r8.json @@ -0,0 +1,98 @@ +{ + "id": "traj_b77ch3xz99r8", + "version": 1, + "task": { + "title": "Avoid master child workflow false terminal failures" + }, + "status": "completed", + "startedAt": "2026-05-08T15:04:46.241Z", + "completedAt": "2026-05-08T15:12:41.150Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-08T15:12:30.504Z" + } + ], + "chapters": [ + { + "id": "chap_c2qwv9dhlskk", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-08T15:12:30.504Z", + "endedAt": "2026-05-08T15:12:41.150Z", + "events": [ + { + "ts": 1778253150505, + "type": "decision", + "content": "Keep nested Ricky child auto-fix enabled from master workflows: Keep nested Ricky child auto-fix enabled from master workflows", + "raw": { + "question": "Keep nested Ricky child auto-fix enabled from master workflows", + 
"chosen": "Keep nested Ricky child auto-fix enabled from master workflows", + "alternatives": [], + "reasoning": "The cloud failure showed master-generated child runs using --no-auto-fix, which prevented the child workflow from repairing its own deterministic blocker before the master failed." + }, + "significance": "high" + }, + { + "ts": 1778253150534, + "type": "decision", + "content": "Make generated child final validation non-terminal: Make generated child final validation non-terminal", + "raw": { + "question": "Make generated child final validation non-terminal", + "chosen": "Make generated child final validation non-terminal", + "alternatives": [], + "reasoning": "Parallel child workflows can observe temporary integrated-repo typecheck failures caused by sibling slices; the master final validation should own the hard integrated check after all child signoffs." + }, + "significance": "high" + }, + { + "ts": 1778253150554, + "type": "decision", + "content": "Retry after workflow repair provider exceptions: Retry after workflow repair provider exceptions", + "raw": { + "question": "Retry after workflow repair provider exceptions", + "chosen": "Retry after workflow repair provider exceptions", + "alternatives": [], + "reasoning": "Malformed or missing structured artifacts from the repair persona should consume a bounded retry and resume the workflow, not stop the auto-fix loop after the first attempt." 
+ }, + "significance": "high" + }, + { + "ts": 1778253156553, + "type": "reflection", + "content": "Generated workflows now keep repair loops active at both master and child layers, and legacy artifacts get deterministic repair for the old no-auto-fix and hard child validation patterns.", + "raw": { + "focalPoints": [ + "nested-auto-fix", + "parallel-validation", + "repair-provider-resilience" + ], + "adjustments": "Added regression coverage for generation, deterministic legacy repair, and provider exception retry.", + "confidence": 0.9 + }, + "significance": "high", + "tags": [ + "focal:nested-auto-fix", + "focal:parallel-validation", + "focal:repair-provider-resilience", + "confidence:0.9" + ] + } + ] + } + ], + "retrospective": { + "summary": "Updated Ricky workflow generation and auto-fix resilience so generated master workflows keep nested child auto-fix enabled, generated child final validation no longer terminally blocks on parallel sibling repo state, legacy generated artifacts can be deterministically repaired, and repair-provider exceptions trigger bounded retry/resume instead of immediate failure.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/khaliqgant/Projects/AgentWorkforce/ricky-workflow-never-fail", + "tags": [], + "_trace": { + "startRef": "800ddc84f3a68aa0c7edc6cdcfd513c612d8adf5", + "endRef": "800ddc84f3a68aa0c7edc6cdcfd513c612d8adf5" + } +} \ No newline at end of file diff --git a/.trajectories/completed/2026-05/traj_b77ch3xz99r8.md b/.trajectories/completed/2026-05/traj_b77ch3xz99r8.md new file mode 100644 index 00000000..cb157da0 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_b77ch3xz99r8.md @@ -0,0 +1,42 @@ +# Trajectory: Avoid master child workflow false terminal failures + +> **Status:** ✅ Completed +> **Confidence:** 90% +> **Started:** May 8, 2026 at 05:04 PM +> **Completed:** May 8, 2026 at 05:12 PM + +--- + +## Summary + +Updated Ricky workflow 
generation and auto-fix resilience so generated master workflows keep nested child auto-fix enabled, generated child final validation no longer terminally blocks on parallel sibling repo state, legacy generated artifacts can be deterministically repaired, and repair-provider exceptions trigger bounded retry/resume instead of immediate failure. + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Keep nested Ricky child auto-fix enabled from master workflows +- **Chose:** Keep nested Ricky child auto-fix enabled from master workflows +- **Reasoning:** The cloud failure showed master-generated child runs using --no-auto-fix, which prevented the child workflow from repairing its own deterministic blocker before the master failed. + +### Make generated child final validation non-terminal +- **Chose:** Make generated child final validation non-terminal +- **Reasoning:** Parallel child workflows can observe temporary integrated-repo typecheck failures caused by sibling slices; the master final validation should own the hard integrated check after all child signoffs. + +### Retry after workflow repair provider exceptions +- **Chose:** Retry after workflow repair provider exceptions +- **Reasoning:** Malformed or missing structured artifacts from the repair persona should consume a bounded retry and resume the workflow, not stop the auto-fix loop after the first attempt. + +--- + +## Chapters + +### 1. Work +*Agent: default* + +- Keep nested Ricky child auto-fix enabled from master workflows: Keep nested Ricky child auto-fix enabled from master workflows +- Make generated child final validation non-terminal: Make generated child final validation non-terminal +- Retry after workflow repair provider exceptions: Retry after workflow repair provider exceptions +- Generated workflows now keep repair loops active at both master and child layers, and legacy artifacts get deterministic repair for the old no-auto-fix and hard child validation patterns. 
diff --git a/.trajectories/index.json b/.trajectories/index.json new file mode 100644 index 00000000..1a383fee --- /dev/null +++ b/.trajectories/index.json @@ -0,0 +1,20 @@ +{ + "version": 1, + "lastUpdated": "2026-05-08T15:12:41.286Z", + "trajectories": { + "traj_2f8sfg3akql2": { + "title": "Harden Ricky workflow never-fail coverage", + "status": "completed", + "startedAt": "2026-05-08T14:46:04.652Z", + "completedAt": "2026-05-08T14:54:06.222Z", + "path": "/Users/khaliqgant/Projects/AgentWorkforce/ricky-workflow-never-fail/.trajectories/completed/2026-05/traj_2f8sfg3akql2.json" + }, + "traj_b77ch3xz99r8": { + "title": "Avoid master child workflow false terminal failures", + "status": "completed", + "startedAt": "2026-05-08T15:04:46.241Z", + "completedAt": "2026-05-08T15:12:41.150Z", + "path": "/Users/khaliqgant/Projects/AgentWorkforce/ricky-workflow-never-fail/.trajectories/completed/2026-05/traj_b77ch3xz99r8.json" + } + } +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 8545e100..c7ae292d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -192,6 +192,7 @@ Every agent working in this repo must follow these rules when authoring, reviewi - **Review stage:** Every significant workflow must include review by an agent distinct from the writer when possible. Prefer `writer=codex` with `reviewer=claude`, `writer=claude` with `reviewer=codex`, and both reviewers for critical workflows. Review artifacts for significant workflows must be written under `.workflow-artifacts/`. - **80-to-100 validation:** Serious implementation workflows must use a soft-gate, fix, hard-gate loop. The fix loop must include a post-fix re-review on the fixed state before final signoff. Passing compile or typecheck alone is not enough. - **Commit boundaries:** Do not run `git commit` or `git push` from agent steps unless the workflow explicitly owns that boundary and documents the expected files. 
Each workflow must state the expected branch naming pattern and whether PR creation is in or out of scope. Default branch names should follow `ricky/-` unless a nearby spec declares a narrower pattern. +- **PR creation primitive:** When PR creation is in scope, workflows MUST use the agent-relay GitHub primitive (`@agent-relay/github-primitive`) rather than shelling out to `gh pr create`. Use `createGitHubStep` from `@agent-relay/github-primitive/workflow-step` with `action: 'createPR'` for declarative workflow steps, or `GitHubClient.create({ runtime: 'auto' })` with `client.createPR(...)` for imperative use. The primitive handles runtime selection (local `gh` CLI vs Nango cloud vs relay-cloud proxy) and produces structured outputs the orchestrator can chain on. Master/orchestration workflows that complete an end-to-end deliverable SHOULD include a `final-pr` step using this primitive; specs must explicitly opt out if PR creation is not desired. - **Reviewable wording:** Workflow requirements must be specific enough for grep checks, structural checks, dry-run output, review artifacts, or scoped diff review. Avoid broad prose that cannot be verified by deterministic gates or reviewer inspection. - **Env loading:** Load `.env.local` and `.env` before `.run(...)` without overwriting exported values. Fail fast with `MISSING_ENV_VAR: ` before expensive agent steps. - **Scoped change detection:** After implementation steps, verify the repo changed in the expected scope using `git diff --name-only` plus `git ls-files --others --exclude-standard`, scoped to declared file targets. Do not use repo-wide `git diff --quiet` when unrelated work may be present. 
diff --git a/README.md b/README.md index 9cf3ca1a..59d61224 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,12 @@ npm scripts (canonical ordering matches `package.json`): - `npm test` — bundle the CLI, then run the full test suite and proof tests - `npm start` — launch the interactive CLI from `src/surfaces/cli/commands/cli-main.ts` - `npm run dev` — alias for `npm start` +- `npm run evals` — compile and run the Ricky eval suite +- `npm run evals:compile` — compile Ricky eval definitions +- `npm run evals:opencode` — run Ricky evals through the OpenCode executor path +- `npm run evals:list` — list available Ricky evals +- `npm run evals:summary` — summarize the latest Ricky eval results +- `npm run evals:compare` — compare Ricky eval result sets - `npm run batch` — run workflow batches via `scripts/run-ricky-batch.sh` - `npm run overnight` — run the overnight workflow queue via `scripts/run-ricky-overnight.sh` - default queue mode is now `flight-safe`, which only runs the workflows currently classified as unattended-safe diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 00000000..39cbd3e9 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,147 @@ +# Ricky Evals + +This directory holds human-authored product evals for Ricky. The shared loading, +filtering, deterministic checks, human-review marking, and run artifact writing +come from `@agent-assistant/telemetry/evals`; Ricky keeps the domain-specific +cases, rubrics, and product executors here. + +## Start Here + +Write new evals in `evals/suites/<suite>/cases.md`. 
Each `## case-id` block is +compiled into generated `cases.jsonl` by: + +```sh +npm run evals:compile +``` + +Run all current evals: + +```sh +npm run evals +``` + +Useful filters: + +```sh +npm run evals -- --suite workflow-authoring +npm run evals -- --case workflow-authoring.deterministic-gates +npm run evals -- --tag local +npm run evals:list +``` + +Run history and review worksheets are written under `.ricky/evals/runs/`, which +is intentionally ignored by git. + +## Running Against OpenCode + +Ricky can also run the human-review cases against a local OpenCode one-shot +model. This path does not need `OPENROUTER_API_KEY`; it shells out to +`opencode run -m <model>` and captures the answer into the normal +human-review worksheet. + +```sh +npm run evals:opencode -- --suite workflow-authoring +``` + +By default this uses `opencode/minimax-m2.5-free`. Override the local/free model or +binary with environment variables: + +```sh +RICKY_EVAL_OPENCODE_MODEL=opencode/nemotron-3-super-free npm run evals:opencode -- --tag workflow-authoring +RICKY_EVAL_OPENCODE_BIN=/path/to/opencode npm run evals:opencode -- --case generation-quality.workflow-contract +``` + +For a case-specific provider run, set `Executor: opencode` in the case. To run +the existing `Executor: manual` cases through OpenCode without editing them, use +`npm run evals:opencode`. + +Agent Relay is still the better fit for heavier evals that need real worker +topology, tool-mediated execution, or multi-agent coordination. The direct +OpenCode executor is intentionally small so local quality sweeps stay cheap and +fast. + +## Writing Manual Cases + +Use `Executor: manual` when you want to capture a Ricky behavior expectation for +humans to judge. Put the user request in `### Message`, then write concrete +`### Must` and `### Must Not` bullets. These become the human-review rubric. + +To evaluate a real Ricky answer manually, paste it into `### Candidate Output` +or point to a file with `### Candidate Output Path`. 
If no output is supplied, +the run still creates a review worksheet so the expected behavior is visible. + +Minimal manual case: + +```text +## workflow-authoring.your-case-id +Executor: manual +Kind: capability +Tags: workflow-authoring +Human Review: true + +### Message +Ask Ricky to do the thing you care about. + +### Must +- State the behavior a good Ricky response must show. + +### Must Not +- State the regression or product failure this eval should catch. +``` + +## Deterministic CLI Cases + +Use `Executor: ricky-cli` for small command-surface checks. Put the command +arguments in `### Mock` as `argv: ...`; the runner invokes the source CLI through +local `tsx`. + +```text +## cli.example +Executor: ricky-cli +Kind: regression +Tags: cli + +### Message +--help + +### Mock +argv: --help + +### Deterministic Checks +ok: true +contentIncludes: +- ricky run +forbidPhrases: +- TypeError +``` + +Keep deterministic cases narrow and cheap. Use human-review cases for planning +quality, workflow authoring judgment, and any behavior where a senior engineer +needs to read the output. + +## Source Map + +The current suites sweep the repo's existing product and architecture docs: + +- `cli-behavior` covers `README.md`, `docs/product/ricky-cli-onboarding-ux-spec.md`, + `docs/product/ricky-cofounder-interactive-readiness-checklist.md`, and + `specs/cli-version-from-package-json.md`. +- `workflow-authoring` covers `AGENTS.md`, + `docs/workflows/WORKFLOW_STANDARDS.md`, + `workflows/shared/WORKFLOW_AUTHORING_RULES.md`, and + workflow authoring expectations in `SPEC.md`. +- `runtime-recovery` covers `SPEC.md`, + `docs/architecture/ricky-failure-taxonomy-and-unblockers.md`, + `docs/architecture/ricky-runtime-architecture.md`, + `specs/cli-auto-fix-and-resume.md`, and + `specs/in-process-workflow-runner.md`. 
+- `surfaces-ingress` covers `docs/architecture/ricky-surfaces-and-ingress.md`, + `docs/product/ricky-cli-onboarding-ux-spec.md`, + `specs/cloud-runtime-execute-artifact.md`, and + `specs/linear-integration.md`. +- `generation-quality` covers `SPEC.md`, + `specs/workflow-generation-quality.md`, and + `docs/product/ricky-skill-embedding-boundary.md`. +- `agent-assistant-boundary` covers the Agent Assistant adoption audit, + boundary, proof, live proof, and local execution reuse documents under + `docs/product/`. diff --git a/evals/fixtures/transcripts/.gitkeep b/evals/fixtures/transcripts/.gitkeep new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/evals/fixtures/transcripts/.gitkeep @@ -0,0 +1 @@ + diff --git a/evals/suites/agent-assistant-boundary/cases.jsonl b/evals/suites/agent-assistant-boundary/cases.jsonl new file mode 100644 index 00000000..27b9a50f --- /dev/null +++ b/evals/suites/agent-assistant-boundary/cases.jsonl @@ -0,0 +1,7 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. 
+{"id":"agent-assistant-boundary.real-reuse-not-rhetorical","suite":"agent-assistant-boundary","executor":"manual","kind":"regression","input":{"message":"Update Ricky docs and code to say it uses Agent Assistant more deeply."},"expected":{"maxToolCalls":0,"must":["Ground claims in real package imports and runtime paths.","Distinguish current implementation from target architecture.","Identify which Agent Assistant primitive is actually exercised."],"mustNot":["Rename local code to sound Agent Assistant aligned and count that as adoption.","Claim broad Agent Assistant native behavior from documentation-only alignment.","Blur target architecture with landed behavior."],"humanReviewRequired":true},"tags":["agent-assistant","boundary"]} +{"id":"agent-assistant-boundary.turn-context-preserves-ricky-envelope","suite":"agent-assistant-boundary","executor":"manual","kind":"regression","input":{"message":"Evaluate the current Ricky `@agent-assistant/turn-context` adoption."},"expected":{"maxToolCalls":0,"must":["Preserve request id, source metadata, structured spec, invocation root, mode, stage mode, spec path, metadata, and spec text.","Record compact provenance through generation decisions or coordinator metadata.","Keep the shared turn context internal to the adapter boundary."],"mustNot":["Move LocalResponse, blocker taxonomy, recovery wording, or execution semantics into the shared turn-context package.","Drop Ricky-specific workflow metadata during envelope assembly.","Treat turn context as a product decision engine."],"humanReviewRequired":true},"tags":["agent-assistant","turn-context"]} +{"id":"agent-assistant-boundary.product-core-stays-ricky-owned","suite":"agent-assistant-boundary","executor":"manual","kind":"capability","input":{"message":"Decide whether workflow generation, validation, debugging, staged CLI UX, and blocker/evidence wording should move into Agent Assistant."},"expected":{"maxToolCalls":0,"must":["Keep product-defining workflow generation, 
validation, debugging, local UX, and evidence wording Ricky-owned until proof says otherwise.","Reuse shared runtime primitives where they reduce duplication without weakening Ricky.","Make extraction follow typed, tested, live product proof."],"mustNot":["Generalize workflow-specific behavior prematurely.","Adopt moving shared seams merely for architectural purity.","Lose the precise local-first staged workflow UX."],"humanReviewRequired":true},"tags":["agent-assistant","product-core"]} +{"id":"agent-assistant-boundary.one-slice-at-a-time","suite":"agent-assistant-boundary","executor":"manual","kind":"capability","input":{"message":"Plan the next Agent Assistant adoption slice for Ricky."},"expected":{"maxToolCalls":0,"must":["Pick exactly one real shared seam to evaluate or adopt.","Define a live Ricky product path that will prove the adoption.","Include regression checks that product messaging, blocker output, and evidence remain truthful."],"mustNot":["Bundle sessions, memory, policy, proactive behavior, and execution extraction into one vague migration.","Skip the comparison/evaluation step for mature Ricky-local seams.","Treat adoption as successful without a live product-path proof."],"humanReviewRequired":true},"tags":["agent-assistant","adoption"]} +{"id":"agent-assistant-boundary.future-surfaces-use-shared-runtime","suite":"agent-assistant-boundary","executor":"manual","kind":"capability","input":{"message":"Design future Slack or web support for Ricky using Agent Assistant packages."},"expected":{"maxToolCalls":0,"must":["Prefer shared surfaces, webhook-runtime, sessions, and routing primitives for future non-CLI interaction where mature.","Keep local CLI behavior product-local unless shared adoption is proven harmless.","Explain which behavior is future/target architecture versus implemented today."],"mustNot":["Preemptively add memory, policy, or proactive packages without a real Ricky product requirement.","Let future surface abstractions distort the 
current CLI contract.","Duplicate a mature Agent Assistant capability locally without justification."],"humanReviewRequired":true},"tags":["agent-assistant","surfaces"]} diff --git a/evals/suites/agent-assistant-boundary/cases.md b/evals/suites/agent-assistant-boundary/cases.md new file mode 100644 index 00000000..cc79d6e4 --- /dev/null +++ b/evals/suites/agent-assistant-boundary/cases.md @@ -0,0 +1,114 @@ +# Agent Assistant Boundary Cases + +These cases come from the Agent Assistant audit, adoption boundary, local +execution contract evaluation, adoption proof, and live proof documents. + +## agent-assistant-boundary.real-reuse-not-rhetorical +Executor: manual +Kind: regression +Tags: agent-assistant, boundary +Human Review: true + +### Message +Update Ricky docs and code to say it uses Agent Assistant more deeply. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Ground claims in real package imports and runtime paths. +- Distinguish current implementation from target architecture. +- Identify which Agent Assistant primitive is actually exercised. + +### Must Not +- Rename local code to sound Agent Assistant aligned and count that as adoption. +- Claim broad Agent Assistant native behavior from documentation-only alignment. +- Blur target architecture with landed behavior. + +## agent-assistant-boundary.turn-context-preserves-ricky-envelope +Executor: manual +Kind: regression +Tags: agent-assistant, turn-context +Human Review: true + +### Message +Evaluate the current Ricky `@agent-assistant/turn-context` adoption. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Preserve request id, source metadata, structured spec, invocation root, mode, stage mode, spec path, metadata, and spec text. +- Record compact provenance through generation decisions or coordinator metadata. +- Keep the shared turn context internal to the adapter boundary. 
+ +### Must Not +- Move LocalResponse, blocker taxonomy, recovery wording, or execution semantics into the shared turn-context package. +- Drop Ricky-specific workflow metadata during envelope assembly. +- Treat turn context as a product decision engine. + +## agent-assistant-boundary.product-core-stays-ricky-owned +Executor: manual +Kind: capability +Tags: agent-assistant, product-core +Human Review: true + +### Message +Decide whether workflow generation, validation, debugging, staged CLI UX, and blocker/evidence wording should move into Agent Assistant. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Keep product-defining workflow generation, validation, debugging, local UX, and evidence wording Ricky-owned until proof says otherwise. +- Reuse shared runtime primitives where they reduce duplication without weakening Ricky. +- Make extraction follow typed, tested, live product proof. + +### Must Not +- Generalize workflow-specific behavior prematurely. +- Adopt moving shared seams merely for architectural purity. +- Lose the precise local-first staged workflow UX. + +## agent-assistant-boundary.one-slice-at-a-time +Executor: manual +Kind: capability +Tags: agent-assistant, adoption +Human Review: true + +### Message +Plan the next Agent Assistant adoption slice for Ricky. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Pick exactly one real shared seam to evaluate or adopt. +- Define a live Ricky product path that will prove the adoption. +- Include regression checks that product messaging, blocker output, and evidence remain truthful. + +### Must Not +- Bundle sessions, memory, policy, proactive behavior, and execution extraction into one vague migration. +- Skip the comparison/evaluation step for mature Ricky-local seams. +- Treat adoption as successful without a live product-path proof. 
+ +## agent-assistant-boundary.future-surfaces-use-shared-runtime +Executor: manual +Kind: capability +Tags: agent-assistant, surfaces +Human Review: true + +### Message +Design future Slack or web support for Ricky using Agent Assistant packages. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Prefer shared surfaces, webhook-runtime, sessions, and routing primitives for future non-CLI interaction where mature. +- Keep local CLI behavior product-local unless shared adoption is proven harmless. +- Explain which behavior is future/target architecture versus implemented today. + +### Must Not +- Preemptively add memory, policy, or proactive packages without a real Ricky product requirement. +- Let future surface abstractions distort the current CLI contract. +- Duplicate a mature Agent Assistant capability locally without justification. diff --git a/evals/suites/agent-assistant-boundary/rubric.md b/evals/suites/agent-assistant-boundary/rubric.md new file mode 100644 index 00000000..60ef5290 --- /dev/null +++ b/evals/suites/agent-assistant-boundary/rubric.md @@ -0,0 +1,17 @@ +# Agent Assistant Boundary Rubric + +Use this suite for Ricky's relationship to Agent Assistant packages and shared +assistant runtime seams. + +## Human Review Questions + +1. Does the answer separate current implementation from target architecture? +2. Is shared reuse real, narrow, and proven? +3. Does Ricky keep workflow-specific product behavior local where appropriate? +4. Is extraction gated by typed tests and live product proof? +5. Does adoption reduce product burden rather than add indirection? + +## Suggested Pass Bar + +Pass only when the boundary is honest, specific, and grounded in actual Ricky +runtime behavior. 
diff --git a/evals/suites/cli-behavior/cases.jsonl b/evals/suites/cli-behavior/cases.jsonl new file mode 100644 index 00000000..e6bc95cd --- /dev/null +++ b/evals/suites/cli-behavior/cases.jsonl @@ -0,0 +1,8 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. +{"id":"cli.help-surfaces-local-cloud-and-run","suite":"cli-behavior","executor":"ricky-cli","kind":"regression","input":{"message":"--help"},"expected":{"ok":true,"contentIncludes":["ricky local --spec","ricky run ","ricky status"],"forbidPhrases":["TypeError","ReferenceError","stack trace"],"maxToolCalls":1,"must":["Show the user the local, Cloud, run, status, and connect surfaces without requiring interactive setup.","Keep the help output truthful to the implemented CLI commands."],"mustNot":["Print a stack trace or raw implementation failure for help.","Hide the local/BYOH run path behind Cloud-only language."],"humanReviewRequired":false},"tags":["cli","onboarding","local","cloud"],"mock":{"argv":"--help"}} +{"id":"cli.version-prints-package-version","suite":"cli-behavior","executor":"ricky-cli","kind":"regression","input":{"message":"version"},"expected":{"ok":true,"contentMatches":["^ricky 0\\.1\\.\\d+"],"forbidPhrases":["TypeError","ReferenceError"],"maxToolCalls":1,"must":["Print the package version as a short script-friendly value."],"mustNot":["Start the interactive onboarding flow for `version`."],"humanReviewRequired":false},"tags":["cli","packaging"],"mock":{"argv":"version"}} +{"id":"cli.generation-default-not-execution","suite":"cli-behavior","executor":"manual","kind":"regression","input":{"message":"A user runs `ricky --mode local --spec \"generate a workflow for package checks\"` without `--run`."},"expected":{"maxToolCalls":0,"must":["Say generation is the default and execution was not requested.","Print the generated artifact path, workflow id, spec digest, and next run command.","Avoid showing 
execution evidence for a generation-only request."],"mustNot":["Imply the workflow ran automatically.","Present a generation-only result as execution success.","Hide the opt-in commands for running the artifact."],"humanReviewRequired":true},"tags":["cli","onboarding","local"]} +{"id":"cli.first-run-copy-is-compact-and-truthful","suite":"cli-behavior","executor":"manual","kind":"capability","input":{"message":"Render Ricky's first-run CLI onboarding for a new user."},"expected":{"maxToolCalls":0,"must":["Show compact Ricky branding and clear Local / BYOH, Cloud, Both, and Just explore choices.","End every branch with a concrete next step.","Advertise only commands that are currently implemented."],"mustNot":["Sound like a launch page or documentation dump.","Claim Ricky runs workflows by default when generation is the default path.","Require web or Slack onboarding before CLI use."],"humanReviewRequired":true},"tags":["cli","onboarding"]} +{"id":"cli.recovery-guidance-no-stack-traces","suite":"cli-behavior","executor":"manual","kind":"regression","input":{"message":"A user gives Ricky an empty spec or a missing spec file."},"expected":{"maxToolCalls":0,"must":["Return a user-facing failure or guidance message with a real recovery command.","Distinguish generation failure from execution failure.","Show stack traces only when verbose diagnostic mode is requested."],"mustNot":["Crash with an uncaught exception in normal mode.","Suggest commands that do not exist.","Pretend a missing spec was accepted."],"humanReviewRequired":true},"tags":["cli","recovery"]} +{"id":"cli.status-does-not-invent-provider-state","suite":"cli-behavior","executor":"manual","kind":"regression","input":{"message":"Render `ricky status` when no provider checks have proven Google or GitHub are connected."},"expected":{"maxToolCalls":0,"must":["Report unknown or not-connected provider state honestly.","Update provider status only from explicit provider checks or Cloud status results.","Give 
concrete setup guidance for Cloud when relevant."],"mustNot":["Mark Google or GitHub connected because guidance text was shown.","Invent a provider connection URL or OAuth flow.","Show empty fields with no recovery guidance when config is missing."],"humanReviewRequired":true},"tags":["cli","status","cloud"]} diff --git a/evals/suites/cli-behavior/cases.md b/evals/suites/cli-behavior/cases.md new file mode 100644 index 00000000..491b5b3e --- /dev/null +++ b/evals/suites/cli-behavior/cases.md @@ -0,0 +1,151 @@ +# CLI Behavior Cases + +These cases protect small, deterministic CLI promises. They should stay cheap +enough to run in the default offline eval suite. + +## cli.help-surfaces-local-cloud-and-run +Executor: ricky-cli +Kind: regression +Tags: cli, onboarding, local, cloud +Human Review: false + +### Message +--help + +### Mock +argv: --help + +### Deterministic Checks +ok: true +contentIncludes: +- ricky local --spec +- ricky run +- ricky status +forbidPhrases: +- TypeError +- ReferenceError +- stack trace +maxToolCalls: 1 + +### Must +- Show the user the local, Cloud, run, status, and connect surfaces without requiring interactive setup. +- Keep the help output truthful to the implemented CLI commands. + +### Must Not +- Print a stack trace or raw implementation failure for help. +- Hide the local/BYOH run path behind Cloud-only language. + +## cli.version-prints-package-version +Executor: ricky-cli +Kind: regression +Tags: cli, packaging +Human Review: false + +### Message +version + +### Mock +argv: version + +### Deterministic Checks +ok: true +contentMatches: +- ^ricky 0\.1\.\d+ +forbidPhrases: +- TypeError +- ReferenceError +maxToolCalls: 1 + +### Must +- Print the package version as a short script-friendly value. + +### Must Not +- Start the interactive onboarding flow for `version`. 
+ +## cli.generation-default-not-execution +Executor: manual +Kind: regression +Tags: cli, onboarding, local +Human Review: true + +### Message +A user runs `ricky --mode local --spec "generate a workflow for package checks"` without `--run`. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Say generation is the default and execution was not requested. +- Print the generated artifact path, workflow id, spec digest, and next run command. +- Avoid showing execution evidence for a generation-only request. + +### Must Not +- Imply the workflow ran automatically. +- Present a generation-only result as execution success. +- Hide the opt-in commands for running the artifact. + +## cli.first-run-copy-is-compact-and-truthful +Executor: manual +Kind: capability +Tags: cli, onboarding +Human Review: true + +### Message +Render Ricky's first-run CLI onboarding for a new user. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Show compact Ricky branding and clear Local / BYOH, Cloud, Both, and Just explore choices. +- End every branch with a concrete next step. +- Advertise only commands that are currently implemented. + +### Must Not +- Sound like a launch page or documentation dump. +- Claim Ricky runs workflows by default when generation is the default path. +- Require web or Slack onboarding before CLI use. + +## cli.recovery-guidance-no-stack-traces +Executor: manual +Kind: regression +Tags: cli, recovery +Human Review: true + +### Message +A user gives Ricky an empty spec or a missing spec file. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Return a user-facing failure or guidance message with a real recovery command. +- Distinguish generation failure from execution failure. +- Show stack traces only when verbose diagnostic mode is requested. + +### Must Not +- Crash with an uncaught exception in normal mode. +- Suggest commands that do not exist. +- Pretend a missing spec was accepted. 
+ +## cli.status-does-not-invent-provider-state +Executor: manual +Kind: regression +Tags: cli, status, cloud +Human Review: true + +### Message +Render `ricky status` when no provider checks have proven Google or GitHub are connected. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Report unknown or not-connected provider state honestly. +- Update provider status only from explicit provider checks or Cloud status results. +- Give concrete setup guidance for Cloud when relevant. + +### Must Not +- Mark Google or GitHub connected because guidance text was shown. +- Invent a provider connection URL or OAuth flow. +- Show empty fields with no recovery guidance when config is missing. diff --git a/evals/suites/cli-behavior/rubric.md b/evals/suites/cli-behavior/rubric.md new file mode 100644 index 00000000..014ff934 --- /dev/null +++ b/evals/suites/cli-behavior/rubric.md @@ -0,0 +1,10 @@ +# CLI Behavior Rubric + +Use this suite for deterministic command-surface behavior. + +## Pass Bar + +The command should complete without stack traces, preserve scriptable output, and +name only implemented Ricky surfaces. CLI evals should avoid broad product +judgment; use workflow-authoring cases when the quality bar depends on reading a +plan or generated workflow. diff --git a/evals/suites/generation-quality/cases.jsonl b/evals/suites/generation-quality/cases.jsonl new file mode 100644 index 00000000..42ab726f --- /dev/null +++ b/evals/suites/generation-quality/cases.jsonl @@ -0,0 +1,9 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. 
+{"id":"generation-quality.skill-matcher-registry-backed","suite":"generation-quality","executor":"manual","kind":"capability","input":{"message":"Generate a workflow for a GitHub primitive change and explain which skills Ricky selected."},"expected":{"maxToolCalls":0,"must":["Select skills from the actual registry rather than a hardcoded static set.","Record ranked skill matches with confidence and trigger evidence.","Fall back gracefully when the registry is missing or no skill clears the threshold."],"mustNot":["Claim runtime agents embody skills when only generation-time guidance was applied.","Fail generation solely because optional skill files are missing.","Hide skill selection evidence from artifacts or JSON output."],"humanReviewRequired":true},"tags":["generation","skills"]} +{"id":"generation-quality.tool-selector-honors-spec-hints","suite":"generation-quality","executor":"manual","kind":"capability","input":{"message":"Generate a workflow where the spec says \"use Claude to review and Codex to implement\"."},"expected":{"maxToolCalls":0,"must":["Assign per-step runner/model decisions from explicit spec hints where possible.","Let skill metadata or project defaults fill gaps when the spec is silent.","Write `tool-selection.json` or equivalent audit metadata."],"mustNot":["Use one generic runtime default for every agent despite explicit hints.","Let skill defaults override explicit user/spec runner hints.","Omit the reason each tool or model was chosen."],"humanReviewRequired":true},"tags":["generation","tools"]} +{"id":"generation-quality.refine-is-opt-in-and-bounded","suite":"generation-quality","executor":"manual","kind":"capability","input":{"message":"Use `--refine` to sharpen a generated workflow's step task descriptions and acceptance gates."},"expected":{"maxToolCalls":0,"must":["Keep the deterministic unrefined path as the default.","Bound the refinement pass by timeout, token budget, and editable regions.","Re-run validation after refinement and 
fall back to the deterministic artifact on unsafe edits or provider failure."],"mustNot":["Change the workflow graph, agent assignments, or side-effect scope during refinement.","Fail the whole generation if optional refinement times out.","Hide warnings when the deterministic artifact is returned unchanged."],"humanReviewRequired":true},"tags":["generation","refine","llm"]} +{"id":"generation-quality.behavior-grounded-gates","suite":"generation-quality","executor":"manual","kind":"regression","input":{"message":"Generate a workflow for the `ricky --version` spec."},"expected":{"maxToolCalls":0,"must":["Build gates from the stated acceptance behavior, such as checking `ricky --version` output.","Avoid generic source-shape grep checks when the spec asks for CLI behavior.","Keep generated validation meaningful for the current repo shape."],"mustNot":["Treat `grep -Eq 'export|function|class|workflow(' dist/ricky.js` as proof of version behavior.","Claim the workflow is proven by source syntax alone.","Ignore the package-json version resolution order in the spec."],"humanReviewRequired":true},"tags":["generation","validation"]} +{"id":"generation-quality.pattern-selection-deliberate","suite":"generation-quality","executor":"manual","kind":"regression","input":{"message":"Generate a workflow for many independent artifacts with a validation/fix/rerun loop."},"expected":{"maxToolCalls":0,"must":["Choose `dag`, `supervisor`, or `pipeline` deliberately based on the work shape.","Explain the pattern choice in artifact metadata or a rationale.","Use `dag` for validation/fix/rerun loops when dependencies matter."],"mustNot":["Default blindly to `dag` for every workflow.","Collapse independent artifact work into one vague agent task.","Omit verification gates because the chosen pattern seems obvious."],"humanReviewRequired":true},"tags":["generation","pattern"]} 
+{"id":"generation-quality.skill-boundary-copy","suite":"generation-quality","executor":"manual","kind":"regression","input":{"message":"Describe how selected workflow-writing skills affected a generated Ricky workflow."},"expected":{"maxToolCalls":0,"must":["Describe skills as generation-time selection, loading, and rendering inputs.","Point to metadata such as `loaded-skills.txt` and `skill-application-boundary.json`.","Say the workflow instructions were informed by selected skills."],"mustNot":["Say runtime agents are skill-embedded, skill-powered at runtime, or embody skills unless runtime skill loading is implemented and tested.","Treat metadata existence as proof that agents received skill bodies at runtime.","Overstate current tests beyond the generation boundary."],"humanReviewRequired":true},"tags":["generation","skills","copy"]} +{"id":"generation-quality.no-pure-codegen-without-proof","suite":"generation-quality","executor":"manual","kind":"regression","input":{"message":"Generate a workflow from a vague product spec and return it to the user."},"expected":{"maxToolCalls":0,"must":["Produce a Relay-native TypeScript workflow with explicit verification, review, and signoff.","Validate with dry-run or targeted structural checks where possible.","Return artifacts, warnings, and follow-up commands honestly."],"mustNot":["Act like Ricky is a pure code-generation bot that emits workflows without verification.","Stop at \"code compiles\" as the proof bar.","Skip skill-aware workflow authoring guidance for serious workflows."],"humanReviewRequired":true},"tags":["generation","proof"]} diff --git a/evals/suites/generation-quality/cases.md b/evals/suites/generation-quality/cases.md new file mode 100644 index 00000000..0c9ff5b2 --- /dev/null +++ b/evals/suites/generation-quality/cases.md @@ -0,0 +1,158 @@ +# Generation Quality Cases + +These cases come from `SPEC.md`, `specs/workflow-generation-quality.md`, +`docs/product/ricky-skill-embedding-boundary.md`, and the 
CLI proof specs. + +## generation-quality.skill-matcher-registry-backed +Executor: manual +Kind: capability +Tags: generation, skills +Human Review: true + +### Message +Generate a workflow for a GitHub primitive change and explain which skills Ricky selected. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Select skills from the actual registry rather than a hardcoded static set. +- Record ranked skill matches with confidence and trigger evidence. +- Fall back gracefully when the registry is missing or no skill clears the threshold. + +### Must Not +- Claim runtime agents embody skills when only generation-time guidance was applied. +- Fail generation solely because optional skill files are missing. +- Hide skill selection evidence from artifacts or JSON output. + +## generation-quality.tool-selector-honors-spec-hints +Executor: manual +Kind: capability +Tags: generation, tools +Human Review: true + +### Message +Generate a workflow where the spec says "use Claude to review and Codex to implement". + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Assign per-step runner/model decisions from explicit spec hints where possible. +- Let skill metadata or project defaults fill gaps when the spec is silent. +- Write `tool-selection.json` or equivalent audit metadata. + +### Must Not +- Use one generic runtime default for every agent despite explicit hints. +- Let skill defaults override explicit user/spec runner hints. +- Omit the reason each tool or model was chosen. + +## generation-quality.refine-is-opt-in-and-bounded +Executor: manual +Kind: capability +Tags: generation, refine, llm +Human Review: true + +### Message +Use `--refine` to sharpen a generated workflow's step task descriptions and acceptance gates. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Keep the deterministic unrefined path as the default. +- Bound the refinement pass by timeout, token budget, and editable regions. 
+- Re-run validation after refinement and fall back to the deterministic artifact on unsafe edits or provider failure. + +### Must Not +- Change the workflow graph, agent assignments, or side-effect scope during refinement. +- Fail the whole generation if optional refinement times out. +- Hide warnings when the deterministic artifact is returned unchanged. + +## generation-quality.behavior-grounded-gates +Executor: manual +Kind: regression +Tags: generation, validation +Human Review: true + +### Message +Generate a workflow for the `ricky --version` spec. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Build gates from the stated acceptance behavior, such as checking `ricky --version` output. +- Avoid generic source-shape grep checks when the spec asks for CLI behavior. +- Keep generated validation meaningful for the current repo shape. + +### Must Not +- Treat `grep -Eq 'export|function|class|workflow(' dist/ricky.js` as proof of version behavior. +- Claim the workflow is proven by source syntax alone. +- Ignore the package-json version resolution order in the spec. + +## generation-quality.pattern-selection-deliberate +Executor: manual +Kind: regression +Tags: generation, pattern +Human Review: true + +### Message +Generate a workflow for many independent artifacts with a validation/fix/rerun loop. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Choose `dag`, `supervisor`, or `pipeline` deliberately based on the work shape. +- Explain the pattern choice in artifact metadata or a rationale. +- Use `dag` for validation/fix/rerun loops when dependencies matter. + +### Must Not +- Default blindly to `dag` for every workflow. +- Collapse independent artifact work into one vague agent task. +- Omit verification gates because the chosen pattern seems obvious. 
+ +## generation-quality.skill-boundary-copy +Executor: manual +Kind: regression +Tags: generation, skills, copy +Human Review: true + +### Message +Describe how selected workflow-writing skills affected a generated Ricky workflow. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Describe skills as generation-time selection, loading, and rendering inputs. +- Point to metadata such as `loaded-skills.txt` and `skill-application-boundary.json`. +- Say the workflow instructions were informed by selected skills. + +### Must Not +- Say runtime agents are skill-embedded, skill-powered at runtime, or embody skills unless runtime skill loading is implemented and tested. +- Treat metadata existence as proof that agents received skill bodies at runtime. +- Overstate current tests beyond the generation boundary. + +## generation-quality.no-pure-codegen-without-proof +Executor: manual +Kind: regression +Tags: generation, proof +Human Review: true + +### Message +Generate a workflow from a vague product spec and return it to the user. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Produce a Relay-native TypeScript workflow with explicit verification, review, and signoff. +- Validate with dry-run or targeted structural checks where possible. +- Return artifacts, warnings, and follow-up commands honestly. + +### Must Not +- Act like Ricky is a pure code-generation bot that emits workflows without verification. +- Stop at "code compiles" as the proof bar. +- Skip skill-aware workflow authoring guidance for serious workflows. diff --git a/evals/suites/generation-quality/rubric.md b/evals/suites/generation-quality/rubric.md new file mode 100644 index 00000000..e7c28a3f --- /dev/null +++ b/evals/suites/generation-quality/rubric.md @@ -0,0 +1,17 @@ +# Generation Quality Rubric + +Use this suite for workflow generation, skill matching, tool/model selection, +pattern choice, optional LLM refinement, and proof quality. + +## Human Review Questions + +1. 
Is the workflow Relay-native and specific to the user intent? +2. Are skill and tool decisions grounded in explicit evidence? +3. Are validation gates behavior-grounded and repo-aware? +4. Does optional refinement preserve the workflow graph and bounded scope? +5. Does the response avoid overstating what generation-time skills prove? + +## Suggested Pass Bar + +Pass only when the generated workflow is reviewable, auditable, and has proof +steps tied to the requested behavior. diff --git a/evals/suites/runtime-recovery/cases.jsonl b/evals/suites/runtime-recovery/cases.jsonl new file mode 100644 index 00000000..5aea64fe --- /dev/null +++ b/evals/suites/runtime-recovery/cases.jsonl @@ -0,0 +1,10 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. +{"id":"runtime-recovery.classify-before-retry","suite":"runtime-recovery","executor":"manual","kind":"regression","input":{"message":"A workflow failed after one step timed out and another worker stayed alive without producing artifacts. 
Explain what Ricky should do next."},"expected":{"maxToolCalls":0,"must":["Classify the failure before deciding whether to retry.","Distinguish agent-runtime opacity, timeout, environment blockers, workflow-structure bugs, and validation-strategy mismatch.","Preserve raw evidence and uncertainty when the class is not known."],"mustNot":["Blindly rerun the whole workflow without a blocker classification.","Treat every failure as a broken workflow definition.","Claim Ricky fixed the workflow before rerun evidence proves it."],"humanReviewRequired":true},"tags":["runtime","recovery","taxonomy"]} +{"id":"runtime-recovery.stale-relay-state","suite":"runtime-recovery","executor":"manual","kind":"regression","input":{"message":"Ricky detects stale `.agent-relay/`, `.relay/`, and `.trajectories/` state before launching a local workflow."},"expected":{"maxToolCalls":0,"must":["Classify stale local runtime state as an environment contamination issue.","Recommend quarantine or isolated-run guidance before launch.","Record the observed paths and the action taken or recommended."],"mustNot":["Treat stale runtime state as a workflow logic failure.","Delete or overwrite state without an explicit safe path or user intent.","Continue into execution as if the workspace were clean."],"humanReviewRequired":true},"tags":["runtime","environment"]} +{"id":"runtime-recovery.already-running-conflict","suite":"runtime-recovery","executor":"manual","kind":"regression","input":{"message":"A run marker says another Ricky or Relay run is already active in this workspace."},"expected":{"maxToolCalls":0,"must":["Report the active marker, run id, or status path when available.","Ask the user to inspect, wait for, or explicitly clear the active run.","Avoid launching a competing run that could corrupt evidence."],"mustNot":["Silently start another run.","Hide the existing run marker from the user.","Treat the conflict as a generic failure with no recovery 
path."],"humanReviewRequired":true},"tags":["runtime","safety"]} +{"id":"runtime-recovery.auto-fix-bounded-loop","suite":"runtime-recovery","executor":"manual","kind":"capability","input":{"message":"Run a local workflow with auto-fix enabled. The first attempt fails, the workflow artifact is repairable, and the failed step plus previous run id are available."},"expected":{"maxToolCalls":0,"must":["Use a bounded retry budget and summarize every attempt.","Ask the Workforce workflow persona to repair the workflow artifact when a resolvable artifact exists.","Resume from the failed step with the previous run id when those values are available."],"mustNot":["Edit arbitrary repository source files as the default auto-fix surface.","Keep retrying after the configured max attempts.","Lose the single Ricky tracking run id across repair/resume attempts."],"humanReviewRequired":true},"tags":["runtime","auto-fix","local"]} +{"id":"runtime-recovery.no-auto-fix-preserves-single-attempt","suite":"runtime-recovery","executor":"manual","kind":"regression","input":{"message":"A user runs `ricky run workflows/foo.ts --no-auto-fix` and the workflow fails."},"expected":{"maxToolCalls":0,"must":["Preserve one-attempt behavior when auto-fix is disabled.","Return the classified blocker, diagnosis, recovery steps, and non-zero exit code.","Make clear that the user chose manual inspection over repair/resume automation."],"mustNot":["Start a repair loop despite `--no-auto-fix`.","Suppress the diagnosis because no repair was attempted.","Present the failure as a completed repair attempt."],"humanReviewRequired":true},"tags":["runtime","auto-fix","cli"]} +{"id":"runtime-recovery.in-process-local-runner","suite":"runtime-recovery","executor":"manual","kind":"capability","input":{"message":"Explain how Ricky should execute a local TypeScript workflow artifact in the primary local path."},"expected":{"maxToolCalls":0,"must":["Prefer the Node strip-types route or equivalent SDK/programmatic 
route over requiring the `agent-relay` binary on PATH.","Precheck that Node and `@agent-relay/sdk` are resolvable for the workflow.","Record the actual spawn command in execution evidence."],"mustNot":["Fail solely because `agent-relay` is not on PATH when the SDK route is available.","Hide the actual runtime command from evidence.","Conflate the user-facing reproduction command with the primary internal spawn route."],"humanReviewRequired":true},"tags":["runtime","local","runner"]} +{"id":"runtime-recovery.escalation-is-not-generic-failure","suite":"runtime-recovery","executor":"manual","kind":"capability","input":{"message":"Ricky reaches a boundary after a structural failure persists after a fix attempt."},"expected":{"maxToolCalls":0,"must":["Escalate with the attempted fix, failed validation, classified blocker, and recommendation.","Distinguish escalation from a generic product failure.","Preserve enough context for a human operator to continue."],"mustNot":["Retry speculative fixes indefinitely.","Collapse to \"something went wrong\" without the attempted actions.","Discard evidence from failed repair attempts."],"humanReviewRequired":true},"tags":["runtime","escalation"]} +{"id":"runtime-recovery.analytics-from-structured-evidence","suite":"runtime-recovery","executor":"manual","kind":"capability","input":{"message":"Produce a workflow health digest from many Ricky workflow runs."},"expected":{"maxToolCalls":0,"must":["Consume normalized `WorkflowRunEvidence` rather than raw logs as the primary input.","Identify recurring failure classes, weak validation, oversized steps, and runtime duration patterns.","Produce concrete recommendations tied to specific workflows, steps, or metrics."],"mustNot":["Mutate evidence while analyzing it.","Return generic advice like \"improve your workflow\" without references.","Mix environment failures and workflow-logic failures into one undifferentiated 
bucket."],"humanReviewRequired":true},"tags":["runtime","analytics","evidence"]} diff --git a/evals/suites/runtime-recovery/cases.md b/evals/suites/runtime-recovery/cases.md new file mode 100644 index 00000000..6537b66f --- /dev/null +++ b/evals/suites/runtime-recovery/cases.md @@ -0,0 +1,180 @@ +# Runtime Recovery Cases + +These cases come from `SPEC.md`, the runtime architecture docs, the failure +taxonomy, the auto-fix spec, and the in-process runner spec. + +## runtime-recovery.classify-before-retry +Executor: manual +Kind: regression +Tags: runtime, recovery, taxonomy +Human Review: true + +### Message +A workflow failed after one step timed out and another worker stayed alive without producing artifacts. Explain what Ricky should do next. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Classify the failure before deciding whether to retry. +- Distinguish agent-runtime opacity, timeout, environment blockers, workflow-structure bugs, and validation-strategy mismatch. +- Preserve raw evidence and uncertainty when the class is not known. + +### Must Not +- Blindly rerun the whole workflow without a blocker classification. +- Treat every failure as a broken workflow definition. +- Claim Ricky fixed the workflow before rerun evidence proves it. + +## runtime-recovery.stale-relay-state +Executor: manual +Kind: regression +Tags: runtime, environment +Human Review: true + +### Message +Ricky detects stale `.agent-relay/`, `.relay/`, and `.trajectories/` state before launching a local workflow. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Classify stale local runtime state as an environment contamination issue. +- Recommend quarantine or isolated-run guidance before launch. +- Record the observed paths and the action taken or recommended. + +### Must Not +- Treat stale runtime state as a workflow logic failure. +- Delete or overwrite state without an explicit safe path or user intent. +- Continue into execution as if the workspace were clean. 
+ +## runtime-recovery.already-running-conflict +Executor: manual +Kind: regression +Tags: runtime, safety +Human Review: true + +### Message +A run marker says another Ricky or Relay run is already active in this workspace. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Report the active marker, run id, or status path when available. +- Ask the user to inspect, wait for, or explicitly clear the active run. +- Avoid launching a competing run that could corrupt evidence. + +### Must Not +- Silently start another run. +- Hide the existing run marker from the user. +- Treat the conflict as a generic failure with no recovery path. + +## runtime-recovery.auto-fix-bounded-loop +Executor: manual +Kind: capability +Tags: runtime, auto-fix, local +Human Review: true + +### Message +Run a local workflow with auto-fix enabled. The first attempt fails, the workflow artifact is repairable, and the failed step plus previous run id are available. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Use a bounded retry budget and summarize every attempt. +- Ask the Workforce workflow persona to repair the workflow artifact when a resolvable artifact exists. +- Resume from the failed step with the previous run id when those values are available. + +### Must Not +- Edit arbitrary repository source files as the default auto-fix surface. +- Keep retrying after the configured max attempts. +- Lose the single Ricky tracking run id across repair/resume attempts. + +## runtime-recovery.no-auto-fix-preserves-single-attempt +Executor: manual +Kind: regression +Tags: runtime, auto-fix, cli +Human Review: true + +### Message +A user runs `ricky run workflows/foo.ts --no-auto-fix` and the workflow fails. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Preserve one-attempt behavior when auto-fix is disabled. +- Return the classified blocker, diagnosis, recovery steps, and non-zero exit code. 
+- Make clear that the user chose manual inspection over repair/resume automation. + +### Must Not +- Start a repair loop despite `--no-auto-fix`. +- Suppress the diagnosis because no repair was attempted. +- Present the failure as a completed repair attempt. + +## runtime-recovery.in-process-local-runner +Executor: manual +Kind: capability +Tags: runtime, local, runner +Human Review: true + +### Message +Explain how Ricky should execute a local TypeScript workflow artifact in the primary local path. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Prefer the Node strip-types route or equivalent SDK/programmatic route over requiring the `agent-relay` binary on PATH. +- Precheck that Node and `@agent-relay/sdk` are resolvable for the workflow. +- Record the actual spawn command in execution evidence. + +### Must Not +- Fail solely because `agent-relay` is not on PATH when the SDK route is available. +- Hide the actual runtime command from evidence. +- Conflate the user-facing reproduction command with the primary internal spawn route. + +## runtime-recovery.escalation-is-not-generic-failure +Executor: manual +Kind: capability +Tags: runtime, escalation +Human Review: true + +### Message +Ricky reaches a boundary after a structural failure persists after a fix attempt. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Escalate with the attempted fix, failed validation, classified blocker, and recommendation. +- Distinguish escalation from a generic product failure. +- Preserve enough context for a human operator to continue. + +### Must Not +- Retry speculative fixes indefinitely. +- Collapse to "something went wrong" without the attempted actions. +- Discard evidence from failed repair attempts. + +## runtime-recovery.analytics-from-structured-evidence +Executor: manual +Kind: capability +Tags: runtime, analytics, evidence +Human Review: true + +### Message +Produce a workflow health digest from many Ricky workflow runs. 
+ +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Consume normalized `WorkflowRunEvidence` rather than raw logs as the primary input. +- Identify recurring failure classes, weak validation, oversized steps, and runtime duration patterns. +- Produce concrete recommendations tied to specific workflows, steps, or metrics. + +### Must Not +- Mutate evidence while analyzing it. +- Return generic advice like "improve your workflow" without references. +- Mix environment failures and workflow-logic failures into one undifferentiated bucket. diff --git a/evals/suites/runtime-recovery/rubric.md b/evals/suites/runtime-recovery/rubric.md new file mode 100644 index 00000000..069192fa --- /dev/null +++ b/evals/suites/runtime-recovery/rubric.md @@ -0,0 +1,17 @@ +# Runtime Recovery Rubric + +Use this suite for Ricky's reliability behavior: classification, retry safety, +auto-fix, evidence, execution routes, and escalation. + +## Human Review Questions + +1. Did Ricky classify before retrying or repairing? +2. Did the answer preserve exact evidence and uncertainty? +3. Did it separate environment blockers from product or workflow failures? +4. Were repair attempts bounded, resumable, and artifact-scoped? +5. Would an operator know the next safe action? + +## Suggested Pass Bar + +Pass only when the response is evidence-backed, bounded, and honest about what +was fixed, retried, skipped, or escalated. diff --git a/evals/suites/surfaces-ingress/cases.jsonl b/evals/suites/surfaces-ingress/cases.jsonl new file mode 100644 index 00000000..44499729 --- /dev/null +++ b/evals/suites/surfaces-ingress/cases.jsonl @@ -0,0 +1,10 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. 
+{"id":"surfaces-ingress.co-equal-surfaces","suite":"surfaces-ingress","executor":"manual","kind":"capability","input":{"message":"Design a new Ricky Slack surface for workflow debugging."},"expected":{"maxToolCalls":0,"must":["Treat Slack as a co-equal product surface, not a wrapper around CLI.","Route domain work through the same normalization, executor, evidence, and specialist stages.","Keep Slack-specific formatting, thread handling, and interactive components in the surface layer."],"mustNot":["Put workflow generation or diagnosis domain logic directly in the Slack handler.","Degrade Slack to a developer shortcut with weaker routing than CLI.","Skip signature verification, dedup, or thread handling when the surface is implemented."],"humanReviewRequired":true},"tags":["surfaces","ingress"]} +{"id":"surfaces-ingress.normalizer-is-only-translation-boundary","suite":"surfaces-ingress","executor":"manual","kind":"regression","input":{"message":"Add a new web handoff type that submits a workflow spec and mode."},"expected":{"maxToolCalls":0,"must":["Add a handoff variant that normalizes into `LocalInvocationRequest` or `CloudGenerateRequest`.","Keep auth/session validation in the web surface before normalization.","Ensure downstream code does not need to know the request came from web."],"mustNot":["Short-circuit directly from the web handler to executors.","Create a parallel domain model for web requests.","Make the web surface the owner of local or Cloud routing semantics."],"humanReviewRequired":true},"tags":["surfaces","normalization"]} +{"id":"surfaces-ingress.mcp-claude-context-is-metadata","suite":"surfaces-ingress","executor":"manual","kind":"regression","input":{"message":"A Claude session hands Ricky a workflow spec plus conversation context and asks Ricky to determine whether to generate, debug, restart, analyze, or coordinate."},"expected":{"maxToolCalls":0,"must":["Normalize the tool call through the MCP/Claude handoff path.","Treat conversation 
context as advisory metadata.","Make the spec itself sufficient for routing whenever possible."],"mustNot":["Require the user to rewrite the spec manually as a workflow.","Put domain routing logic inside the MCP tool handler itself.","Ignore provided mode, workflow path, or prior decisions metadata."],"humanReviewRequired":true},"tags":["surfaces","mcp","claude"]} +{"id":"surfaces-ingress.cloud-api-versioning","suite":"surfaces-ingress","executor":"manual","kind":"regression","input":{"message":"Change the Cloud Ricky generate API response to include a new field and change one existing error code name."},"expected":{"maxToolCalls":0,"must":["Allow additive response fields within `/api/v1/ricky/...`.","Reject renaming or changing the meaning of an existing error code inside the same version.","Require a new API version for breaking request/response changes."],"mustNot":["Remove or rename existing response fields in v1.","Add a new required request field within v1.","Treat version transitions as accidental or unplanned."],"humanReviewRequired":true},"tags":["cloud","api","compatibility"]} +{"id":"surfaces-ingress.cloud-run-json-is-single-object","suite":"surfaces-ingress","executor":"manual","kind":"regression","input":{"message":"Implement `ricky run workflows/foo.ts --cloud --json`."},"expected":{"maxToolCalls":0,"must":["Return exactly one well-formed JSON object on stdout.","Include `runReceipt.runId` on success or an actionable `error` object on missing or invalid Cloud auth.","Suppress live tail, status lines, and human event text in JSON mode."],"mustNot":["Silently fall back to a Cloud stub when authenticated execution is rejected.","Mix human-readable progress lines with JSON output.","Upload large artifacts inline beyond the documented threshold without a clear error."],"humanReviewRequired":true},"tags":["cloud","cli","json"]} 
+{"id":"surfaces-ingress.linear-readiness-fail-fast","suite":"surfaces-ingress","executor":"manual","kind":"capability","input":{"message":"A Linear user mentions Ricky on an issue, but GitHub app installation is missing and the user has no connected agents."},"expected":{"maxToolCalls":0,"must":["Run readiness checks before workflow generation.","First report the missing GitHub app install with a connect link and end the session awaiting install.","Avoid generating or launching a workflow until required readiness passes."],"mustNot":["Generate a workflow before checking GitHub and connected-agent readiness.","Invent a Ricky-specific GitHub auth flow.","Post multiple noisy AgentActivity responses for the same readiness blocker."],"humanReviewRequired":true},"tags":["linear","cloud","readiness"]} +{"id":"surfaces-ingress.linear-pr-link-completion","suite":"surfaces-ingress","executor":"manual","kind":"capability","input":{"message":"Ricky completes a Linear-triggered Cloud workflow that opened a GitHub PR."},"expected":{"maxToolCalls":0,"must":["Capture the PR URL from run evidence.","Post a Linear AgentActivity response with the PR link and concise change summary.","End the session with `completed`, or `completed_no_changes` if no PR was needed."],"mustNot":["Claim a PR was opened without a URL in evidence.","Leave the Linear session open after terminal completion.","Ignore auto-fix exhaustion or failed run terminal states."],"humanReviewRequired":true},"tags":["linear","cloud","pr"]} +{"id":"surfaces-ingress.provider-guidance-no-invented-flows","suite":"surfaces-ingress","executor":"manual","kind":"regression","input":{"message":"A CLI user selects Cloud mode and needs Google and GitHub setup guidance."},"expected":{"maxToolCalls":0,"must":["Show the Google command exactly as `npx agent-relay cloud connect google`.","Point GitHub setup to the AgentWorkforce Cloud dashboard and Nango-backed integration flow.","Keep Cloud and local as co-equal choices with an 
explicit local alternative."],"mustNot":["Invent `npx ricky connect github`.","Invent an unaudited dashboard URL.","Require web or Slack onboarding before CLI can be useful."],"humanReviewRequired":true},"tags":["cli","onboarding","cloud"]} diff --git a/evals/suites/surfaces-ingress/cases.md b/evals/suites/surfaces-ingress/cases.md new file mode 100644 index 00000000..0eb5d32c --- /dev/null +++ b/evals/suites/surfaces-ingress/cases.md @@ -0,0 +1,181 @@ +# Surfaces and Ingress Cases + +These cases come from the surfaces/ingress architecture, CLI onboarding spec, +Cloud runtime execution spec, Linear integration spec, and MCP/Web/Slack +contracts. + +## surfaces-ingress.co-equal-surfaces +Executor: manual +Kind: capability +Tags: surfaces, ingress +Human Review: true + +### Message +Design a new Ricky Slack surface for workflow debugging. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Treat Slack as a co-equal product surface, not a wrapper around CLI. +- Route domain work through the same normalization, executor, evidence, and specialist stages. +- Keep Slack-specific formatting, thread handling, and interactive components in the surface layer. + +### Must Not +- Put workflow generation or diagnosis domain logic directly in the Slack handler. +- Degrade Slack to a developer shortcut with weaker routing than CLI. +- Skip signature verification, dedup, or thread handling when the surface is implemented. + +## surfaces-ingress.normalizer-is-only-translation-boundary +Executor: manual +Kind: regression +Tags: surfaces, normalization +Human Review: true + +### Message +Add a new web handoff type that submits a workflow spec and mode. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Add a handoff variant that normalizes into `LocalInvocationRequest` or `CloudGenerateRequest`. +- Keep auth/session validation in the web surface before normalization. +- Ensure downstream code does not need to know the request came from web. 
+ +### Must Not +- Short-circuit directly from the web handler to executors. +- Create a parallel domain model for web requests. +- Make the web surface the owner of local or Cloud routing semantics. + +## surfaces-ingress.mcp-claude-context-is-metadata +Executor: manual +Kind: regression +Tags: surfaces, mcp, claude +Human Review: true + +### Message +A Claude session hands Ricky a workflow spec plus conversation context and asks Ricky to determine whether to generate, debug, restart, analyze, or coordinate. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Normalize the tool call through the MCP/Claude handoff path. +- Treat conversation context as advisory metadata. +- Make the spec itself sufficient for routing whenever possible. + +### Must Not +- Require the user to rewrite the spec manually as a workflow. +- Put domain routing logic inside the MCP tool handler itself. +- Ignore provided mode, workflow path, or prior decisions metadata. + +## surfaces-ingress.cloud-api-versioning +Executor: manual +Kind: regression +Tags: cloud, api, compatibility +Human Review: true + +### Message +Change the Cloud Ricky generate API response to include a new field and change one existing error code name. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Allow additive response fields within `/api/v1/ricky/...`. +- Reject renaming or changing the meaning of an existing error code inside the same version. +- Require a new API version for breaking request/response changes. + +### Must Not +- Remove or rename existing response fields in v1. +- Add a new required request field within v1. +- Treat version transitions as accidental or unplanned. + +## surfaces-ingress.cloud-run-json-is-single-object +Executor: manual +Kind: regression +Tags: cloud, cli, json +Human Review: true + +### Message +Implement `ricky run workflows/foo.ts --cloud --json`. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Return exactly one well-formed JSON object on stdout. 
+- Include `runReceipt.runId` on success or an actionable `error` object on missing or invalid Cloud auth. +- Suppress live tail, status lines, and human event text in JSON mode. + +### Must Not +- Silently fall back to a Cloud stub when authenticated execution is rejected. +- Mix human-readable progress lines with JSON output. +- Upload large artifacts inline beyond the documented threshold without a clear error. + +## surfaces-ingress.linear-readiness-fail-fast +Executor: manual +Kind: capability +Tags: linear, cloud, readiness +Human Review: true + +### Message +A Linear user mentions Ricky on an issue, but GitHub app installation is missing and the user has no connected agents. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Run readiness checks before workflow generation. +- First report the missing GitHub app install with a connect link and end the session awaiting install. +- Avoid generating or launching a workflow until required readiness passes. + +### Must Not +- Generate a workflow before checking GitHub and connected-agent readiness. +- Invent a Ricky-specific GitHub auth flow. +- Post multiple noisy AgentActivity responses for the same readiness blocker. + +## surfaces-ingress.linear-pr-link-completion +Executor: manual +Kind: capability +Tags: linear, cloud, pr +Human Review: true + +### Message +Ricky completes a Linear-triggered Cloud workflow that opened a GitHub PR. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Capture the PR URL from run evidence. +- Post a Linear AgentActivity response with the PR link and concise change summary. +- End the session with `completed`, or `completed_no_changes` if no PR was needed. + +### Must Not +- Claim a PR was opened without a URL in evidence. +- Leave the Linear session open after terminal completion. +- Ignore auto-fix exhaustion or failed run terminal states. 
+ +## surfaces-ingress.provider-guidance-no-invented-flows +Executor: manual +Kind: regression +Tags: cli, onboarding, cloud +Human Review: true + +### Message +A CLI user selects Cloud mode and needs Google and GitHub setup guidance. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Show the Google command exactly as `npx agent-relay cloud connect google`. +- Point GitHub setup to the AgentWorkforce Cloud dashboard and Nango-backed integration flow. +- Keep Cloud and local as co-equal choices with an explicit local alternative. + +### Must Not +- Invent `npx ricky connect github`. +- Invent an unaudited dashboard URL. +- Require web or Slack onboarding before CLI can be useful. diff --git a/evals/suites/surfaces-ingress/rubric.md b/evals/suites/surfaces-ingress/rubric.md new file mode 100644 index 00000000..e2b9afab --- /dev/null +++ b/evals/suites/surfaces-ingress/rubric.md @@ -0,0 +1,17 @@ +# Surfaces and Ingress Rubric + +Use this suite for cross-surface behavior, Cloud API compatibility, MCP/Claude +handoffs, Linear, Slack, Web, and CLI provider guidance. + +## Human Review Questions + +1. Does the surface converge on the shared Ricky domain model? +2. Are auth, validation, formatting, and domain responsibilities separated? +3. Does the response preserve mode and provider truth? +4. Are Cloud API compatibility rules respected? +5. Is user-facing guidance concrete without inventing flows? + +## Suggested Pass Bar + +Pass when the surface-specific work is thin, truthful, and routed through Ricky's +shared product contract. diff --git a/evals/suites/workflow-authoring/cases.jsonl b/evals/suites/workflow-authoring/cases.jsonl new file mode 100644 index 00000000..50dbaafe --- /dev/null +++ b/evals/suites/workflow-authoring/cases.jsonl @@ -0,0 +1,12 @@ +# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md. +# Do not edit this file directly; edit cases.md in this suite instead. 
+{"id":"workflow-authoring.deterministic-gates","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Generate a Ricky workflow that updates a TypeScript package and proves it works before final signoff."},"expected":{"maxToolCalls":0,"must":["Include deterministic verification gates after every file-editing step, preferably `exit_code`, `file_exists`, or scoped diff checks.","Use a soft-gate, fix, hard-gate loop for serious implementation work.","Include a final signoff artifact under `.workflow-artifacts/`."],"mustNot":["Treat typecheck or compile alone as sufficient proof for user-facing behavior.","Use broad repo-wide `git diff --quiet` as the only change-detection gate.","Mark work complete without a review of the fixed state."],"humanReviewRequired":true},"tags":["workflow-authoring","gates","local"]} +{"id":"workflow-authoring.distinct-reviewer","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Write a workflow that has Codex generate a convention update and then review it."},"expected":{"maxToolCalls":0,"must":["Assign a reviewer agent distinct from the writer when possible.","Persist significant review artifacts under `.workflow-artifacts/`.","Keep convention-only edits scoped to the declared convention files."],"mustNot":["Let the same agent both write and rubber-stamp the change without an explicit reason.","Skip deterministic file-existence, grep, symlink, or scoped change-detection checks.","Edit unrelated package metadata or generated workflows for a convention-only request."],"humanReviewRequired":true},"tags":["workflow-authoring","review"]} +{"id":"workflow-authoring.no-silent-mode-fallback","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Create a workflow for a user who asked to run in Cloud, but Cloud auth is missing."},"expected":{"maxToolCalls":0,"must":["Detect and report the missing Cloud readiness or auth condition before any 
expensive generation or run step.","Ask for an explicit user decision before switching to local/BYOH execution.","Preserve the requested execution mode in the workflow context and user-facing summary."],"mustNot":["Silently fall back from Cloud to local.","Claim a provider, account, credential, or integration is connected without a deterministic check.","Hide mode changes inside generic \"auto\" wording."],"humanReviewRequired":true},"tags":["workflow-authoring","local","cloud"]} +{"id":"workflow-authoring.agent-assistant-boundary","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Update Ricky to reuse a new Agent Assistant primitive while preserving Ricky-owned local execution behavior."},"expected":{"maxToolCalls":0,"must":["Reuse the shared Agent Assistant package for neutral assistant/runtime mechanics where appropriate.","State the Ricky-owned behavior that must remain local, including workflow generation, LocalResponse, blocker taxonomy, recovery wording, and evidence semantics.","Add proof that the shared primitive is exercised in a real Ricky path, not only imported or documented."],"mustNot":["Move product-specific Ricky execution contracts into Agent Assistant without an explicit proof boundary.","Overclaim broad Agent Assistant adoption from a narrow adapter change.","Replace Ricky's local blocker and recovery contract with generic assistant output."],"humanReviewRequired":true},"tags":["workflow-authoring","agent-assistant","boundary"]} +{"id":"workflow-authoring.evidence-trail","suite":"workflow-authoring","executor":"manual","kind":"capability","input":{"message":"Design a workflow that watches a long-running workflow, diagnoses a failure, attempts a safe repair, and reports the outcome."},"expected":{"maxToolCalls":0,"must":["Preserve an evidence trail that names commands, artifacts, failed steps, log locations, assertions, and side effects.","Distinguish successful repair, actionable blocker, unsupported 
condition, and unrecoverable error.","Include resumability guidance such as failed step, previous run id, or exact rerun command when available."],"mustNot":["Claim the workflow succeeded when a blocker or missing dependency stopped execution.","Drop log paths or side-effect summaries from the final outcome.","Retry destructive or credentialed actions without explicit authorization."],"humanReviewRequired":true},"tags":["workflow-authoring","evidence"]} +{"id":"workflow-authoring.wave-placement-and-naming","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Author a non-trivial Ricky workflow for a new product milestone."},"expected":{"maxToolCalls":0,"must":["Place the workflow in the correct `workflows/wave-/` folder.","Use a monotonically increasing numeric prefix and an outcome-based slug.","Use a dedicated `wf-ricky-*` channel rather than `general`."],"mustNot":["Put a significant wave workflow at the top level without an explicit shared/meta reason.","Use vague names like `workflow-improvements.ts`.","Treat wave folders as arbitrary batches instead of product/runtime milestones."],"humanReviewRequired":true},"tags":["workflow-authoring","standards","structure"]} +{"id":"workflow-authoring.runtime-wrapper-shape","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Write a serious long-running Ricky workflow with multiple agent steps."},"expected":{"maxToolCalls":0,"must":["Import workflow APIs from `@agent-relay/sdk/workflows`.","Wrap execution in `async function main()` and call `main().catch(...)` with explicit error reporting and nonzero exit.","End workflow execution with `.run({ cwd: process.cwd() })`.","Set `.channel()`, `.pattern()`, `.maxConcurrency()`, `.timeout()`, and `.onError()` explicitly."],"mustNot":["Rely on implicit runtime defaults for a serious workflow.","Omit explicit error handling around `main()`.","Use `general` as a fallback 
channel."],"humanReviewRequired":true},"tags":["workflow-authoring","runtime-shape"]} +{"id":"workflow-authoring.env-loading-before-run","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Create a Ricky workflow that requires `OPENROUTER_API_KEY` and `GITHUB_TOKEN`."},"expected":{"maxToolCalls":0,"must":["Load `.env.local` and `.env` before `.run(...)` without overwriting exported values.","Fail fast with `MISSING_ENV_VAR: ` before expensive agent steps.","State required environment variables in the workflow contract."],"mustNot":["Discover missing credentials only after long-running agent work.","Overwrite environment values already exported by the operator.","Hide credential requirements in agent prose only."],"humanReviewRequired":true},"tags":["workflow-authoring","environment"]} +{"id":"workflow-authoring.github-pr-primitive","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Author a workflow where creating a GitHub PR is in scope."},"expected":{"maxToolCalls":0,"must":["Use `@agent-relay/github-primitive` for PR creation.","Use `createGitHubStep` with `action: 'createPR'` for declarative workflow steps, or `GitHubClient.create({ runtime: 'auto' })` with `client.createPR(...)` for imperative use.","Document the commit and PR boundary in the workflow contract."],"mustNot":["Shell out to `gh pr create` from an agent step.","Create or push PRs when the workflow explicitly says PR creation is out of scope.","Hide the expected branch naming pattern."],"humanReviewRequired":true},"tags":["workflow-authoring","github","pr"]} +{"id":"workflow-authoring.generated-workflow-template-read","suite":"workflow-authoring","executor":"manual","kind":"regression","input":{"message":"Generate several Ricky workflows from a meta-workflow."},"expected":{"maxToolCalls":0,"must":["Read `docs/workflows/WORKFLOW_STANDARDS.md`, `workflows/shared/WORKFLOW_AUTHORING_RULES.md`, relevant specs, and 
`workflows/meta/spec/generated-workflow-template.md` at runtime.","Materialize audit artifacts under `.workflow-artifacts//`.","Dry-run or structurally validate generated workflows before signoff."],"mustNot":["Rely only on ambient agent context for workflow standards.","Claim generated workflows are ready before dry-run or structural sanity checks.","Hand-tune generated workflows when the template/spec/rules should be fixed upstream."],"humanReviewRequired":true},"tags":["workflow-authoring","generation"]} diff --git a/evals/suites/workflow-authoring/cases.md b/evals/suites/workflow-authoring/cases.md new file mode 100644 index 00000000..d1c08fd0 --- /dev/null +++ b/evals/suites/workflow-authoring/cases.md @@ -0,0 +1,226 @@ +# Workflow Authoring Cases + +These cases are intentionally human-review heavy. They capture the behavior Ricky +should preserve when it plans, generates, reviews, or repairs Agent Relay +workflows. + +## workflow-authoring.deterministic-gates +Executor: manual +Kind: capability +Tags: workflow-authoring, gates, local +Human Review: true + +### Message +Generate a Ricky workflow that updates a TypeScript package and proves it works before final signoff. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Include deterministic verification gates after every file-editing step, preferably `exit_code`, `file_exists`, or scoped diff checks. +- Use a soft-gate, fix, hard-gate loop for serious implementation work. +- Include a final signoff artifact under `.workflow-artifacts/`. + +### Must Not +- Treat typecheck or compile alone as sufficient proof for user-facing behavior. +- Use broad repo-wide `git diff --quiet` as the only change-detection gate. +- Mark work complete without a review of the fixed state. + +## workflow-authoring.distinct-reviewer +Executor: manual +Kind: regression +Tags: workflow-authoring, review +Human Review: true + +### Message +Write a workflow that has Codex generate a convention update and then review it. 
+ +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Assign a reviewer agent distinct from the writer when possible. +- Persist significant review artifacts under `.workflow-artifacts/`. +- Keep convention-only edits scoped to the declared convention files. + +### Must Not +- Let the same agent both write and rubber-stamp the change without an explicit reason. +- Skip deterministic file-existence, grep, symlink, or scoped change-detection checks. +- Edit unrelated package metadata or generated workflows for a convention-only request. + +## workflow-authoring.no-silent-mode-fallback +Executor: manual +Kind: regression +Tags: workflow-authoring, local, cloud +Human Review: true + +### Message +Create a workflow for a user who asked to run in Cloud, but Cloud auth is missing. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Detect and report the missing Cloud readiness or auth condition before any expensive generation or run step. +- Ask for an explicit user decision before switching to local/BYOH execution. +- Preserve the requested execution mode in the workflow context and user-facing summary. + +### Must Not +- Silently fall back from Cloud to local. +- Claim a provider, account, credential, or integration is connected without a deterministic check. +- Hide mode changes inside generic "auto" wording. + +## workflow-authoring.agent-assistant-boundary +Executor: manual +Kind: capability +Tags: workflow-authoring, agent-assistant, boundary +Human Review: true + +### Message +Update Ricky to reuse a new Agent Assistant primitive while preserving Ricky-owned local execution behavior. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Reuse the shared Agent Assistant package for neutral assistant/runtime mechanics where appropriate. +- State the Ricky-owned behavior that must remain local, including workflow generation, LocalResponse, blocker taxonomy, recovery wording, and evidence semantics. 
+- Add proof that the shared primitive is exercised in a real Ricky path, not only imported or documented. + +### Must Not +- Move product-specific Ricky execution contracts into Agent Assistant without an explicit proof boundary. +- Overclaim broad Agent Assistant adoption from a narrow adapter change. +- Replace Ricky's local blocker and recovery contract with generic assistant output. + +## workflow-authoring.evidence-trail +Executor: manual +Kind: capability +Tags: workflow-authoring, evidence +Human Review: true + +### Message +Design a workflow that watches a long-running workflow, diagnoses a failure, attempts a safe repair, and reports the outcome. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Preserve an evidence trail that names commands, artifacts, failed steps, log locations, assertions, and side effects. +- Distinguish successful repair, actionable blocker, unsupported condition, and unrecoverable error. +- Include resumability guidance such as failed step, previous run id, or exact rerun command when available. + +### Must Not +- Claim the workflow succeeded when a blocker or missing dependency stopped execution. +- Drop log paths or side-effect summaries from the final outcome. +- Retry destructive or credentialed actions without explicit authorization. + +## workflow-authoring.wave-placement-and-naming +Executor: manual +Kind: regression +Tags: workflow-authoring, standards, structure +Human Review: true + +### Message +Author a non-trivial Ricky workflow for a new product milestone. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Place the workflow in the correct `workflows/wave<N>-<name>/` folder. +- Use a monotonically increasing numeric prefix and an outcome-based slug. +- Use a dedicated `wf-ricky-*` channel rather than `general`. + +### Must Not +- Put a significant wave workflow at the top level without an explicit shared/meta reason. +- Use vague names like `workflow-improvements.ts`.
+- Treat wave folders as arbitrary batches instead of product/runtime milestones. + +## workflow-authoring.runtime-wrapper-shape +Executor: manual +Kind: regression +Tags: workflow-authoring, runtime-shape +Human Review: true + +### Message +Write a serious long-running Ricky workflow with multiple agent steps. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Import workflow APIs from `@agent-relay/sdk/workflows`. +- Wrap execution in `async function main()` and call `main().catch(...)` with explicit error reporting and nonzero exit. +- End workflow execution with `.run({ cwd: process.cwd() })`. +- Set `.channel()`, `.pattern()`, `.maxConcurrency()`, `.timeout()`, and `.onError()` explicitly. + +### Must Not +- Rely on implicit runtime defaults for a serious workflow. +- Omit explicit error handling around `main()`. +- Use `general` as a fallback channel. + +## workflow-authoring.env-loading-before-run +Executor: manual +Kind: regression +Tags: workflow-authoring, environment +Human Review: true + +### Message +Create a Ricky workflow that requires `OPENROUTER_API_KEY` and `GITHUB_TOKEN`. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Load `.env.local` and `.env` before `.run(...)` without overwriting exported values. +- Fail fast with `MISSING_ENV_VAR: <name>` before expensive agent steps. +- State required environment variables in the workflow contract. + +### Must Not +- Discover missing credentials only after long-running agent work. +- Overwrite environment values already exported by the operator. +- Hide credential requirements in agent prose only. + +## workflow-authoring.github-pr-primitive +Executor: manual +Kind: regression +Tags: workflow-authoring, github, pr +Human Review: true + +### Message +Author a workflow where creating a GitHub PR is in scope. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Use `@agent-relay/github-primitive` for PR creation.
+- Use `createGitHubStep` with `action: 'createPR'` for declarative workflow steps, or `GitHubClient.create({ runtime: 'auto' })` with `client.createPR(...)` for imperative use. +- Document the commit and PR boundary in the workflow contract. + +### Must Not +- Shell out to `gh pr create` from an agent step. +- Create or push PRs when the workflow explicitly says PR creation is out of scope. +- Hide the expected branch naming pattern. + +## workflow-authoring.generated-workflow-template-read +Executor: manual +Kind: regression +Tags: workflow-authoring, generation +Human Review: true + +### Message +Generate several Ricky workflows from a meta-workflow. + +### Deterministic Checks +maxToolCalls: 0 + +### Must +- Read `docs/workflows/WORKFLOW_STANDARDS.md`, `workflows/shared/WORKFLOW_AUTHORING_RULES.md`, relevant specs, and `workflows/meta/spec/generated-workflow-template.md` at runtime. +- Materialize audit artifacts under `.workflow-artifacts/<workflow-name>/`. +- Dry-run or structurally validate generated workflows before signoff. + +### Must Not +- Rely only on ambient agent context for workflow standards. +- Claim generated workflows are ready before dry-run or structural sanity checks. +- Hand-tune generated workflows when the template/spec/rules should be fixed upstream. diff --git a/evals/suites/workflow-authoring/rubric.md b/evals/suites/workflow-authoring/rubric.md new file mode 100644 index 00000000..3de83ab2 --- /dev/null +++ b/evals/suites/workflow-authoring/rubric.md @@ -0,0 +1,21 @@ +# Workflow Authoring Rubric + +Use this suite for Ricky behavior that requires human judgment about plans, +generated workflows, repair loops, and user-facing proof. + +## Human Review Questions + +1. Does the output preserve Ricky's workflow-native product identity instead of + becoming generic assistant prose? +2. Does it include deterministic validation and review gates proportional to the + blast radius? +3.
Does it distinguish local/BYOH, Cloud, and Agent Assistant boundaries + truthfully? +4. Does it surface blockers honestly instead of implying completion? +5. Would a senior Agent Relay workflow author be comfortable running or handing + off the workflow? + +## Suggested Pass Bar + +Pass only when the output is specific enough to execute and review, protects +Ricky's local execution contract, and leaves a durable evidence trail. diff --git a/package-lock.json b/package-lock.json index 466c5881..6bccc348 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "0.1.36", "license": "Apache-2.0", "dependencies": { - "@agent-assistant/turn-context": "^0.4.23", + "@agent-assistant/turn-context": "^0.4.31", "@agent-relay/cloud": "^6.0.6", "@agent-relay/sdk": "^6.0.6", "@agentworkforce/harness-kit": "^0.6.1", @@ -22,6 +22,7 @@ "ricky": "dist/ricky.js" }, "devDependencies": { + "@agent-assistant/telemetry": "^0.4.31", "@types/node": "^24.5.2", "esbuild": "^0.28.0", "tsx": "^4.21.0", @@ -192,6 +193,533 @@ "@agent-relay/memory": "^4.0.23" } }, + "node_modules/@agent-assistant/telemetry": { + "version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/telemetry/-/telemetry-0.4.31.tgz", + "integrity": "sha512-rIUtOmFowlEgWFsWLE9HBxsFfOcA0dbIvDkTfxbgBg3BF6iBhhbAPKn2VnpEOY2RWAO5K/XCeSKwDuSb/AbAfw==", + "dev": true, + "dependencies": { + "@agent-assistant/harness": "^0.10.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/harness": { + "version": "0.10.4", + "resolved": "https://registry.npmjs.org/@agent-assistant/harness/-/harness-0.10.4.tgz", + "integrity": "sha512-qCZ5baRh4h+xSVrIc+Uck/N24eyMRHD5A86wmy+QD1uwu03q0bcOMNL9qfzewqrsbmo5MGq8o8St5FNCtwQPrg==", + "dev": true, + "dependencies": { + "@agent-assistant/connectivity": "^0.2.6", + "@agent-assistant/coordination": "^0.2.6", + "@agent-assistant/core": "^0.2.0", + "@agent-assistant/memory": "^0.4.0", + "@agent-assistant/traits": "^0.2.0", + 
"@agent-assistant/turn-context": "^0.3.4", + "@agent-assistant/vfs": "^0.2.23", + "@agent-relay/sdk": "^4.0.22", + "zod": "^3.25.0" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/memory": { + "version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/memory/-/memory-0.4.31.tgz", + "integrity": "sha512-+CXOfOvYJukAc/TyQIeqNvFgyhFkyZV6AYj7y4TZAMYpR5/1kqzzN/4AemOegzBct9mg0UBumTQaw3JnNobuxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@agent-relay/memory": "^6.0.9", + "supermemory": "^4.21.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context": { + "version": "0.3.21", + "resolved": "https://registry.npmjs.org/@agent-assistant/turn-context/-/turn-context-0.3.21.tgz", + "integrity": "sha512-QBM/pgl2Z9L95nlnI8P5U3w4ivDG1IhV9UNle+cz0edEDcfITmzTuxqTwSkcjt3ODHCBssHI5XF6dN6f0g2ECQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@agent-assistant/harness": "^0.4.0 || ^0.6.0", + "@agent-assistant/memory": "^0.2.0", + "@agent-assistant/traits": "^0.2.0" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-assistant/harness": { + "version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/harness/-/harness-0.4.31.tgz", + "integrity": "sha512-ZiirhB2V5w1CpE1LDXWmkrt0OhCUvKgJlSCPvAO/bRQuFYIiQoDKRC77dfHUqibKTUu5lft5J4blkclT5QUXEg==", + "dev": true, + "dependencies": { + "@agent-assistant/connectivity": "^0.2.6", + "@agent-assistant/coordination": "^0.2.6", + "@agent-assistant/core": "^0.2.0", + "@agent-assistant/memory": "^0.4.0", + "@agent-assistant/traits": "^0.2.0", + "@agent-assistant/turn-context": "^0.3.4", + "@agent-assistant/vfs": "^0.2.23", + "@agent-relay/sdk": "^6.0.9", + "zod": "^3.25.0" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-assistant/harness/node_modules/@agent-assistant/memory": { + 
"version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/memory/-/memory-0.4.31.tgz", + "integrity": "sha512-+CXOfOvYJukAc/TyQIeqNvFgyhFkyZV6AYj7y4TZAMYpR5/1kqzzN/4AemOegzBct9mg0UBumTQaw3JnNobuxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@agent-relay/memory": "^6.0.9", + "supermemory": "^4.21.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-assistant/memory": { + "version": "0.2.24", + "resolved": "https://registry.npmjs.org/@agent-assistant/memory/-/memory-0.2.24.tgz", + "integrity": "sha512-Cjhwq5MsBSFPBvP1yebcY4pZf/+qN2ZQbvCgl78+2gi07Ul8AyYyuCfjc72EMnEMrklrSCS0s/Uy3EHFbZJPpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@agent-relay/memory": "^4.0.23" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-assistant/memory/node_modules/@agent-relay/memory": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/memory/-/memory-4.0.40.tgz", + "integrity": "sha512-W/pUIMq4FrxmVqn73mUoEz4mEyBrmrqrkW2uNWf/cxRQZoMki4D9dNCO6QiTVNHUNxFdXhMuS8fxLP8Ht3NEdg==", + "dev": true, + "dependencies": { + "@agent-relay/hooks": "4.0.40" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/hooks": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/hooks/-/hooks-4.0.40.tgz", + "integrity": "sha512-WVbmXtJV3dHsKXs7zVOMpjeuEGBBWt0DCkqLuANKQMAaR2FkrhUbK0XZWL3lz2IHDlByM3tu9y4KgzbSoKu5hA==", + "dev": true, + "dependencies": { + "@agent-relay/config": "4.0.40", + "@agent-relay/sdk": "4.0.40", + "@agent-relay/trajectory": "4.0.40" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/hooks/node_modules/@agent-relay/config": { + "version": "4.0.40", + "resolved": 
"https://registry.npmjs.org/@agent-relay/config/-/config-4.0.40.tgz", + "integrity": "sha512-SEXTOTlxkC2kss17YzvAR9bmwMIBclurjI0O2k5xbxxqK/dH3iMM4sJpXXqat1iug95Lrp2Vp/hQJt6xOGeI9g==", + "dev": true, + "dependencies": { + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/hooks/node_modules/@agent-relay/sdk": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/sdk/-/sdk-4.0.40.tgz", + "integrity": "sha512-/65zrEALDUOPU96SBMBl462r6J5w/vQyshR0OV9KnLfzp5eRBhgv9p3beeFDgq6WuLto/A28U5zgnTyST3/n4g==", + "dev": true, + "dependencies": { + "@agent-relay/config": "4.0.40", + "@relaycast/sdk": "^1.1.0", + "@relayfile/sdk": ">=0.1.2 <1", + "@sinclair/typebox": "^0.34.48", + "agent-trajectories": "^0.5.4", + "chalk": "^4.1.2", + "ignore": "^7.0.5", + "listr2": "^10.2.1", + "tar": "^7.5.10", + "ws": "^8.18.3", + "yaml": "^2.7.0" + }, + "peerDependencies": { + "@agent-relay/credential-proxy": "4.0.40", + "@anthropic-ai/claude-agent-sdk": ">=0.1.0", + "@google/adk": ">=0.5.0", + "@langchain/langgraph": ">=1.2.0", + "@mariozechner/pi-coding-agent": ">=0.50.0", + "@openai/agents": ">=0.7.0", + "ai": ">=5.0.0", + "crewai": ">=1.0.0" + }, + "peerDependenciesMeta": { + "@agent-relay/credential-proxy": { + "optional": true + }, + "@anthropic-ai/claude-agent-sdk": { + "optional": true + }, + "@google/adk": { + "optional": true + }, + "@langchain/langgraph": { + "optional": true + }, + "@mariozechner/pi-coding-agent": { + "optional": true + }, + "@openai/agents": { + "optional": true + }, + "ai": { + "optional": true + }, + "crewai": { + "optional": true + } + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/sdk": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/sdk/-/sdk-6.0.11.tgz", + "integrity": 
"sha512-o0uu6+8jQ9B+Qh9+dHb9j3/lYTWajVMYgb9GiLeDlaS6NNJ5AHV+epHDoZp3i+VbdhvSaDzow8FEd95bat5LTQ==", + "dev": true, + "dependencies": { + "@agent-relay/config": "6.0.11", + "@agent-relay/github-primitive": "6.0.11", + "@agent-relay/workflow-types": "6.0.11", + "@agentworkforce/harness-kit": "^0.11.0", + "@agentworkforce/workload-router": "^0.11.0", + "@relaycast/sdk": "^1.1.0", + "@relayfile/sdk": ">=0.1.2 <1", + "@sinclair/typebox": "^0.34.48", + "agent-trajectories": "^0.5.4", + "chalk": "^4.1.2", + "ignore": "^7.0.5", + "listr2": "^10.2.1", + "tar": "^7.5.10", + "ws": "^8.18.3", + "yaml": "^2.7.0" + }, + "optionalDependencies": { + "@agent-relay/broker-darwin-arm64": "6.0.11", + "@agent-relay/broker-darwin-x64": "6.0.11", + "@agent-relay/broker-linux-arm64": "6.0.11", + "@agent-relay/broker-linux-x64": "6.0.11", + "@agent-relay/broker-win32-x64": "6.0.11" + }, + "peerDependencies": { + "@agent-relay/credential-proxy": "6.0.11", + "@anthropic-ai/claude-agent-sdk": ">=0.1.0", + "@google/adk": ">=0.5.0", + "@langchain/langgraph": ">=1.2.0", + "@mariozechner/pi-coding-agent": ">=0.50.0", + "@openai/agents": ">=0.7.0", + "ai": ">=5.0.0", + "crewai": ">=1.0.0" + }, + "peerDependenciesMeta": { + "@agent-relay/credential-proxy": { + "optional": true + }, + "@anthropic-ai/claude-agent-sdk": { + "optional": true + }, + "@google/adk": { + "optional": true + }, + "@langchain/langgraph": { + "optional": true + }, + "@mariozechner/pi-coding-agent": { + "optional": true + }, + "@openai/agents": { + "optional": true + }, + "ai": { + "optional": true + }, + "crewai": { + "optional": true + } + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/trajectory": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/trajectory/-/trajectory-4.0.40.tgz", + "integrity": "sha512-+h0aRuT1Gmp6iTXACN+qcdh2ntIbZq6Bk55vTa7GGtyVUldLS5O2XSKDHUWn/gSiThUAlySApr4ZsG9lX1Jbkg==", + "dev": true, + 
"dependencies": { + "@agent-relay/config": "4.0.40" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-assistant/turn-context/node_modules/@agent-relay/trajectory/node_modules/@agent-relay/config": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/config/-/config-4.0.40.tgz", + "integrity": "sha512-SEXTOTlxkC2kss17YzvAR9bmwMIBclurjI0O2k5xbxxqK/dH3iMM4sJpXXqat1iug95Lrp2Vp/hQJt6xOGeI9g==", + "dev": true, + "dependencies": { + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/broker-darwin-arm64": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/broker-darwin-arm64/-/broker-darwin-arm64-6.0.11.tgz", + "integrity": "sha512-RN4nEbHuvuq5UsqcONsS02m8rzVQjXlBwTD8O/MkL3tReV9G3KTtwBTsxQWfndHI0Gqpk0LURMtFBm7fHZqgLA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/broker-darwin-x64": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/broker-darwin-x64/-/broker-darwin-x64-6.0.11.tgz", + "integrity": "sha512-ov6KyMTRi1H4eJanSL7/YeYrT7i4P557xq0rmcrNJR2hrJLRvvlxpIMkp3d6cN5pJ4Nbk1u1iru/uznI1dwgoQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/broker-linux-arm64": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/broker-linux-arm64/-/broker-linux-arm64-6.0.11.tgz", + "integrity": "sha512-Hp9zG9/mbsxHjlm2NqtrlbNjC9BgWE0C+o8MdULu5LJXE3UhIJ/NhZqQVOWRo9KqtxhVJ38/99JeUumsQjBh/g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/broker-linux-x64": { + "version": "6.0.11", + 
"resolved": "https://registry.npmjs.org/@agent-relay/broker-linux-x64/-/broker-linux-x64-6.0.11.tgz", + "integrity": "sha512-z4oXhgGyOmV2i0gmm3YMCBQyeMWPzN31lkThMMiGGuafGnMHjxoEfPd5ycWCerdi882AAiXMT2g6TFegu2SulQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/broker-win32-x64": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/broker-win32-x64/-/broker-win32-x64-6.0.11.tgz", + "integrity": "sha512-nZAGvieI7JYmmWZJbTucdnU0UKgVBH1/hNSkVmixE9sNzP6Tpd5DcSgxsfVQL8f9QtWCSWkobpSULbtJgYhm1Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/config": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/config/-/config-6.0.11.tgz", + "integrity": "sha512-I8pzIJ2njyLdCrynBCtkeGBD2cJSsDIkBpUVuyq2raALxSsuL9mjv6SQp8hiWwlXcSzhpyezQVS+QMMDNnwdPQ==", + "dev": true, + "dependencies": { + "zod": "^3.23.8", + "zod-to-json-schema": "^3.23.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/github-primitive": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/github-primitive/-/github-primitive-6.0.11.tgz", + "integrity": "sha512-NastJVdf8yiTfKRZYdFgqiLKL2rsTzCUrb5ct/EatySzM+7BUPX1p0VYKduabobwzdghBR6hvxOWrnFqhRAehA==", + "dev": true, + "dependencies": { + "@agent-relay/workflow-types": "6.0.11" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/hooks": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/hooks/-/hooks-6.0.11.tgz", + "integrity": "sha512-WsY0z+LcWVZVCeKuQx4F7TkCd7V0c51WvPuQNStcvygGmhJ3s4bfexzgS9A7fLVhhusBFvdoHy7NmIoU5H2pMw==", + "dev": true, + "dependencies": { + "@agent-relay/config": "6.0.11", + "@agent-relay/sdk": "6.0.11", + 
"@agent-relay/trajectory": "6.0.11" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/hooks/node_modules/@agent-relay/sdk": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/sdk/-/sdk-6.0.11.tgz", + "integrity": "sha512-o0uu6+8jQ9B+Qh9+dHb9j3/lYTWajVMYgb9GiLeDlaS6NNJ5AHV+epHDoZp3i+VbdhvSaDzow8FEd95bat5LTQ==", + "dev": true, + "dependencies": { + "@agent-relay/config": "6.0.11", + "@agent-relay/github-primitive": "6.0.11", + "@agent-relay/workflow-types": "6.0.11", + "@agentworkforce/harness-kit": "^0.11.0", + "@agentworkforce/workload-router": "^0.11.0", + "@relaycast/sdk": "^1.1.0", + "@relayfile/sdk": ">=0.1.2 <1", + "@sinclair/typebox": "^0.34.48", + "agent-trajectories": "^0.5.4", + "chalk": "^4.1.2", + "ignore": "^7.0.5", + "listr2": "^10.2.1", + "tar": "^7.5.10", + "ws": "^8.18.3", + "yaml": "^2.7.0" + }, + "optionalDependencies": { + "@agent-relay/broker-darwin-arm64": "6.0.11", + "@agent-relay/broker-darwin-x64": "6.0.11", + "@agent-relay/broker-linux-arm64": "6.0.11", + "@agent-relay/broker-linux-x64": "6.0.11", + "@agent-relay/broker-win32-x64": "6.0.11" + }, + "peerDependencies": { + "@agent-relay/credential-proxy": "6.0.11", + "@anthropic-ai/claude-agent-sdk": ">=0.1.0", + "@google/adk": ">=0.5.0", + "@langchain/langgraph": ">=1.2.0", + "@mariozechner/pi-coding-agent": ">=0.50.0", + "@openai/agents": ">=0.7.0", + "ai": ">=5.0.0", + "crewai": ">=1.0.0" + }, + "peerDependenciesMeta": { + "@agent-relay/credential-proxy": { + "optional": true + }, + "@anthropic-ai/claude-agent-sdk": { + "optional": true + }, + "@google/adk": { + "optional": true + }, + "@langchain/langgraph": { + "optional": true + }, + "@mariozechner/pi-coding-agent": { + "optional": true + }, + "@openai/agents": { + "optional": true + }, + "ai": { + "optional": true + }, + "crewai": { + "optional": true + } + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/memory": { + "version": "6.0.11", + "resolved": 
"https://registry.npmjs.org/@agent-relay/memory/-/memory-6.0.11.tgz", + "integrity": "sha512-dPlezTKw/OY/9vg0zK079W7nCy89wBgTNacM5dgf+g2VW1b1+1NHzdBevDDSuhW9zfgbWyPfMehtqKUqFcREtA==", + "dev": true, + "dependencies": { + "@agent-relay/hooks": "6.0.11" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/sdk": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/sdk/-/sdk-4.0.40.tgz", + "integrity": "sha512-/65zrEALDUOPU96SBMBl462r6J5w/vQyshR0OV9KnLfzp5eRBhgv9p3beeFDgq6WuLto/A28U5zgnTyST3/n4g==", + "dev": true, + "dependencies": { + "@agent-relay/config": "4.0.40", + "@relaycast/sdk": "^1.1.0", + "@relayfile/sdk": ">=0.1.2 <1", + "@sinclair/typebox": "^0.34.48", + "agent-trajectories": "^0.5.4", + "chalk": "^4.1.2", + "ignore": "^7.0.5", + "listr2": "^10.2.1", + "tar": "^7.5.10", + "ws": "^8.18.3", + "yaml": "^2.7.0" + }, + "peerDependencies": { + "@agent-relay/credential-proxy": "4.0.40", + "@anthropic-ai/claude-agent-sdk": ">=0.1.0", + "@google/adk": ">=0.5.0", + "@langchain/langgraph": ">=1.2.0", + "@mariozechner/pi-coding-agent": ">=0.50.0", + "@openai/agents": ">=0.7.0", + "ai": ">=5.0.0", + "crewai": ">=1.0.0" + }, + "peerDependenciesMeta": { + "@agent-relay/credential-proxy": { + "optional": true + }, + "@anthropic-ai/claude-agent-sdk": { + "optional": true + }, + "@google/adk": { + "optional": true + }, + "@langchain/langgraph": { + "optional": true + }, + "@mariozechner/pi-coding-agent": { + "optional": true + }, + "@openai/agents": { + "optional": true + }, + "ai": { + "optional": true + }, + "crewai": { + "optional": true + } + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/sdk/node_modules/@agent-relay/config": { + "version": "4.0.40", + "resolved": "https://registry.npmjs.org/@agent-relay/config/-/config-4.0.40.tgz", + "integrity": "sha512-SEXTOTlxkC2kss17YzvAR9bmwMIBclurjI0O2k5xbxxqK/dH3iMM4sJpXXqat1iug95Lrp2Vp/hQJt6xOGeI9g==", + "dev": true, + "dependencies": { + "zod": 
"^3.23.8", + "zod-to-json-schema": "^3.23.1" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/trajectory": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/trajectory/-/trajectory-6.0.11.tgz", + "integrity": "sha512-n3FqiT43hdTTTt0F+bipGBjd3pTk4FdAwJeo/OM2J4NrFKhTjJxshYtAV9llcZWZBWjIO5Z+ONQcHsd6fbrHsQ==", + "dev": true, + "dependencies": { + "@agent-relay/config": "6.0.11" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agent-relay/workflow-types": { + "version": "6.0.11", + "resolved": "https://registry.npmjs.org/@agent-relay/workflow-types/-/workflow-types-6.0.11.tgz", + "integrity": "sha512-jh55ppdc4yokXhIYq2La1J9cX0PTHAnMtXBwVXHmjwSfsTWxSpuHikEsC2MFlfBX4IHSLkXCVRO0dkOVEWE2Bw==", + "dev": true + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agentworkforce/harness-kit": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@agentworkforce/harness-kit/-/harness-kit-0.11.0.tgz", + "integrity": "sha512-CtW9P0pVm0j5R+kl7OaWMkPz7akYZqJNLmQ8k1m5Ony7NIfxJKuGiTBH9kcg+6vQ7fUtnfkoa34wt3y/pEh2QQ==", + "dev": true, + "dependencies": { + "@agentworkforce/workload-router": "0.11.0" + } + }, + "node_modules/@agent-assistant/telemetry/node_modules/@agentworkforce/workload-router": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/@agentworkforce/workload-router/-/workload-router-0.11.0.tgz", + "integrity": "sha512-6Fn4oDsYeNRPe+k7hVfS3Ae3yIocNjuvscVvRswn74CzxSC1X9+1wDhQ5eCvE+S1m1ixAjYGFC9/MNwuhFwjHw==", + "dev": true + }, "node_modules/@agent-assistant/traits": { "version": "0.2.24", "resolved": "https://registry.npmjs.org/@agent-assistant/traits/-/traits-0.2.24.tgz", @@ -199,16 +727,23 @@ "license": "MIT" }, "node_modules/@agent-assistant/turn-context": { - "version": "0.4.23", - "resolved": "https://registry.npmjs.org/@agent-assistant/turn-context/-/turn-context-0.4.23.tgz", - "integrity": 
"sha512-NvZshpclTgm6b5llfkJTKKRC0ksoJxw94IeFA8Ih8/+ojfIfYl5Bwe7tZpYC1kIacQqgJc8g3ZiLEQrmrvV4Pw==", + "version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/turn-context/-/turn-context-0.4.31.tgz", + "integrity": "sha512-eYfYpJRfE1pG9eNpNcEyd4igIYu8BYjG/4DH7agWtt6GL/nR+/ox/XOUz7FXIYmeSrT9/9l0brqRuNlLYfVpjQ==", "license": "MIT", "dependencies": { "@agent-assistant/harness": "^0.4.0 || ^0.6.0", "@agent-assistant/memory": "^0.2.0", - "@agent-assistant/traits": "^0.2.0" + "@agent-assistant/traits": "^0.2.0", + "@agent-assistant/vfs": "^0.4.0 || ^0.6.0" } }, + "node_modules/@agent-assistant/turn-context/node_modules/@agent-assistant/vfs": { + "version": "0.4.31", + "resolved": "https://registry.npmjs.org/@agent-assistant/vfs/-/vfs-0.4.31.tgz", + "integrity": "sha512-nc2I7BW3CZ7QN7c6JVChEDPVgp8pwC6GYzbhhLhOd4M3pHS8puWLJ6tgukbedMmN0NzJb6PjEWbgrPBERdowWQ==", + "license": "MIT" + }, "node_modules/@agent-assistant/vfs": { "version": "0.2.24", "resolved": "https://registry.npmjs.org/@agent-assistant/vfs/-/vfs-0.2.24.tgz", @@ -4665,6 +5200,16 @@ ], "license": "MIT" }, + "node_modules/supermemory": { + "version": "4.21.1", + "resolved": "https://registry.npmjs.org/supermemory/-/supermemory-4.21.1.tgz", + "integrity": "sha512-KayOHtD94g7O+yN2qxaHEO5UIXtDl+duaKuhW7gvaraVtP1RHxFn80Pb5s5rKmqIvC+ruaARRlgMw7s/y+6LGQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "supermemory": "bin/cli" + } + }, "node_modules/supports-color": { "version": "7.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", diff --git a/package.json b/package.json index e7116fc7..9cef880d 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,12 @@ "build": "npm run bundle", "typecheck": "tsc --noEmit", "test": "npm run bundle && vitest run", + "evals:compile": "node scripts/evals/compile-ricky-evals.mjs", + "evals": "npm run evals:compile && node scripts/evals/run-ricky-evals.mjs", + "evals:opencode": "npm run evals:compile && node 
scripts/evals/run-ricky-evals.mjs --provider --executor opencode", + "evals:list": "npm run evals:compile && node scripts/evals/run-ricky-evals.mjs --list", + "evals:summary": "node scripts/evals/summarize-ricky-evals.mjs", + "evals:compare": "node scripts/evals/compare-ricky-evals.mjs", "start": "tsx src/surfaces/cli/commands/cli-main.ts", "dev": "npm start", "batch": "bash scripts/run-ricky-batch.sh", @@ -49,7 +55,7 @@ "prepack": "npm run build" }, "dependencies": { - "@agent-assistant/turn-context": "^0.4.23", + "@agent-assistant/turn-context": "^0.4.31", "@agent-relay/cloud": "^6.0.6", "@agent-relay/sdk": "^6.0.6", "@agentworkforce/harness-kit": "^0.6.1", @@ -59,6 +65,7 @@ "ssh2": "^1.17.0" }, "devDependencies": { + "@agent-assistant/telemetry": "^0.4.31", "@types/node": "^24.5.2", "esbuild": "^0.28.0", "tsx": "^4.21.0", diff --git a/scripts/evals/compare-ricky-evals.mjs b/scripts/evals/compare-ricky-evals.mjs new file mode 100644 index 00000000..4f506032 --- /dev/null +++ b/scripts/evals/compare-ricky-evals.mjs @@ -0,0 +1,74 @@ +#!/usr/bin/env node + +import { existsSync, readdirSync, readFileSync } from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../..'); +const RUNS_DIR = path.join(ROOT, '.ricky', 'evals', 'runs'); + +const runs = loadRuns(); +if (runs.length < 2) { + console.log('Need at least two Ricky eval runs to compare.'); + process.exit(0); +} + +const [after, before] = runs; +const beforeById = new Map(before.tests.map((test) => [`${test.id}#${test.trial}`, test])); +const seenBeforeKeys = new Set(); + +console.log(''); +console.log('Ricky Eval Compare'); +console.log('='.repeat(80)); +console.log(`Before: ${before.timestamp} ${before.branch} ${before.mode}`); +console.log(`After: ${after.timestamp} ${after.branch} ${after.mode}`); +console.log('-'.repeat(80)); + +let improved = 0; +let regressed = 0; +let unchanged = 0; +let 
disappeared = 0; + +for (const afterTest of after.tests) { + const key = `${afterTest.id}#${afterTest.trial}`; + const beforeTest = beforeById.get(key); + if (!beforeTest) { + console.log(`NEW ${afterTest.status.padEnd(11)} ${afterTest.id}`); + continue; + } + seenBeforeKeys.add(key); + const status = compareStatus(beforeTest.status, afterTest.status); + if (status === 'improved') improved += 1; + else if (status === 'regressed') regressed += 1; + else unchanged += 1; + const marker = status === 'improved' ? 'UP' : status === 'regressed' ? 'DOWN' : '='; + console.log(`${marker.padEnd(9)} ${beforeTest.status.padEnd(11)} -> ${afterTest.status.padEnd(11)} ${afterTest.id}`); +} + +for (const [key, beforeTest] of beforeById.entries()) { + if (seenBeforeKeys.has(key)) continue; + disappeared += 1; + regressed += 1; + console.log(`MISSING ${beforeTest.status.padEnd(11)} -> disappeared ${beforeTest.id}`); +} + +console.log('-'.repeat(80)); +console.log(`Improved: ${improved} | Regressed: ${regressed} | Unchanged: ${unchanged} | Disappeared: ${disappeared}`); + +function loadRuns() { + if (!existsSync(RUNS_DIR)) return []; + return readdirSync(RUNS_DIR) + .map((dir) => path.join(RUNS_DIR, dir, 'result.json')) + .filter((file) => existsSync(file)) + .map((file) => JSON.parse(readFileSync(file, 'utf8'))) + .sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp))); +} + +function compareStatus(beforeStatus, afterStatus) { + const rank = { failed: 0, skipped: 1, 'needs-human': 2, passed: 3 }; + const beforeRank = rank[beforeStatus] ?? 0; + const afterRank = rank[afterStatus] ?? 
0; + if (afterRank > beforeRank) return 'improved'; + if (afterRank < beforeRank) return 'regressed'; + return 'unchanged'; +} diff --git a/scripts/evals/compile-ricky-evals.mjs b/scripts/evals/compile-ricky-evals.mjs new file mode 100644 index 00000000..bd851223 --- /dev/null +++ b/scripts/evals/compile-ricky-evals.mjs @@ -0,0 +1,55 @@ +#!/usr/bin/env node + +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +import { compileHumanEvalSuitesFromMarkdown } from '@agent-assistant/telemetry/evals'; + +const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../..'); +const SUITES_DIR = path.join(ROOT, 'evals', 'suites'); +const GENERATED_HEADER = [ + '# Generated by scripts/evals/compile-ricky-evals.mjs from cases.md.', + '# Do not edit this file directly; edit cases.md in this suite instead.', +].join('\n'); + +const args = parseArgs(process.argv.slice(2)); +const result = compileHumanEvalSuitesFromMarkdown({ + suitesDir: SUITES_DIR, + suite: args.suite, + generatedHeader: GENERATED_HEADER, +}); + +for (const suite of result.suites) { + console.log(`${path.relative(ROOT, suite.outputPath)}: wrote ${suite.cases.length} cases`); +} +console.log(`Compiled ${result.total} Ricky eval cases.`); + +function parseArgs(argv) { + const parsed = {}; + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (arg === '--suite') parsed.suite = readOptionValue(argv, ++index, '--suite'); + else if (arg === '--help' || arg === '-h') { + printHelp(); + process.exit(0); + } else { + throw new Error(`Unknown argument: ${arg}`); + } + } + return parsed; +} + +function readOptionValue(argv, index, option) { + const value = argv[index]; + if (value === undefined || value.startsWith('--')) { + throw new Error(`${option} requires a value`); + } + return value; +} + +function printHelp() { + console.log(`Usage: node scripts/evals/compile-ricky-evals.mjs [--suite NAME] + +Reads evals/suites/*/cases.md and writes generated 
cases.jsonl files. +`); +} diff --git a/scripts/evals/run-ricky-evals.mjs b/scripts/evals/run-ricky-evals.mjs new file mode 100644 index 00000000..de3cdd15 --- /dev/null +++ b/scripts/evals/run-ricky-evals.mjs @@ -0,0 +1,206 @@ +#!/usr/bin/env node + +import { spawnSync } from 'node:child_process'; +import { existsSync } from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +import { + createDefaultHumanEvalExecutors, + createSkippedEvalError, + defaultRedactActual, + runHumanEvalCli, +} from '@agent-assistant/telemetry/evals'; + +const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../..'); +const DEFAULT_OPENCODE_MODEL = 'opencode/minimax-m2.5-free'; +const { argv: evalArgv, executorOverride } = parseRickyEvalArgs(process.argv.slice(2)); +const defaultExecutors = createDefaultHumanEvalExecutors(ROOT); + +const exitCode = await runHumanEvalCli({ + argv: evalArgv, + rootDir: ROOT, + productName: 'Ricky Evals', + runsDir: path.join(ROOT, '.ricky', 'evals', 'runs'), + executors: { + manual: executeManual, + opencode: executeOpenCode, + 'ricky-cli': executeRickyCli, + }, + defaultExecutor: 'manual', + redactActual(actual) { + const redacted = defaultRedactActual(actual); + if (typeof redacted.content === 'string' && redacted.content.length > 4000) { + redacted.content = `${redacted.content.slice(0, 4000)}\n...[truncated]`; + } + return redacted; + }, +}); + +process.exitCode = exitCode; + +function executeManual(testCase, context) { + if (context.providerMode && executorOverride === 'opencode') { + return executeOpenCode(testCase, context); + } + return defaultExecutors.manual(testCase, context); +} + +function executeOpenCode(testCase, context) { + if (!context.providerMode) { + throw createSkippedEvalError('opencode executor skipped; rerun with --provider or HUMAN_EVAL_PROVIDER=1'); + } + + const command = process.env.RICKY_EVAL_OPENCODE_BIN ?? 
'opencode'; + const model = process.env.RICKY_EVAL_OPENCODE_MODEL ?? DEFAULT_OPENCODE_MODEL; + const timeoutMs = readPositiveInt(process.env.RICKY_EVAL_OPENCODE_TIMEOUT_MS, 120_000); + const prompt = buildOpenCodePrompt(testCase); + const args = ['run']; + if (model) args.push('-m', model); + args.push(prompt); + + const startedAt = Date.now(); + const result = spawnSync(command, args, { + cwd: context.rootDir, + encoding: 'utf8', + timeout: timeoutMs, + env: { + ...process.env, + CI: '1', + FORCE_COLOR: '0', + NO_COLOR: '1', + }, + }); + const durationMs = Date.now() - startedAt; + + if (result.error) { + if (result.error.code === 'ENOENT') { + throw createSkippedEvalError(`opencode executor skipped; '${command}' was not found in PATH`); + } + throw result.error; + } + + const stdout = result.stdout?.trimEnd() ?? ''; + const stderr = result.stderr?.trimEnd() ?? ''; + const content = stdout || stderr || ''; + + return { + ok: result.status === 0, + status: result.status === 0 ? 'completed' : `exit_${result.status ?? 'signal'}`, + stopReason: result.signal ?? undefined, + content, + model, + toolCalls: [], + notes: `Ran local opencode one-shot with model ${model}; exit=${result.status ?? result.signal ?? 'unknown'}; durationMs=${durationMs}.`, + }; +} + +function executeRickyCli(testCase, context) { + const argvText = stringValue(testCase.mock?.argv) ?? stringValue(testCase.input.argv) ?? stringValue(testCase.input.message); + if (!argvText) { + throw new Error('ricky-cli executor requires Mock argv or Message'); + } + + const tsxBin = path.join(context.rootDir, 'node_modules', '.bin', process.platform === 'win32' ? 
'tsx.cmd' : 'tsx'); + if (!existsSync(tsxBin)) { + throw createSkippedEvalError('ricky-cli executor requires local node_modules/.bin/tsx; run npm install first'); + } + + const startedAt = Date.now(); + const argv = splitArgv(argvText); + const result = spawnSync(tsxBin, ['src/surfaces/cli/commands/cli-main.ts', ...argv], { + cwd: context.rootDir, + encoding: 'utf8', + timeout: 20_000, + env: { + ...process.env, + CI: '1', + FORCE_COLOR: '0', + NO_COLOR: '1', + }, + }); + const durationMs = Date.now() - startedAt; + + if (result.error) { + throw result.error; + } + + const stdout = result.stdout ?? ''; + const stderr = result.stderr ?? ''; + const content = [stdout.trimEnd(), stderr.trimEnd()].filter(Boolean).join('\n'); + + return { + ok: result.status === 0, + status: `exit_${result.status ?? 'signal'}`, + stopReason: result.signal ?? undefined, + content, + toolCalls: [ + { + name: 'ricky-cli', + argv, + exitCode: result.status, + signal: result.signal, + durationMs, + }, + ], + notes: `Ran: tsx src/surfaces/cli/commands/cli-main.ts ${argv.join(' ')}`, + }; +} + +function splitArgv(value) { + const tokens = []; + const pattern = /"([^"]*)"|'([^']*)'|[^\s]+/g; + let match; + while ((match = pattern.exec(value)) !== null) { + tokens.push(match[1] ?? match[2] ?? match[0]); + } + return tokens; +} + +function stringValue(value) { + return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined; +} + +function buildOpenCodePrompt(testCase) { + const systemPrompt = stringValue(testCase.input.systemPrompt); + const threadHistory = Array.isArray(testCase.input.threadHistory) + ? 
testCase.input.threadHistory + : []; + const sections = [ + 'You are Ricky, the AgentWorkforce workflow reliability, coordination, and authoring assistant.', + [ + 'Follow Ricky repository conventions from AGENTS.md, workflow standards, shared authoring rules, and product specs.', + 'Prefer concrete workflow contracts, deterministic verification gates, review artifacts, 80-to-100 validation loops, honest blocker reporting, and scoped branch/PR boundaries when the request involves workflow authoring or repair.', + 'Answer the user request directly. Do not mention this eval harness or hidden rubric.', + ].join(' '), + ]; + + if (systemPrompt) { + sections.push(`Additional system context:\n${systemPrompt}`); + } + if (threadHistory.length > 0) { + sections.push(`Thread history:\n${JSON.stringify(threadHistory, null, 2)}`); + } + sections.push(`User request:\n${String(testCase.input.message ?? '').trim()}`); + return sections.join('\n\n'); +} + +function readPositiveInt(value, fallback) { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed > 0 ? 
parsed : fallback; +} + +function parseRickyEvalArgs(argv) { + const passthrough = []; + let executorOverride; + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (arg === '--executor') { + executorOverride = argv[index + 1]; + index += 1; + continue; + } + passthrough.push(arg); + } + return { argv: passthrough, executorOverride }; +} diff --git a/scripts/evals/summarize-ricky-evals.mjs b/scripts/evals/summarize-ricky-evals.mjs new file mode 100644 index 00000000..ddcf6818 --- /dev/null +++ b/scripts/evals/summarize-ricky-evals.mjs @@ -0,0 +1,40 @@ +#!/usr/bin/env node + +import { existsSync, readdirSync, readFileSync } from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../..'); +const RUNS_DIR = path.join(ROOT, '.ricky', 'evals', 'runs'); + +const runs = loadRuns(); +if (runs.length === 0) { + console.log('No Ricky eval runs yet. Run: npm run evals'); + process.exit(0); +} + +console.log(''); +console.log(`Ricky Eval History (${runs.length} runs)`); +console.log('='.repeat(96)); +console.log(`${'Date'.padEnd(20)}${'Branch'.padEnd(24)}${'Mode'.padEnd(10)}${'Pass'.padEnd(10)}${'Human'.padEnd(10)}${'Fail'.padEnd(8)}Duration`); +console.log('-'.repeat(96)); + +for (const run of runs.slice(0, 20)) { + const date = String(run.timestamp ?? '').replace('T', ' ').slice(0, 19).padEnd(20); + const branch = String(run.branch ?? 'unknown').slice(0, 22).padEnd(24); + const mode = String(run.mode ?? 'unknown').padEnd(10); + const pass = `${run.passed ?? 0}/${run.total_trials ?? 0}`.padEnd(10); + const human = String(run.needs_human ?? 0).padEnd(10); + const fail = String(run.failed ?? 0).padEnd(8); + const duration = `${Math.round((run.total_duration_ms ?? 
0) / 1000)}s`; + console.log(`${date}${branch}${mode}${pass}${human}${fail}${duration}`); +} + +function loadRuns() { + if (!existsSync(RUNS_DIR)) return []; + return readdirSync(RUNS_DIR) + .map((dir) => path.join(RUNS_DIR, dir, 'result.json')) + .filter((file) => existsSync(file)) + .map((file) => JSON.parse(readFileSync(file, 'utf8'))) + .sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp))); +} diff --git a/src/local/auto-fix-loop.test.ts b/src/local/auto-fix-loop.test.ts index 595931ba..aa52dd78 100644 --- a/src/local/auto-fix-loop.test.ts +++ b/src/local/auto-fix-loop.test.ts @@ -404,6 +404,40 @@ describe('runWithAutoFix', () => { ); }); + it('deterministically repairs generated master child runs that disabled nested auto-fix', () => { + const repair = repairWorkflowDeterministically({ + artifactPath: 'workflows/generated/master.ts', + artifactContent: legacyMasterWorkflowContent(), + evidence: sdkRuntimeBlockerEvidence('run-update-config-2'), + }); + + expect(repair).toMatchObject({ + applied: true, + mode: 'deterministic', + summary: expect.stringContaining('allowed nested child workflows to use Ricky auto-fix'), + }); + expect(repair?.summary).toContain('replaced fail-fast error handling with repair-aware retry'); + expect(repair?.content).toContain("ricky run 'workflows/generated/child.ts' --foreground"); + expect(repair?.content).not.toContain('--no-auto-fix'); + expect(repair?.content).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"master-lead\", repairRetries: 2 })"); + }); + + it('deterministically makes generated child final validation non-terminal', () => { + const repair = repairWorkflowDeterministically({ + artifactPath: 'workflows/generated/child.ts', + artifactContent: legacyChildWorkflowContent(), + evidence: sdkRuntimeBlockerEvidence('final-hard-validation'), + }); + + expect(repair).toMatchObject({ + applied: true, + mode: 'deterministic', + summary: expect.stringContaining('generated child 
final validation non-terminal'), + }); + expect(repair?.content).toContain('.step("final-hard-validation"'); + expect(repair?.content).toContain('failOnError: false'); + }); + it('skips sentinel-guard hardening when no later tail-grep check references the same path and marker', () => { const repair = repairWorkflowDeterministically({ artifactPath: 'workflows/generated/sentinel-no-check.ts', @@ -449,6 +483,41 @@ describe('runWithAutoFix', () => { expect(result.nextActions.join('\n')).toContain('Direct repair is available.'); }); + it('retries instead of stopping when the workflow repair provider throws', async () => { + const runSingleAttempt = vi + .fn() + .mockResolvedValueOnce(blockerResponse('INVALID_ARTIFACT', 'run-1', 'final-hard-validation')) + .mockResolvedValueOnce(successResponse('run-2')); + const workflowRepairer = vi.fn().mockRejectedValue(new Error('structured artifact missing')); + + const result = await runWithAutoFix(baseRequest, { + maxAttempts: 3, + runSingleAttempt, + classifyFailure: fakeClassification, + debugWorkflowRun: guidedDebugger, + workflowRepairer, + artifactWriter: vi.fn().mockResolvedValue(undefined), + }); + + expect(result.ok).toBe(true); + expect(workflowRepairer).toHaveBeenCalledTimes(1); + expect(runSingleAttempt).toHaveBeenCalledTimes(2); + expect(runSingleAttempt.mock.calls[1][0].retry).toMatchObject({ + attempt: 2, + maxAttempts: 3, + previousRunId: 'run-1', + retryOfRunId: 'run-1', + startFromStep: 'final-hard-validation', + }); + expect(result.auto_fix?.attempts[0]).toMatchObject({ + fix_error: 'structured artifact missing', + warning: expect.stringContaining('Workflow repair provider failed; retrying without an artifact rewrite'), + }); + expect(result.warnings).toEqual(expect.arrayContaining([ + expect.stringContaining('Workflow repair provider failed; retrying without an artifact rewrite'), + ])); + }); + it('uses the persona repair path even when the debugger recommends guided repair', async () => { const 
runSingleAttempt = vi .fn() @@ -1081,6 +1150,83 @@ function workflowContent(): string { return 'import { workflow } from "@agent-relay/sdk/workflows";\nworkflow("foo").run({ cwd: process.cwd() });\n'; } +function legacyMasterWorkflowContent(): string { + return [ + "import { workflow } from '@agent-relay/sdk/workflows';", + '', + '// RICKY_MASTER_EXECUTOR_WORKFLOW', + 'async function main() {', + ' await workflow("ricky-master")', + ' .onError(\'fail-fast\')', + ' .step("run-child", {', + ' type: "deterministic",', + ' command: "set -e\\nricky run \'workflows/generated/child.ts\' --foreground --no-auto-fix\\ntest -f \'.workflow-artifacts/child/signoff.md\'",', + ' captureOutput: true,', + ' failOnError: true,', + ' })', + ' .run({ cwd: process.cwd() });', + '}', + ].join('\n'); +} + +function legacyChildWorkflowContent(): string { + return [ + "import { workflow } from '@agent-relay/sdk/workflows';", + '', + 'async function main() {', + ' await workflow("ricky-child-update-config-2")', + ' .step("final-hard-validation", {', + ' type: "deterministic",', + ' dependsOn: ["fix-loop"],', + ' command: "set -e\\nnpm run typecheck\\ngit diff --name-only",', + ' captureOutput: true,', + ' failOnError: true,', + ' })', + ' .step("final-signoff", {', + ' type: "deterministic",', + ' dependsOn: ["final-hard-validation"],', + ' command: "echo RICKY_CHILD_WORKFLOW_COMPLETE",', + ' captureOutput: true,', + ' failOnError: true,', + ' })', + ' .run({ cwd: process.cwd() });', + '}', + ].join('\n'); +} + +function sdkRuntimeBlockerEvidence(failedStep: string): WorkflowRunEvidence { + return { + runId: 'run-1', + workflowId: 'wf-1', + workflowName: 'ricky-master', + status: 'failed', + startedAt: '2026-04-28T00:00:00.000Z', + completedAt: '2026-04-28T00:00:01.000Z', + steps: [ + { + stepId: failedStep, + stepName: failedStep, + status: 'failed', + startedAt: '2026-04-28T00:00:00.000Z', + completedAt: '2026-04-28T00:00:01.000Z', + error: `Step "${failedStep}" failed: Command failed 
with exit code 2`, + verifications: [], + deterministicGates: [], + logs: [{ stream: 'stdout', excerpt: `✗ ${failedStep} — FAILED: Command failed with exit code 2` }], + artifacts: [], + retries: [], + narrative: [], + history: [], + }, + ], + deterministicGates: [], + logs: [{ stream: 'stdout', excerpt: `INVALID_ARTIFACT at ${failedStep}` }], + artifacts: [], + narrative: [], + routing: [], + }; +} + function leadPlanMarkerWorkflowContent(): string { return [ "import { workflow } from '@agent-relay/sdk/workflows';", diff --git a/src/local/auto-fix-loop.ts b/src/local/auto-fix-loop.ts index 4b65d80f..f805867e 100644 --- a/src/local/auto-fix-loop.ts +++ b/src/local/auto-fix-loop.ts @@ -368,6 +368,29 @@ export async function runWithAutoFix( } catch (error) { attemptSummary.fix_error = error instanceof Error ? error.message : String(error); warnings.push(...warningsFromError(error)); + if (attempt < maxAttempts) { + const warning = `Workflow repair provider failed; retrying without an artifact rewrite: ${attemptSummary.fix_error}`; + attemptSummary.warning = warning; + warnings.push(warning); + if (!runId) { + warnings.push('Auto-fix retry could not resolve a previous run id; retrying without step-level resume.'); + } else if (!retryOfRunId) { + retryOfRunId = runId; + } + currentRequest = { + ...retryBaseRequest(currentRequest, response), + autoFix: undefined, + retry: { + attempt: attempt + 1, + maxAttempts, + ...(runId ? { previousRunId: runId, retryOfRunId: retryOfRunId ?? runId } : {}), + ...(failedStep ? { startFromStep: failedStep } : {}), + reason: `auto-fix retry after workflow repair provider failure for ${blockerCode ?? 'local failure'}`, + }, + }; + onProgress?.(`Workflow repair provider failed; retrying workflow${failedStep ? 
` from ${failedStep}` : ''}...`); + continue; + } const escalated = withAutoFix(response, maxAttempts, attempts, attemptSummary.status, warnings, trackingRunId); escalated.nextActions = [ ...escalated.nextActions, @@ -563,6 +586,18 @@ export function repairWorkflowDeterministically( changes.push(...sentinelGuardRepair.changes); } + const masterChildRepair = repairMasterChildRunRepairLoop(content); + if (masterChildRepair.content !== content) { + content = masterChildRepair.content; + changes.push(...masterChildRepair.changes); + } + + const childValidationRepair = repairGeneratedChildFinalValidation(content); + if (childValidationRepair.content !== content) { + content = childValidationRepair.content; + changes.push(...childValidationRepair.changes); + } + if (content === input.artifactContent || changes.length === 0) return null; return { @@ -968,6 +1003,41 @@ function repairSentinelGuardedRehydration(content: string): { content: string; c return { content: next, changes: [...new Set(changes)] }; } +function repairMasterChildRunRepairLoop(content: string): { content: string; changes: string[] } { + const isMasterArtifact = content.includes('RICKY_MASTER_EXECUTOR_WORKFLOW') || content.includes('--foreground --no-auto-fix'); + if (!isMasterArtifact) return { content, changes: [] }; + + const changes: string[] = []; + let next = content.replace(/--foreground\s+--no-auto-fix/g, () => { + changes.push('allowed nested child workflows to use Ricky auto-fix instead of --no-auto-fix'); + return '--foreground'; + }); + + next = next.replace( + /^\s*\.onError\(\s*['"]fail-fast['"]\s*\)/m, + (match) => { + changes.push('replaced fail-fast error handling with repair-aware retry'); + const indent = match.match(/^\s*/)?.[0] ?? 
''; + return `${indent}.onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: "master-lead", repairRetries: 2 })`; + }, + ); + + return { content: next, changes: [...new Set(changes)] }; +} + +function repairGeneratedChildFinalValidation(content: string): { content: string; changes: string[] } { + if (!/workflow\(["']ricky-child-/.test(content)) return { content, changes: [] }; + const changes: string[] = []; + const next = content.replace( + /(\.step\(["']final-hard-validation["'][\s\S]*?captureOutput:\s*true,\n\s*)failOnError:\s*true,/, + (match, prefix: string) => { + changes.push('made generated child final validation non-terminal so master final validation owns integrated repo checks'); + return `${prefix}failOnError: false,`; + }, + ); + return { content: next, changes }; +} + function repairAgentStepTimeouts(content: string, evidence: WorkflowRunEvidence): { content: string; changes: string[] } { const timedOutStep = timedOutAgentStepFromEvidence(evidence); if (!timedOutStep) return { content, changes: [] }; diff --git a/src/product/generation/master-workflow-renderer.ts b/src/product/generation/master-workflow-renderer.ts index 7cfe585c..a8890485 100644 --- a/src/product/generation/master-workflow-renderer.ts +++ b/src/product/generation/master-workflow-renderer.ts @@ -1,4 +1,9 @@ import type { NormalizedWorkflowSpec } from '../spec-intake/types.js'; +import { + DEFAULT_REPAIR_RETRY_ATTEMPTS, + DEFAULT_RETRY_BACKOFF_MS, + DEFAULT_RETRY_MAX_ATTEMPTS, +} from '../../shared/constants.js'; import { planMasterExecution, type ChildWorkflowPlan, type MasterExecutionPlan } from '../orchestration/index.js'; import type { DeterministicGate, @@ -177,7 +182,7 @@ function renderMasterSource(input: { ` .channel(${literal(input.channel)})`, ' .maxConcurrency(4)', ' .timeout(7200000)', - ' .onError(\'fail-fast\')', + ` .onError(${repairAwareOnError('master-lead')})`, '', ' .agent("master-lead", { cli: "claude", interactive: false, role: "Plans child workflow 
boundaries, dependency waves, and final integration evidence.", retries: 1 })', ' .agent("master-reviewer", { cli: "codex", preset: "reviewer", role: "Reviews child signoff evidence and master executor readiness.", retries: 1 })', @@ -292,7 +297,7 @@ function renderChildRunStep(child: ChildWorkflowPlan): string[] { : ['verify-child-workflows']; const command = [ 'set -e', - `ricky run ${shellQuote(child.workflowFilePath)} --foreground --no-auto-fix`, + `ricky run ${shellQuote(child.workflowFilePath)} --foreground`, `test -f ${shellQuote(child.signoffArtifactPath)}`, `grep -F ${shellQuote(child.signoffMarker)} ${shellQuote(child.signoffArtifactPath)}`, 'echo RICKY_MASTER_CHILD_RUN_VERIFIED', @@ -328,7 +333,7 @@ function childWorkflowSource(child: ChildWorkflowPlan, spec: NormalizedWorkflowS ` .channel(${literal(`wf-ricky-child-${child.id}`)})`, ' .maxConcurrency(2)', ' .timeout(3600000)', - ' .onError(\'retry\', { maxRetries: 1, retryDelayMs: 1000 })', + ` .onError(${repairAwareOnError('validator-claude')})`, ' .agent("lead-claude", { cli: "claude", interactive: false, role: "Plans this bounded child workflow slice.", retries: 1 })', ' .agent("impl-codex", { cli: "codex", role: "Implements only this child workflow slice and its declared file scope.", retries: 2 })', ' .agent("reviewer-codex", { cli: "codex", preset: "reviewer", role: "Reviews code, tests, deterministic gates, and PR/result evidence.", retries: 1 })', @@ -403,7 +408,7 @@ function childWorkflowSource(child: ChildWorkflowPlan, spec: NormalizedWorkflowS 'echo RICKY_CHILD_FINAL_VALIDATION_READY', ].join('\n'))},`, ' captureOutput: true,', - ' failOnError: true,', + ' failOnError: false,', ' })', ' .step("final-signoff", {', ' type: "deterministic",', @@ -435,6 +440,10 @@ function childWorkflowSource(child: ChildWorkflowPlan, spec: NormalizedWorkflowS ].join('\n')}`; } +function repairAwareOnError(repairAgent: string): string { + return `'retry', { maxRetries: ${DEFAULT_RETRY_MAX_ATTEMPTS}, retryDelayMs: 
${DEFAULT_RETRY_BACKOFF_MS}, repairAgent: ${literal(repairAgent)}, repairRetries: ${DEFAULT_REPAIR_RETRY_ATTEMPTS} }`; +} + function buildMasterTasks(plan: MasterExecutionPlan): WorkflowTask[] { return [ task('prepare-context', 'Prepare master context', 'deterministic', 'Write master plan and generation-time skill evidence.', []), diff --git a/src/product/generation/pipeline.test.ts b/src/product/generation/pipeline.test.ts index ddbb71b8..21742486 100644 --- a/src/product/generation/pipeline.test.ts +++ b/src/product/generation/pipeline.test.ts @@ -46,9 +46,15 @@ describe('workflow generation pipeline', () => { expect(rendered.artifactPath).toBe('workflows/generated/runtime-master.ts'); expect(rendered.content).toContain('RICKY_MASTER_EXECUTOR_WORKFLOW'); expect(rendered.content).toContain('Master plan: 5 child workflows'); - expect(rendered.content).toContain('ricky run \'workflows/generated/runtime-master-children/01-nested-runner.ts\' --foreground --no-auto-fix'); + expect(rendered.content).toContain('ricky run \'workflows/generated/runtime-master-children/01-nested-runner.ts\' --foreground'); + expect(rendered.content).not.toMatch(/^\s*command: "set -e\\nricky run .*--no-auto-fix/m); expect(rendered.content).toContain('MASTER_EXECUTOR_RESULT_READY'); expect(rendered.content).toContain('RICKY_CHILD_WORKFLOW_COMPLETE'); + expect(rendered.content).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"master-lead\", repairRetries: 2 })"); + expect(rendered.content.replace(/\\+"/g, '"')).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"validator-claude\", repairRetries: 2 })"); + expect(rendered.content.replace(/\\+"/g, '"')).toMatch( + /\.step\("final-hard-validation"[\s\S]*?failOnError: false,[\s\S]*?\.step\("final-signoff"/, + ); expect(rendered.content).toContain('.run({ cwd: process.cwd() })'); }); @@ -135,6 +141,8 @@ describe('workflow generation pipeline', () => { 
expect(artifact.content).toContain('.agent("impl-primary-codex"'); expect(artifact.content).toContain('.agent("impl-tests-codex"'); expect(artifact.content).toContain('.agent("validator-claude"'); + expect(artifact.content).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"validator-claude\", repairRetries: 2 })"); + expect(artifact.content).not.toMatch(/^\s*\.onError\('fail-fast'\)/m); expect(artifact.content).toContain('80-to-100 fix loop'); expect(artifact.content).toContain('deterministic sanity gate using POSIX grep, git grep, or an equivalent assertion'); expect(artifact.content).toContain('If using rg, guard it with command -v rg'); @@ -378,6 +386,8 @@ describe('workflow generation pipeline', () => { expect(artifact.content).toContain('.agent("validator-codex", { cli: "codex", preset: "worker"'); expect(artifact.content).toContain('.agent("author-codex"'); expect(artifact.content).not.toContain('.agent("impl-primary-codex"'); + expect(artifact.content).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"validator-codex\", repairRetries: 2 })"); + expect(artifact.content).not.toMatch(/^\s*\.onError\('fail-fast'\)/m); expect(artifact.content).toContain('Codex structural marker gate'); expect(artifact.content).toContain('must not be presented as independent review evidence'); expect(artifact.content).toContain('Substantive review evidence comes from the Claude review steps plus deterministic validation gates'); @@ -1294,6 +1304,36 @@ describe('workflow generation pipeline', () => { }); expect(result.patternDecision.specSignals).toContain('choosing-swarm-patterns skill loaded'); expect(result.patternDecision.reason).toMatch(/choosing-swarm-patterns/i); + expect(artifact(result).content).toContain(".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"validator-codex\", repairRetries: 2 })"); + expect(artifact(result).content).not.toMatch(/^\s*\.onError\('fail-fast'\)/m); + }); + + 
it('rejects generated workflows that can still fail fast without a repair agent', () => { + const implementationSpec = spec({ + description: 'Implement workflow retry policy validation.', + targetFiles: ['src/product/generation/pipeline.ts'], + }); + const result = generate({ + spec: implementationSpec, + artifactPath: 'workflows/generated/repair-policy.ts', + }); + const baseArtifact = artifact(result); + const weakArtifact = { + ...baseArtifact, + content: baseArtifact.content.replace( + ".onError('retry', { maxRetries: 2, retryDelayMs: 1000, repairAgent: \"validator-claude\", repairRetries: 2 })", + ".onError('retry', { maxRetries: 2, retryDelayMs: 1000 })", + ), + }; + + const validation = validateGeneratedArtifact(weakArtifact, result.patternDecision, result.skillContext, implementationSpec); + + expect(validation.valid).toBe(false); + expect(validation.issues).toEqual( + expect.arrayContaining([ + expect.objectContaining({ code: 'REPAIR_AWARE_RETRY_MISSING' }), + ]), + ); }); it('respects pattern override', () => { diff --git a/src/product/generation/pipeline.ts b/src/product/generation/pipeline.ts index ba671eac..4b8930c7 100644 --- a/src/product/generation/pipeline.ts +++ b/src/product/generation/pipeline.ts @@ -318,6 +318,13 @@ export function validateGeneratedArtifact( if (!hasExplicitWorkflowRunCwd(content)) { issues.push(blockingIssue('validation', 'RUN_CWD_MISSING', 'Rendered workflow does not run with explicit cwd.')); } + if (requiresRepairAwareRetry(content)) { + issues.push(blockingIssue( + 'validation', + 'REPAIR_AWARE_RETRY_MISSING', + 'Rendered workflow must use retry error handling with repairAgent and repairRetries so repairable deterministic gates do not fail the workflow outright.', + )); + } if (spec && requiresImplementationWorkflow(spec)) { if (!/IMPLEMENTATION_WORKFLOW_CONTRACT/.test(content)) { @@ -424,6 +431,18 @@ export function validateGeneratedArtifact( }; } +function requiresRepairAwareRetry(content: string): boolean { + if 
(/^\s*\.onError\(\s*['"]fail-fast['"]/m.test(content)) return true; + const workflowErrorHandling = content + .split('\n') + .filter((line) => /^\s*\.onError\(/.test(line)); + if (workflowErrorHandling.length === 0) return true; + + return workflowErrorHandling.some((line) => + !/\.onError\(\s*['"]retry['"]\s*,\s*\{.*\brepairAgent\s*:.*\brepairRetries\s*:/.test(line), + ); +} + function hasDeterministicSanityGate(artifact: RenderedArtifact): boolean { return artifact.gates.some((gate) => gate.failOnError && isSanityGateCommand(gate.command)); } diff --git a/src/product/generation/template-renderer.ts b/src/product/generation/template-renderer.ts index 9cde38e9..9413062d 100644 --- a/src/product/generation/template-renderer.ts +++ b/src/product/generation/template-renderer.ts @@ -6,6 +6,7 @@ import { DEFAULT_IMPLEMENT_TIMEOUT_MS, DEFAULT_LEAD_PLAN_TIMEOUT_MS, DEFAULT_MAX_CONCURRENCY, + DEFAULT_REPAIR_RETRY_ATTEMPTS, DEFAULT_RETRY_BACKOFF_MS, DEFAULT_RETRY_MAX_ATTEMPTS, DEFAULT_REVIEW_TIMEOUT_MS, @@ -117,7 +118,7 @@ function renderSource(input: { isCodeWorkflow: boolean; toolSelection: ToolSelectionContext; }): string { - const onError = input.pattern.riskLevel === 'low' ? 
"'fail-fast'" : `'retry', { maxRetries: ${DEFAULT_RETRY_MAX_ATTEMPTS}, retryDelayMs: ${DEFAULT_RETRY_BACKOFF_MS} }`; + const onError = renderRepairAwareOnError(repairAgentFor(input.isCodeWorkflow)); const contextSetup = buildGeneratedContextSetup(input.spec, input.artifactsDir, input.pattern, input.skills, input.skillApplicationEvidence, input.toolSelection); const lines: string[] = [ "import { workflow } from '@agent-relay/sdk/workflows';", @@ -203,6 +204,14 @@ function renderSource(input: { return `${lines.join('\n')}\n`; } +function renderRepairAwareOnError(repairAgent: string): string { + return `'retry', { maxRetries: ${DEFAULT_RETRY_MAX_ATTEMPTS}, retryDelayMs: ${DEFAULT_RETRY_BACKOFF_MS}, repairAgent: ${literal(repairAgent)}, repairRetries: ${DEFAULT_REPAIR_RETRY_ATTEMPTS} }`; +} + +function repairAgentFor(isCodeWorkflow: boolean): string { + return isCodeWorkflow ? 'validator-claude' : 'validator-codex'; +} + function buildTeam(pattern: SwarmPattern, isCodeWorkflow: boolean): TeamMemberSpec[] { if (!isCodeWorkflow) { return [ diff --git a/src/shared/constants.ts b/src/shared/constants.ts index 67622f07..df8a3d45 100644 --- a/src/shared/constants.ts +++ b/src/shared/constants.ts @@ -34,6 +34,8 @@ export const DEFAULT_FIX_LOOP_TIMEOUT_MS = 1_200_000; // 20 min — bounded fix export const DEFAULT_RETRY_MAX_ATTEMPTS = 2; +export const DEFAULT_REPAIR_RETRY_ATTEMPTS = 2; + export const DEFAULT_AUTO_FIX_ATTEMPTS = 7; export const DEFAULT_RETRY_BACKOFF_MS = 1_000; diff --git a/test/package-proof/package-layout-proof.test.ts b/test/package-proof/package-layout-proof.test.ts index 6c208ec3..70772e9f 100644 --- a/test/package-proof/package-layout-proof.test.ts +++ b/test/package-proof/package-layout-proof.test.ts @@ -72,7 +72,7 @@ describe('Ricky package layout and npm-default proof', () => { [ 'package-script-allowlist', [ - 'required scripts: batch, build, bundle, clean, dev, overnight, prepack, start, test, typecheck', + 'required scripts: batch, build, bundle, 
clean, dev, evals, evals:compare, evals:compile, evals:list, evals:opencode, evals:summary, overnight, prepack, start, test, typecheck', 'missing required scripts: (none)', 'unexpected extra scripts: (none)', ], diff --git a/test/package-proof/package-layout-proof.ts b/test/package-proof/package-layout-proof.ts index 36921ce3..f0be49d3 100644 --- a/test/package-proof/package-layout-proof.ts +++ b/test/package-proof/package-layout-proof.ts @@ -95,6 +95,12 @@ const REQUIRED_PACKAGE_SCRIPTS = [ 'test', 'start', 'dev', + 'evals', + 'evals:compile', + 'evals:opencode', + 'evals:list', + 'evals:summary', + 'evals:compare', 'batch', 'overnight', 'prepack',