From 5c01816709ce2020f45382a6278f3493bd61609e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:36:50 +0000 Subject: [PATCH 1/6] Initial plan From 3cccd00527f3d2dc5feb2a1c8d725c8d938428d0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:53:26 +0000 Subject: [PATCH 2/6] docs: document pre-step data-fetching pattern in create-agentic-workflow.md Agent-Logs-Url: https://github.com/github/gh-aw/sessions/5f9dc564-23a1-4366-b8fc-52c7e44673d1 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/aw/create-agentic-workflow.md | 77 +++++++++++++++++++++++ pkg/agentdrain/data/default_weights.json | 79 +++--------------------- 2 files changed, 87 insertions(+), 69 deletions(-) diff --git a/.github/aw/create-agentic-workflow.md b/.github/aw/create-agentic-workflow.md index cb6d2384bfb..5a773df8829 100644 --- a/.github/aw/create-agentic-workflow.md +++ b/.github/aw/create-agentic-workflow.md @@ -705,6 +705,83 @@ When creating workflows that involve coding agents operating in large repositori - Documentation updates for multiple services - Dependency updates across microservices +### Pre-step Data Fetching + +**Always fetch heavy data before the AI session begins.** Passing large blobs of raw data (logs, artifacts, build output) directly to the AI agent wastes tokens, increases inference latency, and reduces reliability. Instead, use a deterministic `steps:` block to download, filter, and store the data at a well-known path before the agent reads it. + +**Why this matters:** +- Token budgets are finite — raw CI logs or deployment output can be thousands of lines; pre-fetching lets you truncate or filter to only the relevant parts. +- Deterministic shell steps are faster and more reliable than asking the agent to call tools repeatedly to download the same data. 
+- Pre-fetched data is reusable: the agent reads a local file instead of making repeated API calls. + +**Reusable template:** + +```yaml +--- +description: +on: + workflow_run: + workflows: ["CI"] + types: [completed] +permissions: + contents: read + actions: read +tools: + github: + toolsets: [default] +steps: + - name: Fetch data + run: | + # Download heavy data before the AI session begins + gh run view ${{ github.event.workflow_run.id }} --log > /tmp/gh-aw/agent/ci-logs.txt 2>&1 || true + # Trim to last 500 lines to stay within token budget + tail -500 /tmp/gh-aw/agent/ci-logs.txt > /tmp/gh-aw/agent/ci-logs-trimmed.txt +safe-outputs: + add-comment: + max: 1 +--- + +Analyze the CI failure logs at `/tmp/gh-aw/agent/ci-logs-trimmed.txt`. + +Identify the root cause, suggest a fix, and add a comment to the triggering PR. +``` + +**The agent data directory** (`/tmp/gh-aw/agent/`) is the canonical location for files produced by pre-steps and consumed by the agent. Write pre-fetched data there so the agent can find it with a predictable path. + +**Example use cases:** + +1. **Deployment failure logs** — Download Heroku/Vercel/Railway deployment logs via their CLI before the AI analyses the failure: + ```yaml + steps: + - name: Fetch deployment logs + run: | + heroku logs --tail --num 200 --app ${{ vars.HEROKU_APP }} \ + > /tmp/gh-aw/agent/deploy-logs.txt + ``` + +2. **Build artifacts / test results** — Run `npm ci && npm run build` (or equivalent) and save the output so the agent can inspect compilation errors without re-running the build: + ```yaml + steps: + - name: Build and capture output + run: | + npm ci 2>&1 | tail -200 > /tmp/gh-aw/agent/build-output.txt + npm run test -- --reporter=json > /tmp/gh-aw/agent/test-results.json 2>&1 || true + ``` + +3. 
**GitHub Actions workflow run artifacts** — Download a specific artifact from a failed run before asking the agent to diagnose it: + ```yaml + steps: + - name: Download test artifact + run: | + gh run download ${{ github.event.workflow_run.id }} \ + --name test-results --dir /tmp/gh-aw/agent/artifacts/ || true + ``` + +**Related patterns:** + +- For loading a **baseline before AI analysis** (e.g., architecture notes, known-issues registry), use `repo-memory` — see `.github/aw/memory.md` for the full comparison of `cache-memory`, `repo-memory`, and `repo-memory` with wiki. +- For **multi-step data processing pipelines** that pre-compute aggregations before the AI runs, see the [DataOps pattern](https://github.github.com/gh-aw/patterns/data-ops/) and [Deterministic & Agentic Patterns guide](https://github.github.com/gh-aw/guides/deterministic-agentic-patterns/). + ## Issue Form Mode: Step-by-Step Workflow Creation When processing a GitHub issue created via the workflow creation form, follow these steps: diff --git a/pkg/agentdrain/data/default_weights.json b/pkg/agentdrain/data/default_weights.json index 26e14f1120d..5ed9cfb5f89 100644 --- a/pkg/agentdrain/data/default_weights.json +++ b/pkg/agentdrain/data/default_weights.json @@ -39,28 +39,7 @@ "id": 5, "size": 1, "stage": "error", - "template": [ - "stage=error", - "reason=The", - "Serena", - "MCP", - "server", - "is", - "not", - "available", - "in", - "this", - "environment.", - "No", - "serena-*", - "tools", - "are", - "registered.", - "tool=Serena", - "MCP", - "server", - "type=missing_tool" - ] + "template": ["stage=error", "reason=The", "Serena", "MCP", "server", "is", "not", "available", "in", "this", "environment.", "No", "serena-*", "tools", "are", "registered.", "tool=Serena", "MCP", "server", "type=missing_tool"] }, { "id": 6, @@ -234,12 +213,7 @@ ], "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", 
"span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -285,21 +259,12 @@ "id": 1, "size": 100, "stage": "finish", - "template": [ - "stage=finish", - "\u003c*\u003e", - "tokens=\u003cNUM\u003e" - ] + "template": ["stage=finish", "\u003c*\u003e", "tokens=\u003cNUM\u003e"] } ], "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", "span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -345,21 +310,12 @@ "id": 1, "size": 72, "stage": "plan", - "template": [ - "stage=plan", - "errors=\u003cNUM\u003e", - "turns=\u003cNUM\u003e" - ] + "template": ["stage=plan", "errors=\u003cNUM\u003e", "turns=\u003cNUM\u003e"] } ], "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", "span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -403,12 +359,7 @@ "clusters": null, "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", "span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -452,12 +403,7 @@ "clusters": null, "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", "span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -1607,12 +1553,7 @@ ], "config": { "Depth": 4, - "ExcludeFields": [ - "session_id", - "trace_id", - "span_id", - "timestamp" - ], + "ExcludeFields": ["session_id", "trace_id", "span_id", "timestamp"], "MaskRules": [ { "Name": "uuid", @@ -1652,4 +1593,4 @@ }, "next_id": 15 } -} \ No newline at end of file +} From 41f63b5f0184b7e06985e1a243f07ad61e7dc31f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:01:19 +0000 Subject: [PATCH 3/6] docs: add GH_TOKEN env and proper 
permissions to pre-step template snippets Agent-Logs-Url: https://github.com/github/gh-aw/sessions/e4abe7a1-8d34-48e8-b4ff-6bfe4ec72dac Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/aw/create-agentic-workflow.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/aw/create-agentic-workflow.md b/.github/aw/create-agentic-workflow.md index 5a773df8829..2e009532e4f 100644 --- a/.github/aw/create-agentic-workflow.md +++ b/.github/aw/create-agentic-workflow.md @@ -731,9 +731,12 @@ tools: toolsets: [default] steps: - name: Fetch data + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RUN_ID: ${{ github.event.workflow_run.id }} run: | # Download heavy data before the AI session begins - gh run view ${{ github.event.workflow_run.id }} --log > /tmp/gh-aw/agent/ci-logs.txt 2>&1 || true + gh run view "$RUN_ID" --log > /tmp/gh-aw/agent/ci-logs.txt 2>&1 || true # Trim to last 500 lines to stay within token budget tail -500 /tmp/gh-aw/agent/ci-logs.txt > /tmp/gh-aw/agent/ci-logs-trimmed.txt safe-outputs: @@ -754,6 +757,8 @@ Identify the root cause, suggest a fix, and add a comment to the triggering PR. ```yaml steps: - name: Fetch deployment logs + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | heroku logs --tail --num 200 --app ${{ vars.HEROKU_APP }} \ > /tmp/gh-aw/agent/deploy-logs.txt @@ -763,6 +768,8 @@ Identify the root cause, suggest a fix, and add a comment to the triggering PR. ```yaml steps: - name: Build and capture output + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | npm ci 2>&1 | tail -200 > /tmp/gh-aw/agent/build-output.txt npm run test -- --reporter=json > /tmp/gh-aw/agent/test-results.json 2>&1 || true @@ -772,8 +779,11 @@ Identify the root cause, suggest a fix, and add a comment to the triggering PR. 
```yaml steps: - name: Download test artifact + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RUN_ID: ${{ github.event.workflow_run.id }} run: | - gh run download ${{ github.event.workflow_run.id }} \ + gh run download "$RUN_ID" \ --name test-results --dir /tmp/gh-aw/agent/artifacts/ || true ``` From c1a30ba6e753a772a8687ca3e75aeecbf3e93104 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:08:48 +0000 Subject: [PATCH 4/6] docs: optimize pre-step data-fetching section for agent consumption, add cache-memory tip Agent-Logs-Url: https://github.com/github/gh-aw/sessions/b3d19cbb-dbdf-4c96-bce7-12da512c46d8 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/aw/create-agentic-workflow.md | 73 ++++++++------------------- 1 file changed, 20 insertions(+), 53 deletions(-) diff --git a/.github/aw/create-agentic-workflow.md b/.github/aw/create-agentic-workflow.md index 2e009532e4f..b9a14250103 100644 --- a/.github/aw/create-agentic-workflow.md +++ b/.github/aw/create-agentic-workflow.md @@ -707,90 +707,57 @@ When creating workflows that involve coding agents operating in large repositori ### Pre-step Data Fetching -**Always fetch heavy data before the AI session begins.** Passing large blobs of raw data (logs, artifacts, build output) directly to the AI agent wastes tokens, increases inference latency, and reduces reliability. Instead, use a deterministic `steps:` block to download, filter, and store the data at a well-known path before the agent reads it. +Use a deterministic `steps:` block to download, trim, and store heavy data before the agent runs. The agent reads local files instead of making repeated API calls, staying within its token budget. -**Why this matters:** -- Token budgets are finite — raw CI logs or deployment output can be thousands of lines; pre-fetching lets you truncate or filter to only the relevant parts. 
-- Deterministic shell steps are faster and more reliable than asking the agent to call tools repeatedly to download the same data. -- Pre-fetched data is reusable: the agent reads a local file instead of making repeated API calls. +**Rules:** +- Always set `env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}` on every step that calls `gh` — the token is not injected automatically. +- Write output to `/tmp/gh-aw/agent/` (canonical agent data directory). +- Trim large blobs before writing (`tail -N`). +- Add `permissions: actions: read` when reading workflow logs or artifacts. -**Reusable template:** +**Template (CI log analysis):** ```yaml --- -description: on: workflow_run: workflows: ["CI"] types: [completed] permissions: contents: read - actions: read + actions: read # required for gh run view / gh run download tools: github: toolsets: [default] + cache-memory: true # persist pre-fetched data across runs (dedup, trending) steps: - - name: Fetch data + - name: Fetch CI logs env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} RUN_ID: ${{ github.event.workflow_run.id }} run: | - # Download heavy data before the AI session begins + mkdir -p /tmp/gh-aw/agent gh run view "$RUN_ID" --log > /tmp/gh-aw/agent/ci-logs.txt 2>&1 || true - # Trim to last 500 lines to stay within token budget tail -500 /tmp/gh-aw/agent/ci-logs.txt > /tmp/gh-aw/agent/ci-logs-trimmed.txt safe-outputs: add-comment: max: 1 --- -Analyze the CI failure logs at `/tmp/gh-aw/agent/ci-logs-trimmed.txt`. +Analyze `/tmp/gh-aw/agent/ci-logs-trimmed.txt`. Identify the root cause and post a comment to the triggering PR. -Identify the root cause, suggest a fix, and add a comment to the triggering PR. +Check `/tmp/gh-aw/cache-memory/seen-runs.json` for previously seen run IDs; skip if already processed and append the current run ID when done. ``` -**The agent data directory** (`/tmp/gh-aw/agent/`) is the canonical location for files produced by pre-steps and consumed by the agent. 
Write pre-fetched data there so the agent can find it with a predictable path. +**Use cases:** -**Example use cases:** +| Scenario | Step snippet | +|---|---| +| Deployment logs (Heroku/Vercel/Railway) | `heroku logs --num 200 --app ${{ vars.HEROKU_APP }} > /tmp/gh-aw/agent/deploy-logs.txt` | +| Build / test output | `npm ci 2>&1 \| tail -200 > /tmp/gh-aw/agent/build.txt && npm run test -- --reporter=json > /tmp/gh-aw/agent/test.json 2>&1 \|\| true` | +| Workflow run artifact | `gh run download "$RUN_ID" --name test-results --dir /tmp/gh-aw/agent/artifacts/ \|\| true` | -1. **Deployment failure logs** — Download Heroku/Vercel/Railway deployment logs via their CLI before the AI analyses the failure: - ```yaml - steps: - - name: Fetch deployment logs - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - heroku logs --tail --num 200 --app ${{ vars.HEROKU_APP }} \ - > /tmp/gh-aw/agent/deploy-logs.txt - ``` - -2. **Build artifacts / test results** — Run `npm ci && npm run build` (or equivalent) and save the output so the agent can inspect compilation errors without re-running the build: - ```yaml - steps: - - name: Build and capture output - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - npm ci 2>&1 | tail -200 > /tmp/gh-aw/agent/build-output.txt - npm run test -- --reporter=json > /tmp/gh-aw/agent/test-results.json 2>&1 || true - ``` - -3. 
**GitHub Actions workflow run artifacts** — Download a specific artifact from a failed run before asking the agent to diagnose it: - ```yaml - steps: - - name: Download test artifact - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - RUN_ID: ${{ github.event.workflow_run.id }} - run: | - gh run download "$RUN_ID" \ - --name test-results --dir /tmp/gh-aw/agent/artifacts/ || true - ``` - -**Related patterns:** - -- For loading a **baseline before AI analysis** (e.g., architecture notes, known-issues registry), use `repo-memory` — see `.github/aw/memory.md` for the full comparison of `cache-memory`, `repo-memory`, and `repo-memory` with wiki. -- For **multi-step data processing pipelines** that pre-compute aggregations before the AI runs, see the [DataOps pattern](https://github.github.com/gh-aw/patterns/data-ops/) and [Deterministic & Agentic Patterns guide](https://github.github.com/gh-aw/guides/deterministic-agentic-patterns/). +**`cache-memory` tip:** Add `cache-memory: true` under `tools:` to persist pre-fetched data across runs. This enables deduplication (skip already-diagnosed run IDs), trending (compare metrics over time), and avoids redundant downloads on retries. The agent reads and writes `/tmp/gh-aw/cache-memory/`. See `.github/aw/memory.md` for full configuration options. 
## Issue Form Mode: Step-by-Step Workflow Creation From e01291447ebcd62f9b55a85b31baa6efbf32e329 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:16:16 +0000 Subject: [PATCH 5/6] docs: suggest jq for efficient JSON filtering in pre-step data fetching Agent-Logs-Url: https://github.com/github/gh-aw/sessions/9e2d3ec1-d10f-4c63-b6d5-a82bf271a007 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/aw/create-agentic-workflow.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/aw/create-agentic-workflow.md b/.github/aw/create-agentic-workflow.md index b9a14250103..7dec6475703 100644 --- a/.github/aw/create-agentic-workflow.md +++ b/.github/aw/create-agentic-workflow.md @@ -714,6 +714,7 @@ Use a deterministic `steps:` block to download, trim, and store heavy data befor - Write output to `/tmp/gh-aw/agent/` (canonical agent data directory). - Trim large blobs before writing (`tail -N`). - Add `permissions: actions: read` when reading workflow logs or artifacts. +- Use `jq` to filter JSON responses before writing them to disk — extract only the fields the agent needs and keep file sizes small. 
**Template (CI log analysis):** @@ -756,8 +757,9 @@ Check `/tmp/gh-aw/cache-memory/seen-runs.json` for previously seen run IDs; skip | Deployment logs (Heroku/Vercel/Railway) | `heroku logs --num 200 --app ${{ vars.HEROKU_APP }} > /tmp/gh-aw/agent/deploy-logs.txt` | | Build / test output | `npm ci 2>&1 \| tail -200 > /tmp/gh-aw/agent/build.txt && npm run test -- --reporter=json > /tmp/gh-aw/agent/test.json 2>&1 \|\| true` | | Workflow run artifact | `gh run download "$RUN_ID" --name test-results --dir /tmp/gh-aw/agent/artifacts/ \|\| true` | +| Filter JSON API response | `gh api repos/{owner}/{repo}/issues --jq '[.[] \| {number,title,state,labels:[.labels[].name]}]' > /tmp/gh-aw/agent/issues.json` | -**`cache-memory` tip:** Add `cache-memory: true` under `tools:` to persist pre-fetched data across runs. This enables deduplication (skip already-diagnosed run IDs), trending (compare metrics over time), and avoids redundant downloads on retries. The agent reads and writes `/tmp/gh-aw/cache-memory/`. See `.github/aw/memory.md` for full configuration options. +**`cache-memory` tip:** Add `cache-memory: true` under `tools:` to persist pre-fetched data across runs. This enables deduplication (skip already-diagnosed run IDs), trending (compare metrics over time), and avoids redundant downloads on retries. The agent reads and writes `/tmp/gh-aw/cache-memory/`. Use `jq` to update the dedup file efficiently — for example `jq '. + ["'"$RUN_ID"'"]' /tmp/gh-aw/cache-memory/seen-runs.json > /tmp/seen-runs.tmp && mv /tmp/seen-runs.tmp /tmp/gh-aw/cache-memory/seen-runs.json`. See `.github/aw/memory.md` for full configuration options. 
## Issue Form Mode: Step-by-Step Workflow Creation From 0b4ff6a7ba6d30f0f4b6fbd64baff357110630d0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:25:03 +0000 Subject: [PATCH 6/6] docs: mention tools.agentic-workflows for logs and audit in pre-step table Agent-Logs-Url: https://github.com/github/gh-aw/sessions/ad8bf1ae-1a34-4c0a-b6a2-f6a24784bdb2 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --- .github/aw/create-agentic-workflow.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/aw/create-agentic-workflow.md b/.github/aw/create-agentic-workflow.md index 7dec6475703..fbe47995556 100644 --- a/.github/aw/create-agentic-workflow.md +++ b/.github/aw/create-agentic-workflow.md @@ -758,6 +758,7 @@ Check `/tmp/gh-aw/cache-memory/seen-runs.json` for previously seen run IDs; skip | Build / test output | `npm ci 2>&1 \| tail -200 > /tmp/gh-aw/agent/build.txt && npm run test -- --reporter=json > /tmp/gh-aw/agent/test.json 2>&1 \|\| true` | | Workflow run artifact | `gh run download "$RUN_ID" --name test-results --dir /tmp/gh-aw/agent/artifacts/ \|\| true` | | Filter JSON API response | `gh api repos/{owner}/{repo}/issues --jq '[.[] \| {number,title,state,labels:[.labels[].name]}]' > /tmp/gh-aw/agent/issues.json` | +| Agentic workflow run logs | No shell step needed — add `tools: agentic-workflows:` and the agent uses `logs` and `audit` commands directly | **`cache-memory` tip:** Add `cache-memory: true` under `tools:` to persist pre-fetched data across runs. This enables deduplication (skip already-diagnosed run IDs), trending (compare metrics over time), and avoids redundant downloads on retries. The agent reads and writes `/tmp/gh-aw/cache-memory/`. Use `jq` to update the dedup file efficiently — for example `jq '. + ["'"$RUN_ID"'"]' /tmp/gh-aw/cache-memory/seen-runs.json > /tmp/seen-runs.tmp && mv /tmp/seen-runs.tmp /tmp/gh-aw/cache-memory/seen-runs.json`. 
See `.github/aw/memory.md` for full configuration options.