diff --git a/errors/caching-artifacts/cache-corrupt-from-cancelled-workflow-always-save.yml b/errors/caching-artifacts/cache-corrupt-from-cancelled-workflow-always-save.yml new file mode 100644 index 0000000..f2c5fde --- /dev/null +++ b/errors/caching-artifacts/cache-corrupt-from-cancelled-workflow-always-save.yml @@ -0,0 +1,105 @@ +id: caching-artifacts-138 +title: 'Workflow Cancellation During cache/restore Leaves Corrupt Cache — always() Save Poisons Future Runs' +category: caching-artifacts +severity: error +tags: + - actions-cache + - cache + - corrupt-cache + - workflow-cancel + - windows + - always + - cache-restore + - cache-save +patterns: + - regex: 'Error: The operation was canceled\.' + flags: 'i' + - regex: 'cache\.tzst.*force-local.*\nError: The operation was canceled' + flags: 'im' +error_messages: + - "Error: The operation was canceled." + - "\"C:\\Program Files\\Git\\usr\\bin\\tar.exe\" -xf D:/a/_temp/.../cache.tzst ... --use-compress-program \"zstd -d\"\nError: The operation was canceled." +root_cause: | + When a workflow is cancelled mid-run while actions/cache/restore is actively + extracting the cache archive, the extraction is interrupted at an arbitrary point. + The target directory on disk is left in a partially-extracted (corrupt) state — + some files are present, some are missing, and some may be truncated. + + If the workflow uses actions/cache/save with `if: always()` (a popular pattern to + ensure the cache is populated even when a job fails), the save step runs after the + cancellation because `always()` evaluates to true regardless of job status including + cancellation. The save step re-archives the now-partial directory and uploads it + under the original cache key, atomically overwriting the last known-good cache. 
+ + Subsequent workflow runs perform a cache hit on this key, restore the corrupted + archive, and experience seemingly unrelated build failures: missing source files, + truncated binaries, incomplete package store manifests, etc. The restore step reports + SUCCESS because the download/extract of the corrupt archive itself succeeds — the + corruption only manifests when the extracted files are actually used. + + This issue is most commonly observed on Windows runners where cache extraction via + tar/zstd is slower and workflow cancellations are more likely to land during active + extraction. Reported in actions/cache#1729 with a reproducible example from the + Agda project on Cabal/Haskell builds. +fix: | + Guard the cache/save step with a condition that prevents saving when the preceding + restore was interrupted or the job was cancelled. Use one of: + + 1. Prefer the composite `actions/cache@v5` action over split save/restore — the + composite action's save is a post action, which is NOT executed when a job is + cancelled (post actions only run on success/failure, not cancellation). + + 2. If using split save/restore, add an explicit `if:` condition on the save step + that checks both step outcome and job cancellation status. + + 3. If the cache key is already corrupted, delete it manually via GitHub UI + (Actions > Caches) or API before the next run. 
+fix_code: + - language: yaml + label: 'Preferred fix: Use composite actions/cache — post action skips on cancellation' + code: | + steps: + - name: Cache cabal store + uses: actions/cache@v5 # composite action — save post-step NOT run on cancel + with: + path: ~/.cabal + key: cabal-${{ runner.os }}-${{ hashFiles('cabal.project') }} + restore-keys: | + cabal-${{ runner.os }}- + + - name: Build + run: cabal build all + - language: yaml + label: 'Split save/restore: Guard cache/save against cancelled restore' + code: | + steps: + - name: Restore cache + id: cache-restore + uses: actions/cache/restore@v5 + with: + path: ~/.cabal + key: cabal-${{ runner.os }}-${{ hashFiles('cabal.project') }} + + - name: Build + run: cabal build all + + - name: Save cache + uses: actions/cache/save@v5 + # Only save when restore completed AND build succeeded + # Do NOT use if: always() — this runs even on cancellation and saves corrupt state + if: steps.cache-restore.outcome == 'success' && job.status == 'success' + with: + path: ~/.cabal + key: ${{ steps.cache-restore.outputs.cache-primary-key }} +prevention: + - 'Do not use `if: always()` on actions/cache/save steps — always() evaluates true even on job cancellation, allowing a corrupt partial-restore state to be saved.' + - 'Prefer the composite `actions/cache@v5` over separate save/restore when you do not need fine-grained control — its post action is cancellation-safe.' + - 'If a job is cancelled during restore and you suspect cache corruption, manually delete the affected cache key via GitHub UI (repo → Actions → Caches) before the next run.' + - 'Guard save with `job.status == ''success''` or inspect `steps.<step_id>.outcome` to ensure a complete restore before re-saving.' 
+docs: + - url: 'https://github.com/actions/cache/issues/1729' + label: 'actions/cache #1729 — Workflow cancellation during restore writes corrupt cache' + - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows' + label: 'GitHub Docs — Caching dependencies to speed up workflows' + - url: 'https://docs.github.com/en/actions/reference/context-reference#job-context' + label: 'GitHub Docs — job context (job.status)' diff --git a/errors/caching-artifacts/cache-ipv6-server-enotfound-parsed-as-hostname.yml b/errors/caching-artifacts/cache-ipv6-server-enotfound-parsed-as-hostname.yml new file mode 100644 index 0000000..208eb2f --- /dev/null +++ b/errors/caching-artifacts/cache-ipv6-server-enotfound-parsed-as-hostname.yml @@ -0,0 +1,84 @@ +id: caching-artifacts-139 +title: 'actions/cache Fails With ENOTFOUND on IPv6 Self-Hosted Cache Server — IPv6 Address Literal Parsed as Hostname' +category: caching-artifacts +severity: warning +tags: + - actions-cache + - ipv6 + - self-hosted-runner + - ENOTFOUND + - ACTIONS_CACHE_URL + - getaddrinfo + - cache-server +patterns: + - regex: 'getCacheEntry failed: getaddrinfo ENOTFOUND \[[\da-f:]+\]' + flags: 'i' + - regex: 'reserveCache failed: getaddrinfo ENOTFOUND \[[\da-f:]+\]' + flags: 'i' + - regex: 'Failed to (restore|save): (getCacheEntry|reserveCache) failed: getaddrinfo ENOTFOUND \[' + flags: 'i' +error_messages: + - "::warning::Failed to restore: getCacheEntry failed: getaddrinfo ENOTFOUND [2001:bc8:1d90:1fc1:dc00:ff:fe2b:3f97]" + - "::warning::Failed to save: reserveCache failed: getaddrinfo ENOTFOUND [2001:bc8:1d90:1fc1:dc00:ff:fe2b:3f97]" +root_cause: | + When a self-hosted runner is configured with an IPv6 cache server URL via the + ACTIONS_CACHE_URL environment variable (e.g. 
http://[2001:bc8:...]:8080/), the + Node.js HTTP client inside the @actions/cache toolkit package incorrectly strips + the surrounding square brackets from the IPv6 address literal when constructing + the hostname for DNS resolution. + + The resulting bare IPv6 address string (e.g. 2001:bc8:1d90:1fc1:dc00:ff:fe2b:3f97, + without brackets) is passed to getaddrinfo as if it were a DNS hostname. Since it + is not a valid hostname and not an IPv4 address, getaddrinfo returns ENOTFOUND, + causing every cache restore and save to fail. + + The failures surface as warnings (not errors) so the workflow continues to run — + but the cache is never used and never saved, silently eliminating any performance + benefit and causing repeated cold build times on self-hosted setups with IPv6 cache + infrastructure. + + Root fix merged in actions/toolkit PR #2298 (HTTP client IPv6 bracket handling). + The fix may not be deployed to all currently pinned versions of actions/cache. + Reported in actions/cache#1718. +fix: | + Configure the self-hosted cache server to listen on an IPv4 address or register a + DNS hostname that resolves to the cache server, and use that in ACTIONS_CACHE_URL + instead of a bare IPv6 literal. + + If the fix from actions/toolkit PR #2298 has been shipped in a new actions/cache + release, upgrading to that release will also resolve the issue. +fix_code: + - language: yaml + label: 'Use IPv4 or DNS hostname for ACTIONS_CACHE_URL instead of IPv6 literal' + code: | + # Set in runner environment, Docker Compose, or workflow env: block. 
+ # WRONG — IPv6 literal causes getaddrinfo ENOTFOUND: + # ACTIONS_CACHE_URL: http://[2001:bc8:1d90:1fc1:dc00:ff:fe2b:3f97]:8080/ + + # CORRECT — Use IPv4 address: + env: + ACTIONS_CACHE_URL: http://192.168.1.10:8080/ + + # OR use a DNS hostname that resolves to the cache server: + # ACTIONS_CACHE_URL: http://cache.internal.example.com:8080/ + - language: yaml + label: 'Test ACTIONS_CACHE_URL connectivity from the runner before deploying' + code: | + # Add a diagnostic step to verify cache server is reachable: + steps: + - name: Test cache server connectivity + run: | + echo "ACTIONS_CACHE_URL: $ACTIONS_CACHE_URL" + curl -s -o /dev/null -w "%{http_code}" "$ACTIONS_CACHE_URL" || echo "Cache server unreachable" +prevention: + - 'Configure self-hosted cache servers (e.g. actions-cache-server, Minio, Gitea cache) to listen on an IPv4 interface or a DNS hostname rather than a bare IPv6 literal.' + - 'Test ACTIONS_CACHE_URL reachability with curl from the runner before relying on cache steps in CI workflows.' + - 'Treat ::warning:: lines from actions/cache in workflow logs as actionable — cache failures silently degrade performance without failing the job.' + - 'Watch actions/cache and actions/toolkit releases for the IPv6 address bracket fix (actions/toolkit PR #2298) to ship in a stable release.' 
+docs: + - url: 'https://github.com/actions/cache/issues/1718' + label: 'actions/cache #1718 — Caching fails on IPv6 cache server' + - url: 'https://github.com/actions/toolkit/pull/2298' + label: 'actions/toolkit PR #2298 — Fix IPv6 address parsing in HTTP client' + - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners#communication-requirements' + label: 'GitHub Docs — Self-hosted runner communication requirements' diff --git a/errors/runner-environment/setup-python-version-not-found-ubuntu-26-container-self-hosted.yml b/errors/runner-environment/setup-python-version-not-found-ubuntu-26-container-self-hosted.yml new file mode 100644 index 0000000..23cee14 --- /dev/null +++ b/errors/runner-environment/setup-python-version-not-found-ubuntu-26-container-self-hosted.yml @@ -0,0 +1,86 @@ +id: runner-environment-479 +title: 'setup-python Fails to Find Python Version in Ubuntu 26.04 Container on Self-Hosted Runner' +category: runner-environment +severity: error +tags: + - setup-python + - ubuntu-26 + - container + - self-hosted-runner + - versions-manifest + - python-version-not-found +patterns: + - regex: 'The version .+ with architecture .x64. was not found for this operating system' + flags: 'i' + - regex: 'setup-python.*version.*not found.*container.*ubuntu' + flags: 'i' +error_messages: + - "Error: The version '3.13' with architecture 'x64' was not found for this operating system." +root_cause: | + When actions/setup-python runs inside a job container (e.g. `container: ubuntu:latest` + or `container: ubuntu:26.04`) on a SELF-HOSTED runner, it queries the versions manifest + to find pre-built Python binaries. On Ubuntu 26.04 (Noble+) containers, the action + cannot resolve a matching manifest entry and returns "version not found." + + The root cause is that the versions-manifest.json for actions/setup-python maps Python + distributions to specific Linux identifiers. 
Ubuntu 26.04 (or `ubuntu:latest` when it + resolves to 26.04) presents a different OS identifier than Ubuntu 24.04 or 22.04. The + manifest does not yet include a matching pre-compiled binary distribution for the new + Ubuntu 26.04 glibc/OS combination, so no download URL is found and the action errors out. + + This only affects SELF-HOSTED runners. GitHub-hosted `ubuntu-latest` runners work because + they bypass the manifest lookup for containers differently — the host runner provides + toolcache entries that containers can access, while self-hosted runners lack those entries. + + The runner's host OS (e.g. Amazon Linux) is irrelevant — it is the container OS that + matters for the Python binary lookup. + + Reported in setup-python#1309 (June 2026, 8 reactions). Downgrading to + `container: ubuntu:24.04` is the confirmed workaround. +fix: | + 1. Pin the container image to Ubuntu 24.04 (or 22.04) instead of `ubuntu:latest` or + `ubuntu:26.04` until the setup-python manifest includes Ubuntu 26.04 distributions. + 2. Use `python-version-file:` pointing to a .python-version or pyproject.toml to allow + setup-python to install from the system package manager as a fallback. + 3. Pre-install Python in the container image itself and set `update-environment: false` + to skip the manifest lookup entirely. 
+fix_code: + - language: yaml + label: 'Pin container to ubuntu:24.04 until setup-python manifest supports 26.04' + code: | + jobs: + build: + runs-on: [self-hosted] + container: ubuntu:24.04 # Pin to 24.04 — ubuntu:latest may resolve to 26.04 + steps: + - uses: actions/setup-python@v6 + with: + python-version: '3.13' + - language: yaml + label: 'Pre-install Python in a custom container image to bypass manifest lookup' + code: | + # Dockerfile + FROM ubuntu:26.04 + RUN apt-get update && apt-get install -y python3.13 python3.13-venv python3-pip + + # Workflow — set update-environment: false to use pre-installed Python + jobs: + build: + runs-on: [self-hosted] + container: myorg/ubuntu-26-python:latest # custom image with Python pre-installed + steps: + - uses: actions/setup-python@v6 + with: + python-version: '3.13' + update-environment: false # don't download — use system Python +prevention: + - 'Do not use `container: ubuntu:latest` on self-hosted runners when relying on setup-python — `ubuntu:latest` may resolve to a new major version (e.g. 26.04) for which pre-built Python binaries are not yet available in the versions manifest.' + - 'Pin container image tags to a specific version (ubuntu:24.04) rather than floating tags in production self-hosted workflows.' + - 'Watch setup-python releases and versions-manifest.json for Ubuntu 26.04 support to land before migrating self-hosted container workflows.' 
+docs: + - url: 'https://github.com/actions/setup-python/issues/1309' + label: 'setup-python #1309 — Failing to fetch version from manifest when using Ubuntu 26.04 container on self-hosted runner' + - url: 'https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md' + label: 'setup-python Advanced Usage — containers and self-hosted runners' + - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-where-your-workflow-runs/running-jobs-in-a-container' + label: 'GitHub Docs — Running jobs in a container' diff --git a/errors/silent-failures/reusable-workflow-environment-secrets-silently-empty-without-secrets-inherit.yml b/errors/silent-failures/reusable-workflow-environment-secrets-silently-empty-without-secrets-inherit.yml new file mode 100644 index 0000000..b121277 --- /dev/null +++ b/errors/silent-failures/reusable-workflow-environment-secrets-silently-empty-without-secrets-inherit.yml @@ -0,0 +1,106 @@ +id: sf-215 +title: 'Environment-Scoped Secrets Silently Resolve to Empty String in Reusable Workflow Without secrets: inherit' +category: silent-failures +severity: silent-failure +tags: + - reusable-workflow + - secrets + - environment + - secrets-inherit + - workflow-call + - silent-failure +patterns: + - regex: 'MY_SECRET length: 0' + flags: 'i' + - regex: 'EMPTY.*secret.*resolved|secret.*resolv.*empty.*reusable' + flags: 'i' +error_messages: + - "MY_SECRET length: 0" + - "EMPTY (secret resolved to empty string in reusable workflow)" +root_cause: | + The GitHub Actions documentation states that environment-scoped secrets can be + accessed inside a reusable (called) workflow simply by declaring the environment + on the called job: "If a called workflow needs to access environment secrets, + the environment must be defined in the called workflow." 
+ + In practice this documentation is incomplete: environment-scoped secrets resolve + to an empty string ("") in the called workflow UNLESS the CALLER explicitly adds + `secrets: inherit` to the job that calls it via `uses:`. Without `secrets: inherit`, only secrets + that are explicitly forwarded through a `secrets:` mapping block are passed to the + called workflow. + + Even though the called job declares `environment: my-env`, the environment's + secrets are NOT automatically injected from the caller's scope at runtime. The + called workflow's secrets context contains only what the caller explicitly passes. + No error is thrown — the step succeeds but the secret variable is silently empty, + causing dependent logic (auth, API calls, signing operations) to fail downstream + with unrelated-looking errors. + + Reported in detail with a minimal repro in actions/runner#4453. The GitHub docs + page for reusable workflows was cited but does not reflect the actual runtime behavior. +fix: | + Add `secrets: inherit` to the caller workflow's job that has the `uses:` key. This passes all + secrets that the caller can access (including environment-scoped secrets) down to + the called workflow. + + Alternatively, use explicit `secrets:` mapping to forward specific secrets from + the caller's scope to named secrets in the called workflow's `on.workflow_call.secrets` + block. 
+fix_code: + - language: yaml + label: 'Caller: Add secrets: inherit to forward environment-scoped secrets' + code: | + # .github/workflows/caller.yml + on: workflow_dispatch + jobs: + call-it: + uses: ./.github/workflows/reusable.yml + with: + target_environment: my-env + secrets: inherit # ← required for env-scoped secrets to resolve in called workflow + - language: yaml + label: 'Caller: Alternatively, explicitly forward individual secrets' + code: | + # .github/workflows/caller.yml + on: workflow_dispatch + jobs: + call-it: + uses: ./.github/workflows/reusable.yml + with: + target_environment: my-env + secrets: + MY_SECRET: ${{ secrets.MY_SECRET }} # explicit forwarding from caller scope + - language: yaml + label: 'Called workflow: Declare expected secrets in on.workflow_call.secrets' + code: | + # .github/workflows/reusable.yml + on: + workflow_call: + inputs: + target_environment: + required: true + type: string + secrets: + MY_SECRET: + required: true # declared — caller must forward via secrets: mapping + jobs: + worker: + runs-on: ubuntu-latest + environment: ${{ inputs.target_environment }} + steps: + - name: Use secret + env: + MY_SECRET: ${{ secrets.MY_SECRET }} + run: echo "Secret is populated — length ${#MY_SECRET}" +prevention: + - 'Always add `secrets: inherit` to caller workflows that invoke reusable workflows needing environment-scoped secrets.' + - 'If using explicit `secrets:` mapping, include all environment-scoped secrets the called workflow references, not just repository-level secrets.' + - 'Add a guard step in the called workflow to detect silently empty secrets early: `if [ -z "$MY_SECRET" ]; then echo "::error::MY_SECRET is empty — check caller secrets: inherit"; exit 1; fi`' + - 'Test reusable workflows from a fresh caller that does NOT already inherit secrets to verify the `secrets: inherit` path works end-to-end.' 
+docs: + - url: 'https://github.com/actions/runner/issues/4453' + label: 'actions/runner #4453 — Environment-scoped secrets unreachable from reusable workflow without secrets: inherit' + - url: 'https://docs.github.com/en/actions/sharing-automations/reusing-workflows#using-inputs-and-secrets-in-a-reusable-workflow' + label: 'GitHub Docs — Using inputs and secrets in a reusable workflow' + - url: 'https://docs.github.com/en/actions/reference/workflows-and-actions/workflow-syntax#jobsjob_idsecretsinherit' + label: 'GitHub Docs — jobs.<job_id>.secrets: inherit'