From 0e035b06e0286536e3ff3ca514afa6ce626d029c Mon Sep 17 00:00:00 2001 From: Markus Neusinger <2921697+MarkusNeusinger@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:46:03 +0200 Subject: [PATCH 1/2] fix(workflows): make impl pipeline resilient to transient Claude failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three resilience gaps were leaving PRs stuck after a single Claude Code Action hiccup: - impl-generate: only allowed 2 attempts before declaring `failed`. Bumped cap to 3, aligning with the existing "max retries exhausted (3 attempts)" label description and the repair phase's 3-attempt budget. - impl-repair: had no failure handler — if the repair workflow itself crashed, `ai-rejected` was already removed and re-review never fired, leaving the PR silently stuck. Added handler that restores the label and auto-retries once via a marker comment, then falls back to manual. - impl-review: both failure paths (Claude crash + score=0 from missing quality_score.txt) immediately surfaced `ai-review-failed`, requiring manual rerun. Both now auto-retry once via repository_dispatch with a shared marker comment before giving up. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/impl-generate.yml | 22 +++++---- .github/workflows/impl-repair.yml | 53 ++++++++++++++++++++ .github/workflows/impl-review.yml | 76 ++++++++++++++++++++++------- 3 files changed, 124 insertions(+), 27 deletions(-) diff --git a/.github/workflows/impl-generate.yml b/.github/workflows/impl-generate.yml index 2041f48f99..ff64407a02 100644 --- a/.github/workflows/impl-generate.yml +++ b/.github/workflows/impl-generate.yml @@ -762,9 +762,13 @@ jobs: echo "::notice::Previous failures for ${LIBRARY}/${SPEC_ID}: $FAILURE_COUNT" - # After 1 previous failure (= this is attempt 2) → mark as failed - if [ "$FAILURE_COUNT" -ge 1 ]; then - echo "::warning::Marking $LIBRARY as failed after 2 generation attempts" + # FAILURE_COUNT counts marker comments BEFORE this run. + # 0 → this is attempt 1 fail, 1 → attempt 2 fail, 2 → attempt 3 fail. + ATTEMPT=$((FAILURE_COUNT + 1)) + + # After 2 previous failures (= this is attempt 3) → mark as failed + if [ "$FAILURE_COUNT" -ge 2 ]; then + echo "::warning::Marking $LIBRARY as failed after 3 generation attempts" # Create failed label if needed gh label create "impl:${LIBRARY}:failed" --color "d73a4a" \ @@ -777,9 +781,9 @@ jobs: # Post final failure comment with marker gh issue comment "$ISSUE" --body "${MARKER} - ## :x: ${LIBRARY} Failed (Attempt 2/2) + ## :x: ${LIBRARY} Failed (Attempt 3/3) - The **${LIBRARY}** implementation for \`${SPEC_ID}\` failed after 2 attempts. + The **${LIBRARY}** implementation for \`${SPEC_ID}\` failed after 3 attempts. **Reason:** Claude Code failed to create the implementation file. 
@@ -792,11 +796,11 @@ jobs: :robot: *[impl-generate](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" else - # First failure → post comment with marker and auto-retry + # Attempt 1 or 2 failed → post comment with marker and auto-retry gh issue comment "$ISSUE" --body "${MARKER} - ## :warning: ${LIBRARY} Generation Failed (Attempt 1/2) + ## :warning: ${LIBRARY} Generation Failed (Attempt ${ATTEMPT}/3) - First attempt failed. Automatically retrying... + Attempt ${ATTEMPT} failed. Automatically retrying... --- :robot: *[impl-generate](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" @@ -810,5 +814,5 @@ jobs: -f library="${LIBRARY}" \ -f issue_number="${ISSUE}" - echo "::notice::Triggered automatic retry for ${LIBRARY}/${SPEC_ID}" + echo "::notice::Triggered automatic retry for ${LIBRARY}/${SPEC_ID} (attempt $((ATTEMPT + 1)))" fi diff --git a/.github/workflows/impl-repair.yml b/.github/workflows/impl-repair.yml index 280a8e821a..d99e14a4e8 100644 --- a/.github/workflows/impl-repair.yml +++ b/.github/workflows/impl-repair.yml @@ -239,3 +239,56 @@ jobs: -f event_type=review-pr \ -f "client_payload[pr_number]=$PR_NUM" echo "::notice::Triggered impl-review.yml via repository_dispatch for PR #$PR_NUM" + + # ======================================================================== + # Failure handling: when the repair workflow itself crashes (e.g. Claude + # Code Action transient failure), restore the ai-rejected label and + # auto-retry once. After one auto-retry, fall back to manual. 
+ # ======================================================================== + - name: Handle repair failure + if: failure() + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUM: ${{ inputs.pr_number }} + SPEC_ID: ${{ inputs.specification_id }} + LIBRARY: ${{ inputs.library }} + ATTEMPT: ${{ inputs.attempt }} + run: | + # Restore ai-rejected label that was removed at start of repair so the + # PR state stays consistent (otherwise it looks "stuck approved"). + gh pr edit "$PR_NUM" --add-label "ai-rejected" 2>/dev/null || true + + MARKER="" + RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ + --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") + + if [ "$RETRY_COUNT" -ge 1 ]; then + echo "::error::Repair attempt ${ATTEMPT} crashed twice — giving up" + gh pr comment "$PR_NUM" --body "${MARKER} + ## :x: Repair Workflow Crashed (Attempt ${ATTEMPT}/3, retry exhausted) + + The repair workflow itself failed twice for this attempt — likely a persistent Claude Code Action issue. + + **Manual restart:** + \`\`\` + gh workflow run impl-repair.yml -f pr_number=${PR_NUM} -f specification_id=${SPEC_ID} -f library=${LIBRARY} -f attempt=${ATTEMPT} + \`\`\` + + --- + :robot: *[impl-repair](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" + else + echo "::warning::Repair attempt ${ATTEMPT} crashed — auto-retrying once" + gh pr comment "$PR_NUM" --body "${MARKER} + ## :wrench: Repair Workflow Crashed (Attempt ${ATTEMPT}/3) — Auto-Retrying + + The repair workflow failed (probably a transient Claude Code Action issue). Automatically re-triggering this attempt... 
+ + --- + :robot: *[impl-repair](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" + + gh workflow run impl-repair.yml \ + -f pr_number="$PR_NUM" \ + -f specification_id="$SPEC_ID" \ + -f library="$LIBRARY" \ + -f attempt="$ATTEMPT" + fi diff --git a/.github/workflows/impl-review.yml b/.github/workflows/impl-review.yml index d365781bb7..efe6718bb2 100644 --- a/.github/workflows/impl-review.yml +++ b/.github/workflows/impl-review.yml @@ -172,34 +172,50 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUM: ${{ steps.pr.outputs.pr_number }} + SPEC_ID: ${{ steps.pr.outputs.specification_id }} + LIBRARY: ${{ steps.pr.outputs.library }} run: | - echo "::error::AI Review did not produce valid output files" - echo "::error::Expected quality_score.txt but got score=0" - echo "::error::This indicates Claude Code Action ran but didn't complete the review task" + echo "::error::AI Review did not produce valid output files (score=0)" - # Add ai-review-failed label so it's visible - gh pr edit "$PR_NUM" --add-label "ai-review-failed" 2>/dev/null || true + MARKER="" + RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ + --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") - # Post error comment on PR - gh pr comment "$PR_NUM" --body "## :x: AI Review Failed + if [ "$RETRY_COUNT" -ge 1 ]; then + # Already auto-retried once → final fail, require manual rerun + gh pr edit "$PR_NUM" --add-label "ai-review-failed" 2>/dev/null || true + gh pr comment "$PR_NUM" --body "${MARKER} + ## :x: AI Review Failed (auto-retry exhausted) - The AI review action completed but did not produce valid output files. + The AI review action completed but did not produce valid output files. Auto-retry already tried once. 
**What happened:** - The Claude Code Action ran - No \`quality_score.txt\` file was created - - No review data was extracted - **Action required:** - Re-run the impl-review workflow manually: + **Manual rerun:** \`\`\` gh workflow run impl-review.yml -f pr_number=$PR_NUM \`\`\` + --- + :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" + exit 1 + fi + + # First failure — post marker and auto-retry via repository_dispatch + gh pr comment "$PR_NUM" --body "${MARKER} + ## :wrench: AI Review Produced No Score — Auto-Retrying + + The Claude Code Action ran but didn't write \`quality_score.txt\`. Auto-retrying review once... + --- :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" - exit 1 + gh api repos/${{ github.repository }}/dispatches \ + -f event_type=review-pr \ + -f "client_payload[pr_number]=$PR_NUM" + echo "::notice::Auto-re-triggered impl-review.yml for PR #$PR_NUM" - name: Add quality score label if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0' @@ -448,19 +464,43 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUM: ${{ steps.pr.outputs.pr_number }} + SPEC_ID: ${{ steps.pr.outputs.specification_id }} + LIBRARY: ${{ steps.pr.outputs.library }} run: | - gh pr edit "$PR_NUM" --add-label "ai-review-failed" - gh pr comment "$PR_NUM" --body "## :warning: AI Review Failed + MARKER="" + RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ + --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") - The AI review action failed or timed out. + if [ "$RETRY_COUNT" -ge 1 ]; then + gh pr edit "$PR_NUM" --add-label "ai-review-failed" 2>/dev/null || true + gh pr comment "$PR_NUM" --body "${MARKER} + ## :x: AI Review Failed (auto-retry exhausted) - **Options:** - 1. Re-run the workflow manually - 2. 
Request manual human review + The AI review action failed or timed out twice in a row. + + **Manual rerun:** + \`\`\` + gh workflow run impl-review.yml -f pr_number=$PR_NUM + \`\`\` + + --- + :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" + exit 0 + fi + + gh pr comment "$PR_NUM" --body "${MARKER} + ## :wrench: AI Review Crashed — Auto-Retrying + + The Claude Code Action failed or timed out. Auto-retrying review once... --- :robot: *[impl-review](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})*" + gh api repos/${{ github.repository }}/dispatches \ + -f event_type=review-pr \ + -f "client_payload[pr_number]=$PR_NUM" + echo "::notice::Auto-re-triggered impl-review.yml for PR #$PR_NUM" + - name: Add verdict label and take action if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0' env: From 12cd72ca0bf5685647640c306472054ba6e766b5 Mon Sep 17 00:00:00 2001 From: Markus Neusinger <2921697+MarkusNeusinger@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:57:35 +0200 Subject: [PATCH 2/2] fix(workflows): paginate marker counts and exit non-zero after auto-retry Address Copilot review on #5410: - Marker counts (impl-generate / impl-repair / impl-review) used the default `gh api .../comments` page size of 30. On long-lived issues/PRs the marker could fall off the first page, causing repeated "first" auto-retries. Switched to `gh api --paginate` with a `?per_page=100` query parameter so the count is reliable. - impl-review's auto-retry paths now `exit 1` after dispatching the retry, so the run status accurately reflects that no verdict was produced and any dispatch failure stays visible. The auto-retry runs in a separate workflow run, so this doesn't break the recovery chain. - Also paginate the failure-marker count in impl-generate (same bug class, not flagged by Copilot but same fix). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/impl-generate.yml | 8 +++++--- .github/workflows/impl-repair.yml | 5 +++-- .github/workflows/impl-review.yml | 16 ++++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/workflows/impl-generate.yml b/.github/workflows/impl-generate.yml index ff64407a02..a83f03b036 100644 --- a/.github/workflows/impl-generate.yml +++ b/.github/workflows/impl-generate.yml @@ -755,10 +755,12 @@ jobs: run: | echo "::notice::Handling generation failure for $LIBRARY/$SPEC_ID" - # Count previous failures via hidden marker comments (more reliable than workflow runs) + # Count previous failures via hidden marker comments (more reliable than workflow runs). + # Paginate so the marker is found even on issues with >30 comments (common: all 9 library + # impls land on the same issue). --jq runs once per page, so awk sums the per-page counts. MARKER="" - FAILURE_COUNT=$(gh api "repos/${{ github.repository }}/issues/${ISSUE}/comments" \ - --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") + FAILURE_COUNT=$(gh api --paginate "repos/${{ github.repository }}/issues/${ISSUE}/comments?per_page=100" \ + --jq "[.[] | select(.body != null and (.body | contains(\"$MARKER\")))] | length" 2>/dev/null | awk '{n += $1} END {print n + 0}' || true) echo "::notice::Previous failures for ${LIBRARY}/${SPEC_ID}: $FAILURE_COUNT" diff --git a/.github/workflows/impl-repair.yml b/.github/workflows/impl-repair.yml index d99e14a4e8..0076c12634 100644 --- a/.github/workflows/impl-repair.yml +++ b/.github/workflows/impl-repair.yml @@ -259,8 +259,9 @@ jobs: gh pr edit "$PR_NUM" --add-label "ai-rejected" 2>/dev/null || true MARKER="" - RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ - --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") + # Paginate so every comment page is scanned; --jq runs once per page, so awk sums the counts. 
+ RETRY_COUNT=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUM}/comments?per_page=100" \ + --jq "[.[] | select(.body != null and (.body | contains(\"$MARKER\")))] | length" 2>/dev/null | awk '{n += $1} END {print n + 0}' || true) if [ "$RETRY_COUNT" -ge 1 ]; then echo "::error::Repair attempt ${ATTEMPT} crashed twice — giving up" diff --git a/.github/workflows/impl-review.yml b/.github/workflows/impl-review.yml index efe6718bb2..0665774ff6 100644 --- a/.github/workflows/impl-review.yml +++ b/.github/workflows/impl-review.yml @@ -178,8 +178,9 @@ jobs: echo "::error::AI Review did not produce valid output files (score=0)" MARKER="" - RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ - --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") + # Paginate so every comment page is scanned; --jq runs once per page, so awk sums the counts. + RETRY_COUNT=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUM}/comments?per_page=100" \ + --jq "[.[] | select(.body != null and (.body | contains(\"$MARKER\")))] | length" 2>/dev/null | awk '{n += $1} END {print n + 0}' || true) if [ "$RETRY_COUNT" -ge 1 ]; then # Already auto-retried once → final fail, require manual rerun @@ -216,6 +217,9 @@ jobs: -f event_type=review-pr \ -f "client_payload[pr_number]=$PR_NUM" echo "::notice::Auto-re-triggered impl-review.yml for PR #$PR_NUM" + # Mark this run as failed so the run status reflects that no verdict + # was produced. The auto-retry runs in a separate workflow run. + exit 1 - name: Add quality score label if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0' @@ -468,8 +472,9 @@ jobs: LIBRARY: ${{ steps.pr.outputs.library }} run: | MARKER="" - RETRY_COUNT=$(gh api "repos/${{ github.repository }}/issues/${PR_NUM}/comments" \ - --jq "[.[] | select(.body | contains(\"$MARKER\"))] | length" 2>/dev/null || echo "0") + # Paginate so every comment page is scanned; --jq runs once per page, so awk sums the counts. 
+ RETRY_COUNT=$(gh api --paginate "repos/${{ github.repository }}/issues/${PR_NUM}/comments?per_page=100" \ + --jq "[.[] | select(.body != null and (.body | contains(\"$MARKER\")))] | length" 2>/dev/null | awk '{n += $1} END {print n + 0}' || true) if [ "$RETRY_COUNT" -ge 1 ]; then gh pr edit "$PR_NUM" --add-label "ai-review-failed" 2>/dev/null || true @@ -500,6 +505,9 @@ jobs: -f event_type=review-pr \ -f "client_payload[pr_number]=$PR_NUM" echo "::notice::Auto-re-triggered impl-review.yml for PR #$PR_NUM" + # Mark this run as failed so the run status reflects that no verdict + # was produced. The auto-retry runs in a separate workflow run. + exit 1 - name: Add verdict label and take action if: steps.review.conclusion == 'success' && steps.score.outputs.score != '0'