From 55c3c0a70dc6749718d995acd2e7e051bb41bc3d Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Thu, 28 May 2026 10:47:57 +0800 Subject: [PATCH] CI: use pinned PTO-ISA commit on first attempt; drop retry-with-pinned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry-with-pinned-PTO-ISA pattern was originally added so CI would self-heal when PTO-ISA's HEAD broke an AICPU launch path (canonical example: spmd_paged_attention's TestPagedAttentionUnrollTpushPop hits ACL_ERROR_RT_AICPU_EXCEPTION (507018) on HEAD PTO-ISA, then the pinned-retry passes). In practice the retry has become permanent masking — PTO-ISA HEAD breaks the same tests on every PR's first attempt, and every PR waits the extra cycle to merge. Use the pinned commit on the first attempt; rip out the retry block. What this trades: - Lose: visibility into "PTO-ISA HEAD regressed something." That signal was already noisy (most reds were the same chronic spmd/HCCL hang), and a dedicated periodic PTO-ISA HEAD smoke would be more honest than papering it into every PR's CI. - Gain: every PR gets a deterministic first attempt; ~5 min faster median, much faster when CI happens to roll a busy NPU slot. Applied to all four ST jobs: st-sim-a2a3, st-sim-a5, st-onboard-a2a3, st-onboard-a5. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4643f630a..7f2c4519c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -212,16 +212,8 @@ jobs: - name: Run pytest scene tests (a2a3sim) run: | - set +e - pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https --require-pto-isa - rc=$? - if [ $rc -ne 0 ]; then - echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" - pytest examples tests/st --platform a2a3sim --device 0-15 -v \ - --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https --require-pto-isa - rc=$? - fi - exit $rc + pytest examples tests/st --platform a2a3sim --device 0-15 -v \ + --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https --require-pto-isa # DFX per-feature smokes — the default pytest above passes no --enable-* # flag, so each capture pipeline gets its own invocation here. Kept as @@ -304,16 +296,8 @@ jobs: - name: Run pytest scene tests (a5sim) run: | - set +e - pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https --require-pto-isa - rc=$? - if [ $rc -ne 0 ]; then - echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" - pytest examples tests/st --platform a5sim --device 0-15 -v \ - --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https --require-pto-isa - rc=$? - fi - exit $rc + pytest examples tests/st --platform a5sim --device 0-15 -v \ + --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https --require-pto-isa # ---------- Profiling sub-flags smoke (compile + run) ---------- # PTO2_PROFILING / PTO2_ORCH_PROFILING / PTO2_SCHED_PROFILING / @@ -517,17 +501,9 @@ jobs: - name: Run pytest scene tests (a2a3) run: | - set +e source .venv/bin/activate - python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol ssh --require-pto-isa - rc=$? - if [ $rc -ne 0 ]; then - echo "pytest failed with rc=$rc; retrying with pinned PTO-ISA commit" - python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v \ - --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol ssh --require-pto-isa - rc=$? - fi - exit $rc + python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v \ + --pto-session-timeout 600 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol ssh --require-pto-isa # DFX per-feature smokes — hardware mirror of the st-sim-a2a3 set. The # race window in dep_gen only fires on real timing, so this row is @@ -665,4 +641,4 @@ jobs: source .venv/bin/activate DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol ssh --require-pto-isa" - task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -ne 0 ]; then echo \"pytest failed with rc=\$rc; retrying with pinned PTO-ISA commit\"; $PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol ssh; rc=\$?; fi; exit \$rc" + task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "$PYTEST --pto-session-timeout 1200 --pto-isa-commit ${{ env.PTO_ISA_COMMIT }}"