From 80e90f1e5df0785e9e7363b29af754d152a187b7 Mon Sep 17 00:00:00 2001 From: Dmitry Ilyin <6576495+widgetii@users.noreply.github.com> Date: Sat, 23 May 2026 20:43:39 +0300 Subject: [PATCH 1/2] contrib/openipc-bisect: stability fixes uncovered by first end-to-end run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four bugs surfaced when running the first real convergence loop on openipc-hi3520dv200.dlab.torturelabs.com (4-build window) the morning after PR #2117 landed. None of them would have been caught by the jq-against-static-manifest dry-runs done at PR time; they only emerge under real flash+reboot cycles. Fixes: 1. **`status` had a jq syntax error.** `(log/log(2)) | floor + 1` — jq has no `log` function (that's Python's math module). Status crashed at the JSON-construction step. Fix: compute ceil(log2(window_size)) in awk before invoking jq and pass via --argjson. 2. **`pick_next` returned "" when 1 unverified candidate remained.** Threshold was `<= 1` instead of `== 0`. A real bisect with the wrong verdict cadence would terminate early and miss the last build that needed testing. Threshold corrected to `== 0`; with 1 unverified the index math `length / 2 | floor` correctly returns 0, selecting the lone unverified build. 3. **SSH lacked ServerAliveInterval / ServerAliveCountMax.** When sysupgrade reboots the camera, dropbear is killed without a graceful TCP close. The host's `ssh root@$host "sysupgrade ..."` in remote_flash() then sat on a zombie TCP connection until kernel keepalive (~2 hours) — `iterate()` never reached `wait_for_camera`. Added `-o ServerAliveInterval=15 -o ServerAliveCountMax=3` to the default SSH_OPTS so the host detects the dead session in ~45s and the iteration progresses normally. 4. 
**`start <host>` rejected `root@host`.** The contract was bare hostname (the script always SSHes as root), but the form everyone reaches for in OpenIPC docs — including the wiki article shipped alongside the original PR — is `root@host`. Now strips a leading `user@` prefix in cmd_start before everything downstream. End-to-end test that found these (2026-05-23 on openipc-hi3520dv200.dlab.torturelabs.com, 4-build window): * start picked nightly-20260522-7d32f00 (median) → camera reboot → UART noise interrupted u-boot autoboot → camera stuck at u-boot prompt → host process killed manually → user recovered camera via UART. State file on host stayed intact across the brick. After recovery, `openipc-bisect resume` correctly re-attached and prompted for verdict — exactly the brick-survivability promise. * `good` verdict narrowed window to a single element and printed "Bisect complete. First bad build: nightly-20260523-7a2c1b3". After these fixes the next end-to-end run (5+ builds in manifest) should be hands-off. 
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 contrib/openipc-bisect | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/contrib/openipc-bisect b/contrib/openipc-bisect
index 9e67d9674..3d12e405c 100755
--- a/contrib/openipc-bisect
+++ b/contrib/openipc-bisect
@@ -27,7 +27,7 @@ PROG=$(basename "$0")
 MANIFEST_URL=${OPENIPC_MANIFEST_URL:-https://openipc.github.io/firmware/manifest.json}
 STATE_DIR=${OPENIPC_BISECT_STATE:-${XDG_STATE_HOME:-$HOME/.local/state}/openipc/bisect}
 WAIT_BUDGET=${OPENIPC_BISECT_WAIT:-300} # seconds to wait for a camera to come back
-SSH_OPTS=${OPENIPC_SSH_OPTS:--o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new}
+SSH_OPTS=${OPENIPC_SSH_OPTS:--o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new -o ServerAliveInterval=15 -o ServerAliveCountMax=3}

 die() { printf '%s: %s\n' "$PROG" "$*" >&2; exit 1; }
 info() { printf '%s\n' "$*" >&2; }
@@ -146,7 +146,7 @@ pick_next() {
         .window as $w |
         (.verdicts // {}) as $v |
         ($w | map(select($v[.] == null))) as $unverified |
-        if ($unverified | length) <= 1 then "" else
+        if ($unverified | length) == 0 then "" else
             $unverified[($unverified | length / 2 | floor)]
         end
     '
@@ -261,6 +261,10 @@ iterate() {

 cmd_start() {
     host=$1; shift
+    # Tolerate user@host (the form most OpenIPC docs use, e.g. root@cam).
+    # The script always SSHes as root because that's the only user on these
+    # cameras; the user@ prefix is a doc convention, not a real choice.
+    host=${host#*@}
     good_ref=""; bad_ref=""; platform=""
     while [ $# -gt 0 ]; do
         case "$1" in
@@ -349,12 +353,15 @@ cmd_verdict() {
 cmd_status() {
     host=$(resolve_host)
     state=$(load_state "$host")
-    printf '%s\n' "$state" | jq '
+    wn=$(printf '%s' "$state" | jq -r '.window | length')
+    # jq has no log() — compute ceil(log2(N)) in awk by repeated ceil-halving; int() is required because awk division is floating-point.
+    rounds=$(awk -v n="$wn" 'BEGIN { if (n>1) { r=0; x=n; while (x>1) { r++; x=int((x+1)/2) } print r } else print 0 }')
+    printf '%s\n' "$state" | jq --argjson rl "$rounds" '
     {
         host, platform, good, bad, current,
         window_size: (.window | length),
-        verdicts,
-        est_rounds_left: ((.window | length) | (if . > 1 then (log/log(2)) | floor + 1 else 0 end))
+        est_rounds_left: $rl,
+        verdicts
     }'
 }

From 230902a88af04b651334067b95dddfca5fb6d13b Mon Sep 17 00:00:00 2001
From: Dmitry Ilyin <6576495+widgetii@users.noreply.github.com>
Date: Sat, 23 May 2026 21:12:56 +0300
Subject: [PATCH 2/2] ci/manifest: retry transient gh API failures in enrich_manifest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GitHub Actions had a flaky API/token plane today (2026-05-23): two
manifest workflow runs against the same upstream build (run 26331664183,
commit 7a2c1b3) failed with HTTP 401 "Bad credentials" on
`gh release view`, while a third run between them succeeded — same
script, same permissions block. Pure flake.

Add a 4-attempt retry budget (delays 0, 5, 15, 40 seconds) around the
`gh()` wrapper. Total wait ≤60 s on the worst case, which is small
compared to the disruption of having to manually re-dispatch
manifest.yml whenever GH wobbles.

Discrimination:
- "release not found" / 404 → fail FAST (one attempt, ~0.7s). These are
  permanent and re-trying just wastes CI time.
- Everything else (401, 5xx, network) → retry with backoff.

Each attempt logs to stderr so the action log shows the retry trail.
The script's caller (`manifest.yml`) is unchanged. The happy path still
resolves the live 4-build manifest in <1s with no retries fired.

Note: this lands in PR #2129 alongside the openipc-bisect fixes because
the user reported both as part of one "transient CI flake" debugging
session. The two changes are unrelated in code but share the same root
cause class: real bugs only visible after the design runs against
actual production load.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/scripts/enrich_manifest.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/enrich_manifest.py b/.github/scripts/enrich_manifest.py
index 2508f8f9b..a4a556a50 100644
--- a/.github/scripts/enrich_manifest.py
+++ b/.github/scripts/enrich_manifest.py
@@ -22,6 +22,7 @@
 import re
 import subprocess
 import sys
+import time
 from pathlib import Path

 REPO = os.environ.get("GITHUB_REPOSITORY", "OpenIPC/firmware")
@@ -29,13 +30,38 @@
 TAG_RE = re.compile(r"^nightly-(\d{8})-([0-9a-f]{7})$")
 ASSET_RE = re.compile(r"^openipc\.([^.]+)-(nor|nand)-(lite|ultimate|neo)\.tgz$")

+# Retry budget for transient GitHub API failures (HTTP 401 Bad credentials,
+# 5xx, rate-limit) observed on workflow_run-triggered runs 2026-05-23.
+GH_RETRY_DELAYS = (0, 5, 15, 40)  # one entry per attempt: seconds slept before that attempt
+

 def gh(*args: str) -> str:
     # Always pass --repo so we don't depend on a .git in cwd
     # (the workflow runs the script from a path without .git).
-    return subprocess.check_output(
-        ["gh", *args, "--repo", REPO], text=True
+    # Retry on transient failures (the GitHub API/token plane has flaky days);
+    # surface to stderr so failures are visible in the action log.
+    cmd = ["gh", *args, "--repo", REPO]
+    last_exc = None
+    for delay in GH_RETRY_DELAYS:
+        if delay > 0:
+            time.sleep(delay)
+        try:
+            return subprocess.check_output(cmd, text=True, stderr=subprocess.PIPE)
+        except subprocess.CalledProcessError as e:
+            last_exc = e
+            err = e.stderr or ""
+            # Permanent failures: skip retries. Compare lowercased; gh capitalizes 404s ("Not Found (HTTP 404)").
+            if "release not found" in err.lower() or "http 404" in err.lower():
+                break
+            sys.stderr.write(
+                f"gh {' '.join(args[:3])}: attempt failed "
+                f"(rc={e.returncode}): {err.strip()[:240]}\n"
+            )
+    sys.stderr.write(
+        f"gh {' '.join(args[:3])}: giving up; "
+        f"final stderr:\n{last_exc.stderr}\n"
     )
+    raise last_exc


 def list_dated_releases() -> list[dict]: