From 2a25c39ad82ba7d9cbe4db122faccf244f09866c Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 31 May 2026 20:01:23 +0200 Subject: [PATCH] test(preflight-audit): replay-mode eval fixture exercises every classifier rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #418 shipped the preflight-audit CLI with a replay mode but no fixture exercising the full classifier end-to-end. This adds: - `tests/fixtures/synthetic_workspace_sweep.json` — 12-issue GraphQL response, one issue per rule path (Rule 1 dispatch, Rule 1-yields-then-Rule-7, Rule 2 dispatch-urgent, Rules 3-7 skip-noop, GitHub-App bot login, personal-bot needing override, fall-through dispatch, recently-closed dispatch). Each issue node carries a `_purpose` annotation documenting which rule it should land on. - `tests/test_eval_replay.py` — drives `classify_response` against the fixture with a pinned `now` (2026-06-01T12:00:00Z) and asserts: 1. The full per-decision bucket distribution (positional identifiers per bucket). 2. The same distribution under `extra_bot_logins` — one issue migrates from dispatch to skip-noop with the override. 3. Per-issue assertions with reason-substring matches, keeping the fixture's `_purpose` annotations in lock-step with the classifier behaviour. 4. A skip-rate floor (≥30%) matching the real-world target after #416's rule tuning. A rule change that alters the distribution fails one of the asserts; the diff in the failing assertion tells the reviewer how the rule affects coverage before they ever look at real adopter data. The eval is deterministic (no live `gh` calls, fixed `now`) so CI runs it in milliseconds. This closes the tune-then-verify loop one more rung up — PR #416 used a one-off `/tmp/` script, PR #418 promoted it to a CLI, and this PR locks the rule behaviour into the test suite. 
--- .../fixtures/synthetic_workspace_sweep.json | 268 ++++++++++++++++++ .../preflight-audit/tests/test_eval_replay.py | 151 ++++++++++ 2 files changed, 419 insertions(+) create mode 100644 tools/preflight-audit/tests/fixtures/synthetic_workspace_sweep.json create mode 100644 tools/preflight-audit/tests/test_eval_replay.py diff --git a/tools/preflight-audit/tests/fixtures/synthetic_workspace_sweep.json b/tools/preflight-audit/tests/fixtures/synthetic_workspace_sweep.json new file mode 100644 index 00000000..9b703a5a --- /dev/null +++ b/tools/preflight-audit/tests/fixtures/synthetic_workspace_sweep.json @@ -0,0 +1,268 @@ +{ + "_comment": "Synthetic GraphQL response that exercises every classifier rule. Each issue is purpose-built to land in a specific bucket so the eval test can assert the full distribution. `now` for the test is 2026-06-01T12:00:00Z; every timestamp below is relative to that. Renaming this file or shifting `now` breaks the test on purpose — both pieces are documented as in lock-step in `test_eval_replay.py`.", + "data": { + "repository": { + "i100": { + "_purpose": "Rule 1 dispatch — recent human activity within 7d.", + "number": 100, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-30T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "reporter-jane"}, + "createdAt": "2026-05-30T08:00:00Z", + "body": "I'd like to discuss this further.\n" + } + ] + } + }, + "i101": { + "_purpose": "Rule 1 yields (skill-drove-update) → Rule 7 fires (skip-noop, fix released awaiting advisory).", + "number": 101, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-30T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"}, + {"name": "fix released"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": "2026-05-30T08:00:00Z", + "body": "\nRollup entry body.\n" + } + ] + } + }, + "i102": { + "_purpose": "Rule 2 
dispatch-urgent — non-skill comment <24h ago, updatedAt is older (>7d) so Rule 1 doesn't catch first.", + "number": 102, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-10T08:00:00Z", + "labels": { + "nodes": [ + {"name": "needs triage"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "reporter"}, + "createdAt": "2026-06-01T06:00:00Z", + "body": "Bumping — has anyone looked?\n" + } + ] + } + }, + "i103": { + "_purpose": "Rule 3 skip-noop — closed >30d ago with announced label.", + "number": 103, + "state": "CLOSED", + "closedAt": "2026-04-15T10:00:00Z", + "updatedAt": "2026-04-15T10:00:00Z", + "labels": { + "nodes": [ + {"name": "announced"}, + {"name": "cve allocated"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": "2026-04-15T10:00:00Z", + "body": "\nClosed.\n" + } + ] + } + }, + "i104": { + "_purpose": "Rule 4 skip-noop — closed >90d ago with no announced label (stale invalid/dup).", + "number": 104, + "state": "CLOSED", + "closedAt": "2025-12-01T10:00:00Z", + "updatedAt": "2025-12-01T10:00:00Z", + "labels": { + "nodes": [ + {"name": "invalid"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "reporter"}, + "createdAt": "2025-11-30T10:00:00Z", + "body": "Acknowledging.\n" + } + ] + } + }, + "i105": { + "_purpose": "Rule 5 skip-noop — open with full lifecycle labels + skill-last.", + "number": 105, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-25T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"}, + {"name": "pr merged"}, + {"name": "announced"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": "2026-05-25T08:00:00Z", + "body": "\nLast sync entry.\n" + } + ] + } + }, + "i106": { + "_purpose": "Rule 6 skip-noop — open with cve+pr+skill-last, awaiting release.", + "number": 106, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-20T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve 
allocated"}, + {"name": "pr merged"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": "2026-05-20T08:00:00Z", + "body": "\nAwaiting release.\n" + } + ] + } + }, + "i107": { + "_purpose": "Rule 7 skip-noop — open with cve+fix-released+skill-last, awaiting advisory.", + "number": 107, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-18T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"}, + {"name": "fix released"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": "2026-05-18T08:00:00Z", + "body": "\nAwaiting advisory.\n" + } + ] + } + }, + "i108": { + "_purpose": "GitHub-App bot login → skill-or-bot detection via `[bot]` suffix; lands in Rule 7 skip-noop.", + "number": 108, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-15T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"}, + {"name": "fix released"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "github-actions[bot]"}, + "createdAt": "2026-05-15T08:00:00Z", + "body": "CI run completed.\n" + } + ] + } + }, + "i109": { + "_purpose": "Personal-account bot — by default this dispatches (not detected), with --bot-logins it skips via Rule 7.", + "number": 109, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-05-10T08:00:00Z", + "labels": { + "nodes": [ + {"name": "cve allocated"}, + {"name": "fix released"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "company-private-bot"}, + "createdAt": "2026-05-10T08:00:00Z", + "body": "No skill marker here.\n" + } + ] + } + }, + "i110": { + "_purpose": "Fall-through dispatch — no rule matches (open, lifecycle incomplete, skill-last but only single label).", + "number": 110, + "state": "OPEN", + "closedAt": null, + "updatedAt": "2026-04-01T08:00:00Z", + "labels": { + "nodes": [ + {"name": "needs triage"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "potiuk"}, + "createdAt": 
"2026-04-01T08:00:00Z", + "body": "\nOrphan rollup.\n" + } + ] + } + }, + "i111": { + "_purpose": "Closed recently (<30d), announced label — dispatches under Rule 1 (recent updatedAt without skill-drove-update; closed 5d ago).", + "number": 111, + "state": "CLOSED", + "closedAt": "2026-05-27T10:00:00Z", + "updatedAt": "2026-05-27T10:00:00Z", + "labels": { + "nodes": [ + {"name": "announced"}, + {"name": "cve allocated"} + ] + }, + "comments": { + "nodes": [ + { + "author": {"login": "rm-shahar"}, + "createdAt": "2026-05-27T10:00:00Z", + "body": "Released as part of v3.3.0.\n" + } + ] + } + } + } + } +} diff --git a/tools/preflight-audit/tests/test_eval_replay.py b/tools/preflight-audit/tests/test_eval_replay.py new file mode 100644 index 00000000..412490e0 --- /dev/null +++ b/tools/preflight-audit/tests/test_eval_replay.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Replay-mode eval — drives `classify_response` end-to-end against a +canned GraphQL response fixture, asserts the full per-issue +classification AND the bucket distribution. 
+ +The fixture +(`fixtures/synthetic_workspace_sweep.json`) is purpose-built to +exercise every rule in `bulk-mode.md` § Pre-flight no-op +classifier — one issue per rule path, plus a fall-through and a +bot-login-detection case. Each issue's `_purpose` field documents +which rule it should land on. + +This is the eval-fixture pattern the README points at: a rule +change that alters the distribution will fail one of the asserts +below; the diff in the failing assertion tells the reviewer how +the rule affects coverage before they look at any real adopter +data. The eval is **deterministic** — every timestamp is relative +to a pinned `now` value below. +""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from pathlib import Path + +import pytest + +from preflight_audit.classifier import Decision, classify_response + +# Pinned `now` — every fixture timestamp is computed relative to this +# moment. Shifting it requires rebuilding the fixture in lock-step. 
+NOW = datetime(2026, 6, 1, 12, 0, 0, tzinfo=UTC) + +FIXTURE = Path(__file__).parent / "fixtures" / "synthetic_workspace_sweep.json" + + +@pytest.fixture(scope="module") +def response() -> dict: + return json.loads(FIXTURE.read_text(encoding="utf-8")) + + +def test_synthetic_sweep_breakdown_no_extra_bots(response: dict) -> None: + """Without `--bot-logins` overrides, the synthetic sweep should + skip 7 of 12 trackers and dispatch the rest (one urgent).""" + classifications = classify_response(response, now=NOW) + by_decision: dict[Decision, list[int]] = {} + for c in classifications: + by_decision.setdefault(c.decision, []).append(c.issue.number) + for d in by_decision.values(): + d.sort() + + assert by_decision.get(Decision.SKIP_NOOP, []) == [101, 103, 104, 105, 106, 107, 108] + assert by_decision.get(Decision.DISPATCH_URGENT, []) == [102] + assert by_decision.get(Decision.DISPATCH, []) == [100, 109, 110, 111] + + +def test_synthetic_sweep_breakdown_with_extra_bot(response: dict) -> None: + """Adding `company-private-bot` to the override moves #109 from + `dispatch` to `skip-noop` (Rule 7 fires once the personal-bot + is recognised as bot-equivalent).""" + classifications = classify_response( + response, now=NOW, extra_bot_logins=frozenset({"company-private-bot"}) + ) + decisions = {c.issue.number: c.decision for c in classifications} + assert decisions[109] == Decision.SKIP_NOOP + # Other classifications stay put. + assert decisions[100] == Decision.DISPATCH + assert decisions[102] == Decision.DISPATCH_URGENT + assert decisions[103] == Decision.SKIP_NOOP + + +def test_synthetic_sweep_each_issue_lands_in_documented_bucket(response: dict) -> None: + """Per-issue assertions matching the `_purpose` annotation each + fixture entry carries.
Keeps the fixture documentation in + lock-step with the rule behaviour.""" + classifications = classify_response(response, now=NOW) + by_number = {c.issue.number: c for c in classifications} + + # Rule 1 dispatch (recent human activity). + assert by_number[100].decision == Decision.DISPATCH + assert "recent human activity" in by_number[100].reason + + # Rule 1 yields → Rule 7 fires. + assert by_number[101].decision == Decision.SKIP_NOOP + assert "awaiting advisory" in by_number[101].reason + + # Rule 2 dispatch-urgent. + assert by_number[102].decision == Decision.DISPATCH_URGENT + assert "reporter" in by_number[102].reason + + # Rule 3 — post-announce. + assert by_number[103].decision == Decision.SKIP_NOOP + assert "post-announce" in by_number[103].reason + + # Rule 4 — stale closed. + assert by_number[104].decision == Decision.SKIP_NOOP + assert "stale closed" in by_number[104].reason + + # Rule 5 — all phases done. + assert by_number[105].decision == Decision.SKIP_NOOP + assert "all phases done" in by_number[105].reason + + # Rule 6 — awaiting release. + assert by_number[106].decision == Decision.SKIP_NOOP + assert "awaiting release" in by_number[106].reason + + # Rule 7 — awaiting advisory. + assert by_number[107].decision == Decision.SKIP_NOOP + assert "awaiting advisory" in by_number[107].reason + + # Bot login detection (Rule 7). + assert by_number[108].decision == Decision.SKIP_NOOP + assert by_number[108].last_is_skill_or_bot is True + + # Personal-account bot — without override, dispatches. + assert by_number[109].decision == Decision.DISPATCH + assert by_number[109].last_is_skill_or_bot is False + + # Fall-through dispatch. + assert by_number[110].decision == Decision.DISPATCH + + # Recently-closed non-skill comment — dispatches (Rule 1 catches recent updatedAt). 
+ assert by_number[111].decision == Decision.DISPATCH + + +def test_skip_rate_meets_target(response: dict) -> None: + """Assert the fixture sees a skip-rate ≥30%, matching the + real-world target after the v2 rule tuning. If a rule edit + pushes the rate below this, either the rule needs reviewing + or the fixture needs an extra positive case for the + relaxation.""" + classifications = classify_response(response, now=NOW) + skips = sum(1 for c in classifications if c.decision == Decision.SKIP_NOOP) + rate = skips / len(classifications) + assert rate >= 0.30, f"skip-rate {rate:.0%} below 30% target"