From b571a3adc82f824e7222aa1060a172fd0d36ded1 Mon Sep 17 00:00:00 2001 From: Justin McLean Date: Wed, 10 Jun 2026 13:30:15 +1000 Subject: [PATCH] feat(validator): add SOFT eval-coverage check (check #8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every skill under skills/ must ship a matching behavioural eval suite under tools/skill-evals/evals//. The new validate_eval_coverage function surfaces missing suites as SOFT advisory violations so that in-flight eval PRs do not fail the gate while their branches are pending review. Against the live repo the check correctly flags the two skills that currently have in-flight eval branches (pr-management-quick-merge and setup-status) and is silent on all others. 8 new test cases cover the happy path, the missing-eval path, missing-both-dirs paths, the soft-category membership, and the non-directory skip. Addresses the Known Gap in specs/meta-and-quality-tooling.md: "Eval coverage is incomplete — skills added before the per-skill-eval convention have no suite." The check prevents future regressions. Generated-by: Claude (Opus 4.7) --- .../src/skill_and_tool_validator/__init__.py | 48 ++++++++++- .../tests/test_validator.py | 83 +++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py index 1d12cbf0..4b617d71 100644 --- a/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py +++ b/tools/skill-and-tool-validator/src/skill_and_tool_validator/__init__.py @@ -17,7 +17,7 @@ """Validate framework skill definitions. -This module validates eight aspects of every skill under +This module validates nine aspects of every skill under skills/: 1. YAML frontmatter — every SKILL.md must have a valid frontmatter @@ -49,6 +49,11 @@ Apache Software Foundation license preamble. Skill ``.md`` files declare their license via the required ``license:`` frontmatter key (checked by aspect 1), so they need no separate header. +9. Eval-coverage (SOFT) — every skill directory under ``skills/`` + must have a matching behavioural eval suite under + ``tools/skill-evals/evals//``. Missing suites are + advisories so in-flight eval PRs do not block the gate while + their branches are pending review. SOFT categories surface as advisory warnings (stderr) without failing the run unless ``--strict`` is passed. @@ -74,6 +79,7 @@ SKILLS_DIR = Path("skills") TOOLS_DIR = Path("tools") DOCS_DIR = Path("docs") +SKILL_EVALS_DIR = Path("tools/skill-evals/evals") PROJECTS_TEMPLATE_DIR = Path("projects/_template") # Categories for the tool-validator block. Both HARD by default — every @@ -89,6 +95,8 @@ # with live skill frontmatter + tool README declarations. DOCS_LABELS_AND_CAPABILITIES = Path("docs/labels-and-capabilities.md") CAPABILITY_SYNC_CATEGORY = "capability-sync" +# Eval-coverage check: every skill must have a matching eval suite. +EVAL_COVERAGE_CATEGORY = "eval-coverage" _SKILL_TABLE_HEADER = "## Capability to skill map" _TOOL_TABLE_HEADER = "## Capability to tool map" # Tokens like `capability:setup`. Optional backticks around the token. @@ -262,6 +270,7 @@ def _read_mode_table() -> dict[str, str]: GH_LIST_CATEGORY, PRIVACY_CATEGORY, LOWERCASE_F_FIELD_CATEGORY, + EVAL_COVERAGE_CATEGORY, } ) HARD_CATEGORIES: frozenset[str] = frozenset( @@ -1728,6 +1737,40 @@ def collect_doc_files(root: Path | None = None) -> set[Path]: return files +# --------------------------------------------------------------------------- +# Eval-coverage check (check #9, SOFT) +# --------------------------------------------------------------------------- + + +def validate_eval_coverage(root: Path | None = None) -> Iterable[Violation]: + """Warn when a skill directory has no matching eval suite. + + Every skill under skills/ must have a behavioural eval suite under + tools/skill-evals/evals//. Missing suites surface as SOFT + advisories so in-flight eval PRs do not fail the gate while their + branches are pending review. + """ + repo_root = root or find_repo_root() + skills_base = repo_root / SKILLS_DIR + evals_base = repo_root / SKILL_EVALS_DIR + if not skills_base.exists(): + return + eval_slugs: set[str] = set() + if evals_base.exists(): + eval_slugs = {p.name for p in evals_base.iterdir() if p.is_dir()} + for skill_dir in sorted(skills_base.iterdir()): + if not skill_dir.is_dir(): + continue + slug = skill_dir.name + if slug not in eval_slugs: + yield Violation( + skill_dir / "SKILL.md", + None, + f"eval-coverage: no eval suite at tools/skill-evals/evals/{slug}/ — add one before shipping", + category=EVAL_COVERAGE_CATEGORY, + ) + + def run_validation(root: Path | None = None) -> list[Violation]: """Run the full validation suite and return all violations.""" repo_root = root or find_repo_root() @@ -1774,6 +1817,9 @@ def run_validation(root: Path | None = None) -> list[Violation]: # Capability-sync check: the doc tables and the source must agree. violations.extend(validate_capability_sync(repo_root)) + # Eval-coverage check: every skill must have a matching eval suite. + violations.extend(validate_eval_coverage(repo_root)) + return violations diff --git a/tools/skill-and-tool-validator/tests/test_validator.py b/tools/skill-and-tool-validator/tests/test_validator.py index 2e5430b3..3fda038f 100644 --- a/tools/skill-and-tool-validator/tests/test_validator.py +++ b/tools/skill-and-tool-validator/tests/test_validator.py @@ -30,6 +30,7 @@ _PRIVACY_EXTERNAL_CONTENT_MODES, ALL_CATEGORIES, ALLOWED_MODES, + EVAL_COVERAGE_CATEGORY, FORBIDDEN_PATTERNS, GH_LIST_CATEGORY, HARD_CATEGORIES, @@ -61,6 +62,7 @@ run_validation, slugify, validate_capability_sync, + validate_eval_coverage, validate_frontmatter, validate_gh_list_limit, validate_injection_guard, @@ -2399,3 +2401,84 @@ def test_italic_parens_annotation_is_stripped(self, tmp_path: Path) -> None: # The parenthetical capability:reconciliation must NOT be flagged as a doc-side declared capability; # the row's authoritative capability is just intake, which matches the live skill. assert violations == [], [v.message for v in violations] + + +# --------------------------------------------------------------------------- +# Eval-coverage check +# --------------------------------------------------------------------------- + + +class TestValidateEvalCoverage: + """Tests for validate_eval_coverage (check #9 — SOFT).""" + + def _make_skill(self, root: Path, slug: str) -> None: + skill_dir = root / "skills" / slug + skill_dir.mkdir(parents=True, exist_ok=True) + (skill_dir / "SKILL.md").write_text( + f"---\nname: magpie-{slug}\ndescription: test\ncapability: capability:triage\nlicense: Apache-2.0\n---\n" + ) + + def _make_eval(self, root: Path, slug: str) -> None: + eval_dir = root / "tools" / "skill-evals" / "evals" / slug + eval_dir.mkdir(parents=True, exist_ok=True) + (eval_dir / "README.md").write_text(f"# {slug} evals\n") + + def test_skill_with_matching_eval_passes(self, tmp_path: Path) -> None: + self._make_skill(tmp_path, "issue-triage") + self._make_eval(tmp_path, "issue-triage") + violations = list(validate_eval_coverage(tmp_path)) + assert violations == [] + + def test_skill_without_eval_yields_soft_violation(self, tmp_path: Path) -> None: + self._make_skill(tmp_path, "new-skill") + # No matching eval directory. + violations = list(validate_eval_coverage(tmp_path)) + assert len(violations) == 1 + v = violations[0] + assert v.category == EVAL_COVERAGE_CATEGORY + assert "new-skill" in v.message + assert "tools/skill-evals/evals/new-skill/" in v.message + + def test_multiple_skills_some_missing_evals(self, tmp_path: Path) -> None: + self._make_skill(tmp_path, "alpha") + self._make_skill(tmp_path, "beta") + self._make_skill(tmp_path, "gamma") + self._make_eval(tmp_path, "alpha") + # beta and gamma have no evals. + violations = list(validate_eval_coverage(tmp_path)) + assert len(violations) == 2 + slugs = {v.path.parent.name for v in violations} + assert slugs == {"beta", "gamma"} + assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations) + + def test_no_skills_dir_returns_no_violations(self, tmp_path: Path) -> None: + # skills/ does not exist at all. + violations = list(validate_eval_coverage(tmp_path)) + assert violations == [] + + def test_no_evals_dir_all_skills_flagged(self, tmp_path: Path) -> None: + self._make_skill(tmp_path, "alpha") + self._make_skill(tmp_path, "beta") + # tools/skill-evals/evals/ does not exist. + violations = list(validate_eval_coverage(tmp_path)) + assert len(violations) == 2 + assert all(v.category == EVAL_COVERAGE_CATEGORY for v in violations) + + def test_eval_coverage_is_soft_category(self) -> None: + assert EVAL_COVERAGE_CATEGORY in SOFT_CATEGORIES + assert EVAL_COVERAGE_CATEGORY not in ALL_CATEGORIES - SOFT_CATEGORIES + + def test_violation_path_points_to_skill_md(self, tmp_path: Path) -> None: + self._make_skill(tmp_path, "orphan") + violations = list(validate_eval_coverage(tmp_path)) + assert len(violations) == 1 + assert violations[0].path.name == "SKILL.md" + assert violations[0].path.parent.name == "orphan" + + def test_non_directory_entries_in_skills_are_skipped(self, tmp_path: Path) -> None: + skills_dir = tmp_path / "skills" + skills_dir.mkdir(parents=True) + # A plain file (not a directory) must not be treated as a skill. + (skills_dir / "README.md").write_text("# skills\n") + violations = list(validate_eval_coverage(tmp_path)) + assert violations == []