diff --git a/.github/workflows/link-check.yml b/.github/workflows/link-check.yml new file mode 100644 index 00000000..639ff568 --- /dev/null +++ b/.github/workflows/link-check.yml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +--- +name: link-check + +# Lychee runs in informational mode for now: it surfaces broken +# internal links and broken external URLs, but does not block PR +# merges. The existing tree has a known set of pre-existing broken +# references (to files like `config/active-project.md` that this +# repository plans to add later) which would otherwise fail every +# PR. Once the baseline is zero, set `fail: true` and drop `continue-on-error`. + +on: # yamllint disable-line rule:truthy + pull_request: + push: + branches: [main] + schedule: + # Daily run catches link rot in external URLs even when no PR + # touches them. + - cron: "0 8 * * *" + +permissions: {} + +jobs: + lychee: + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + # Restore the lychee result cache so external URL checks reuse + # results across runs (config sets `max_cache_age = "7d"`). 
+ - name: Restore lychee cache + uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2 + with: + path: .lycheecache + key: cache-lychee-${{ github.sha }} + restore-keys: cache-lychee- + + - name: Run lychee + id: lychee + # Pinned SHA must come from the ASF infrastructure-actions + # allowlist (https://github.com/apache/infrastructure-actions/blob/main/approved_patterns.yml), + # which the `asf-allowlist-check` workflow enforces on every + # PR. The previous v2.6.1 pin was not on the allowlist; v2.8.0 + # (2026-02-17) is. When bumping, pick the next allowlisted SHA + # — do not pick the latest upstream release blindly. + uses: lycheeverse/lychee-action@8646ba30535128ac92d33dfc9133794bfdd9b411 # v2.8.0 + with: + args: --config .lychee.toml --no-progress . + fail: false + token: ${{ secrets.GITHUB_TOKEN }} + continue-on-error: true + + - name: Summarise + if: always() + run: | + if [ -f lychee/out.md ]; then + echo "## Lychee link-check report" >> "$GITHUB_STEP_SUMMARY" + cat lychee/out.md >> "$GITHUB_STEP_SUMMARY" + else + echo "Lychee did not produce a report file." >> "$GITHUB_STEP_SUMMARY" + fi diff --git a/.gitignore b/.gitignore index d826fb1f..4ebb1283 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,11 @@ logs .build .claude/worktrees/ +# Lychee link-checker result cache. The workflow restores this from +# Actions cache between runs; checking it in would commit machine- +# specific resolution data. +.lycheecache + # This framework repository carries no project-specific configuration. # Adopting projects keep their per-user files (e.g. user.md) in their # own tracker repository under `/` and ensure those diff --git a/.lychee.toml b/.lychee.toml new file mode 100644 index 00000000..a93b3a4e --- /dev/null +++ b/.lychee.toml @@ -0,0 +1,96 @@ +# Lychee link checker config for apache/airflow-steward. 
+# +# Validates every link in markdown / rst / .md.j2 files: +# * cross-file file existence — `[text](other.md)` +# * cross-file fragments — `[text](other.md#anchor)` +# * external URLs — HTTP 2xx +# +# Run locally: +# lychee --config .lychee.toml . +# +# Run in CI: see `.github/workflows/link-check.yml`. + +# Check anchor fragments, not just file paths. +# `anchor-only` enables `#section` checks (which is what GitHub-style +# slugs produce); `text-only` would also enable `#:~:text=` fragments +# but those are not used in this repo. +include_fragments = "anchor-only" + +# Concurrency cap — kept moderate to avoid being rate-limited by GitHub. +max_concurrency = 14 + +# Per-request timeout. ASF infra and GitHub raw-content endpoints +# occasionally take 10+ seconds during peak. +timeout = 30 + +# Retry transient errors a few times before failing the run. +retry_wait_time = 2 +max_retries = 3 + +# Cache successful results for 7 days. Mirrors the framework-wide +# 7-day `[tool.uv] exclude-newer` cooldown convention. CI restores +# this cache between runs to keep the link check fast. +cache = true +max_cache_age = "7d" + +# Accept success and redirect codes plus 401/403; other 4xx/5xx fail. +accept = [200, 206, 301, 302, 304, 308, 401, 403] + +# 401/403 accepted because some authenticated endpoints (private ASF +# trackers, GitHub API rate-limited responses) return them deterministically +# on unauthenticated CI but are still valid URLs. + +# Skip-list — endpoints that are private to ASF infra, rate-limited +# more aggressively than CI tolerates, or known to require headers +# the checker cannot supply. +exclude = [ + # ASF mailing-list archives — rate-limit on burst, not stable in CI. + '^https://lists\.apache\.org/.*', + + # ASF Vulnogram CVE tool — auth-gated, returns 200 only when logged in. 
+ '^https://cveprocess\.apache\.org/.*', + + # cve.org record pages 404 until the record is published; many of the + # links in skill examples reference future / hypothetical CVE IDs. + '^https://cve\.org/CVERecord\?id=CVE-.*', + '^https://www\.cve\.org/CVERecord\?id=CVE-.*', + '^https://cveawg\.mitre\.org/api/cve/CVE-.*', + + # cve.mitre.org legacy (often slow + unreliable). + '^https://cve\.mitre\.org/.*', + + # Ponymail thread IDs in canned-response examples — they reference + # synthetic or future thread URLs that do not exist yet. + '^https://lists\.apache\.org/thread/.*', + + # Placeholder paths used by the framework convention. ``, + # ``, `` etc. are literal placeholder tokens that + # are substituted at runtime by the adopting project; URL-encoded + # they appear as `%3C...%3E`. Lychee tries to resolve them as real + # file paths and fails. See AGENTS.md#placeholder-convention-used-in-skill-files. + '%3C[A-Za-z0-9_-]+%3E', + + # ALL_CAPS substitution tokens (e.g., FRAMEWORK_README_URL, + # SOURCE_TAB_URL, JSON_ANCHOR_URL) used in the release-manager + # comment templates. Each is replaced at runtime by the rendering + # skill with the actual URL. + '/[A-Z][A-Z0-9_]+_URL(?:#|$)', + '/[A-Z][A-Z0-9_]+_URL/', + '/[A-Z][A-Z0-9_]+_URL$', +] + +# File patterns to skip entirely — the agent-isolation pinned-versions +# manifest has bare URLs that are intentionally pinned to specific tags +# the checker should not chase. +exclude_path = [ + "tools/agent-isolation/pinned-versions.toml", + "uv.lock", + "tools/*/uv.lock", + ".git", +] + +# Suppress the interactive progress bar; it only adds noise to CI logs. +no_progress = true + +# Report-style output — one line per failing link makes CI logs grep-friendly. +format = "compact" diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 00000000..a336c7d1 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,34 @@ +{ + "$comment": "Markdownlint config for apache/airflow-steward. 
The rule set is deliberately minimal — only enables checks that catch real bugs (broken anchors, malformed code spans, malformed link references). Style choices that the existing docs already settled (compact tables, hyphen list markers, ordered-list numbering, fenced-language tagging) are left alone so the consolidation does not balloon the diff.", + + "default": true, + + "MD004": false, + "MD007": false, + "MD013": false, + "MD018": false, + "MD022": false, + "MD024": { "siblings_only": true }, + "MD028": false, + "MD029": false, + "MD031": false, + "MD032": false, + "MD033": false, + "MD034": false, + "MD036": false, + "MD040": false, + "MD041": false, + "MD046": false, + "MD050": false, + "MD056": false, + "MD059": false, + "MD060": false, + + "MD038": false, + "MD051": true, + "MD053": true, + + "$comment-MD038": "MD038 (no space inside code spans) is disabled because the steward docs intentionally use literal markdown-syntax samples like `# ` (H1), `### ` (H3), `- ` (list marker) inside backticks to illustrate markdown rendering. The rule has no per-context allowlist and would force escape-everywhere across many files. Re-enable in a follow-up that escapes those samples consistently.", + + "$comment-rationale": "MD051 (link-fragments) catches broken cross-references — exactly the bug class that surfaced 5 real broken anchors on the existing tree. MD053 catches dangling link-reference definitions. Everything else is style and is intentionally off so this PR stays diff-minimal." +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ebe72ea7..c7737f42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,13 @@ default_stages: [commit, push] default_language_version: python: python3 - node: 18.6.0 + # Node 22 is the current active LTS line. markdownlint-cli2 ≥ v0.18 + # (and its string-width dependency from v6 onward) uses the regex + # `/v` flag, which requires Node ≥ 20 to parse. 
The previous Node + # 18.6.0 pin matched the framework's pre-PR-#18 tooling baseline + # but is incompatible with the markdownlint-cli2 v0.22.1 rev + # pinned below. + node: 22.11.0 minimum_prek_version: '0.3.5' repos: - repo: meta @@ -53,6 +59,38 @@ repos: name: Detect if mixed line ending is used (\r vs. \r\n) - id: trailing-whitespace name: Remove trailing whitespace at end of line + # markdownlint — catches structurally-bad markdown. Config in + # `.markdownlint.json` enables only the rules that catch real bugs + # (broken anchors via MD051, dangling link references via MD053); + # style choices the existing docs already settled stay off. + - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.22.1 + hooks: + - id: markdownlint-cli2 + name: markdownlint + files: \.md$ + # typos — fast spell-checker. Allowlist is `.typos.toml`. + # Override the default args (which include `--write-changes`) so + # the hook never silently rewrites files; CI surfaces typos as + # errors instead. + - repo: https://github.com/crate-ci/typos + rev: v1.45.2 + hooks: + - id: typos + name: typos + args: [--force-exclude] + # Local placeholder linter — catches hardcoded references like + # `apache/airflow` or `Apache Airflow` that should be the + # placeholder tokens `` / `` per + # `AGENTS.md#placeholder-convention-used-in-skill-files`. + - repo: local + hooks: + - id: check-placeholders + name: check-placeholders + language: system + entry: tools/dev/check-placeholders.sh + files: ^(\.claude/skills/.*|tools/.*)\.(md|sh|py|yaml|yml|toml)$ + pass_filenames: false # Project-local checks for the `generate-cve-json` Python project at # `tools/vulnogram/generate-cve-json/`. Each hook sets the working # directory via `uv run --directory` so ruff / mypy / pytest pick up diff --git a/.typos.toml b/.typos.toml new file mode 100644 index 00000000..bc6c25a1 --- /dev/null +++ b/.typos.toml @@ -0,0 +1,51 @@ +# Typos spell-checker config for apache/airflow-steward. 
+# +# Run locally: +# typos +# +# Auto-fix what's safe: +# typos --write-changes +# +# Run in CI: see `.github/workflows/doc-validation.yml`. + +[default] + +[default.extend-words] +# Domain terms typos' default dictionary flags as misspellings. +# `CNA` = CVE Numbering Authority — used hundreds of times in the +# security documentation; not a typo of "CAN". +CNA = "CNA" +cna = "cna" +Cna = "Cna" +# `Nd` is the placeholder for "N days" in date-filter syntax +# (`newer_than:`, `lte=Nd`, etc); not a typo of "And". +Nd = "Nd" +# `mis-` prefix in `mis-allocation` is a legitimate compound, not a +# typo of "miss" / "mist". +mis = "mis" +# `pre-empted` is a real word; typos flags `empted` as a typo of +# `emptied` only because of how it splits hyphenated words. +empted = "empted" + +[default.extend-identifiers] +# Identifiers that look like typos but are real symbol names. +ASF = "ASF" +PMC = "PMC" +CVE = "CVE" +CWE = "CWE" +Vulnogram = "Vulnogram" +ponymail = "ponymail" +ponymail_mcp = "ponymail_mcp" +gh = "gh" +mcp = "mcp" +MCP = "MCP" + +[files] +# Skip auto-generated lockfiles + the cargo-style pinned-versions +# manifest (URLs / hashes that are not natural-language text). +extend-exclude = [ + "uv.lock", + "tools/*/uv.lock", + "tools/agent-isolation/pinned-versions.toml", + "*.svg", +] diff --git a/AGENTS.md b/AGENTS.md index 020b55b8..34838a02 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -772,7 +772,7 @@ them is a **request or a fact**, not a briefing: validity on X and allocated the CVE on Y…"). - Security-model paraphrasing — link to the chapter, do not re-explain (per - [Point reporters to the Security Model, don't re-explain it](#point-reporters-to-the-security-model-dont-re-explain-it)). + [Point reporters to the project's Security Model, don't re-explain it](#point-reporters-to-the-projects-security-model-dont-re-explain-it)). - Inflated closings ("We greatly appreciate your continued patience…"). A plain *"Thanks,"* / *"Regards,"* is enough. 
- Any open question that was already asked on the thread and is diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f01a6c40..d0931afc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -233,8 +233,30 @@ open a PR — CI runs the same config. The hook set: `config/user.md`; - `bootstrap-user-config` — creates `config/user.md` from the template on first run; +- `markdownlint-cli2` — flags structurally bad markdown. Config in + `.markdownlint.json` enables only the rules that catch real bugs + (broken anchors via MD051, dangling link references via MD053); + style choices are intentionally left alone; +- `typos` — fast spell-checker. Allowlist of project-specific terms + (`CNA`, `Vulnogram`, `ponymail`, `mis-`, `Nd`, etc.) lives in + `.typos.toml`; +- `check-placeholders` — local script at `tools/dev/check-placeholders.sh` + that refuses to commit hardcoded references like `apache/airflow` + or `Apache Airflow` inside `.claude/skills/` or `tools/`. The + framework convention is the `` / `` / `` + placeholder set; see + [`AGENTS.md` — Placeholder convention](AGENTS.md#placeholder-convention-used-in-skill-files); - `ruff check` / `ruff format --check` / `mypy` / `pytest` against the - `tools/vulnogram/generate-cve-json/` Python package. + `tools/vulnogram/generate-cve-json/` and `tools/gmail/oauth-draft/` + Python packages. + +A separate GitHub workflow `link-check.yml` runs +[lychee](https://lychee.cli.rs/) on every PR and on a daily schedule +to catch broken internal links and dead external URLs. The check is +**informational only** today (`continue-on-error: true`) because the +existing tree carries a known set of placeholder / not-yet-created +file references; once the baseline reaches zero the workflow will +flip to a hard gate. 
For the Python package directly: diff --git a/projects/_template/scope-labels.md b/projects/_template/scope-labels.md index 48010064..a681cd75 100644 --- a/projects/_template/scope-labels.md +++ b/projects/_template/scope-labels.md @@ -3,7 +3,7 @@ **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* - [TODO: `` — scope labels](#todo-project-name--scope-labels) - - [Scope → CVE product / package-name table](#scope-%E2%86%92-cve-product--package-name-table) + - [Scope to CVE product / package-name table](#scope-to-cve-product--package-name-table) - [Default `packageName` and vendor](#default-packagename-and-vendor) - [Closing dispositions (not scope labels)](#closing-dispositions-not-scope-labels) @@ -30,7 +30,7 @@ affects more than one scope, the `sync-security-issue` skill surfaces this as a blocker and the triager splits the report into per-scope trackers. -## Scope → CVE product / package-name table +## Scope to CVE product / package-name table | Tracker scope label | CVE product | CVE container `packageName` | Collection URL | |---|---|---|---| diff --git a/tools/dev/check-placeholders.sh b/tools/dev/check-placeholders.sh new file mode 100755 index 00000000..2cea5a0f --- /dev/null +++ b/tools/dev/check-placeholders.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# check-placeholders.sh +# +# Verifies that framework-level skill / tool docs refer to +# adopting-project specifics through placeholders only: +# +# — adopting project's display name +# — adopting project's private tracker repo slug +# — adopting project's public source repo slug +# +# Hardcoded references to "apache/airflow", "airflow-s/airflow-s", +# "Apache Airflow", or related concrete project names slip in +# whenever someone copy-pastes from the legacy Airflow content. +# This linter catches them before merge. +# +# Intentional callouts that **explain** the placeholder convention +# (example blocks, "for Airflow, see..." pointers, title-prefix +# rendering examples) are allowlisted via inline markers +# (`example:`, `e.g.`, `for Airflow`, lines inside `` +# HTML comment blocks). Regular prose that names Airflow without +# such a marker is the surface this linter is built to catch. +# +# Run from repo root: +# tools/dev/check-placeholders.sh +# +# Pre-commit invocation: see `.pre-commit-config.yaml`. + +set -euo pipefail + +# Patterns that should never appear outside the allowlist below. +# Each pattern must be a fixed string (grep -F). 
+FORBIDDEN_PATTERNS=( + "apache/airflow" + "airflow-s/airflow-s" + "Apache Airflow" + "apache.org/airflow" +) + +# Files / directories where Airflow references are intentional: +# the framework's own onboarding / contributor docs use Airflow as +# the canonical example adopter; the bootstrap scaffold under +# `projects/_template/` may reference Airflow in pointers; the root +# pyproject.toml documents the legacy `apache/airflow-steward` slug +# as part of its rename note; this linter file itself contains the +# patterns to look for. +ALLOWLIST_PATHS=( + "README.md" + "AGENTS.md" + "CONTRIBUTING.md" + "secure-agent-setup.md" + "how-to-fix-a-security-issue.md" + "new-members-onboarding.md" + "pyproject.toml" + "projects/_template/" + "tools/dev/check-placeholders.sh" + ".github/" + ".asf.yaml" + "NOTICE" + "LICENSE" +) + +# Inline markers that signal an intentional explanatory mention +# of Airflow on the same line. Lines matching any of these are +# treated as allowlisted. +INLINE_ALLOW_MARKERS=( + "example:" + "e.g." + "e\.g\." + "for Airflow" + "the Airflow" + "legacy" + "renamed" + "future-renamed" + "originally" + "vendor>: " + # `apache/airflow-steward` is the framework's own legacy slug — a + # match for the `apache/airflow` substring will fire on it, but + # those mentions are intentional self-references, not hardcoded + # adopter references. + "apache/airflow-steward" +) + +# Where to look. Only `.md` files under skills + tool adapter docs +# are scoped; Python sources under `tools/*/src/` and `tools/*/tests/` +# may legitimately mention Airflow in fixtures and docstrings. 
+SCAN_PATHS=( + ".claude/skills" + "tools" +) + +is_path_allowlisted() { + local file="$1" + for allow in "${ALLOWLIST_PATHS[@]}"; do + if [[ "$file" == "$allow"* ]]; then + return 0 + fi + done + return 1 +} + +line_has_inline_allow_marker() { + local line="$1" + for marker in "${INLINE_ALLOW_MARKERS[@]}"; do + if [[ "$line" == *"$marker"* ]]; then + return 0 + fi + done + return 1 +} + +main() { + local repo_root + repo_root="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" + cd "$repo_root" + + local exit_code=0 + local matches_total=0 + + echo "check-placeholders: scanning ${SCAN_PATHS[*]} for hardcoded project references..." + + for pattern in "${FORBIDDEN_PATTERNS[@]}"; do + local matches + matches=$(grep -rFn \ + --include='*.md' \ + "$pattern" \ + "${SCAN_PATHS[@]}" 2>/dev/null || true) + + if [[ -z "$matches" ]]; then + continue + fi + + while IFS= read -r match_line; do + local file="${match_line%%:*}" + local rest="${match_line#*:}" + local _line_no="${rest%%:*}" + local content="${rest#*:}" + + if is_path_allowlisted "$file"; then + continue + fi + if line_has_inline_allow_marker "$content"; then + continue + fi + + if [[ $matches_total -eq 0 ]]; then + { + echo "" + echo "FORBIDDEN: hardcoded project references found." + echo "" + echo "Skill / tool docs must use the placeholders ," + echo ", and instead of the concrete strings" + echo "below. See AGENTS.md#placeholder-convention-used-in-skill-files." + echo "" + echo "Lines that explain the placeholder convention with an" + echo "intentional example are allowlisted by including one of:" + echo " example:, e.g., for Airflow, the Airflow, legacy," + echo " renamed, future-renamed, originally, vendor>: " + echo "" + } >&2 + fi + echo " $match_line" >&2 + matches_total=$((matches_total + 1)) + exit_code=1 + done <<< "$matches" + done + + if [[ $exit_code -eq 0 ]]; then + echo "check-placeholders: OK (no hardcoded references in skills / tool docs)." 
+ else + echo "" >&2 + echo "check-placeholders: $matches_total violation(s) — fix before commit." >&2 + fi + + return $exit_code +} + +main "$@" diff --git a/tools/github/issue-template.md b/tools/github/issue-template.md index a794f0dd..b4b0dce5 100644 --- a/tools/github/issue-template.md +++ b/tools/github/issue-template.md @@ -7,7 +7,7 @@ - [Field roles the skills use](#field-roles-the-skills-use) - [Body-field surgery](#body-field-surgery) - [Empty-field convention](#empty-field-convention) - - [Issue-template → CVE 5.x mapping](#issue-template-%E2%86%92-cve-5x-mapping) + - [Issue-template to CVE 5.x mapping](#issue-template-to-cve-5x-mapping) @@ -98,7 +98,7 @@ into every unfilled body field. The skills honour this convention: - The CVE JSON generator treats `_No response_` as absence and simply omits the corresponding CVE-record element. -## Issue-template → CVE 5.x mapping +## Issue-template to CVE 5.x mapping The `generate-cve-json` tool maps body-field roles to CVE 5.x record elements as follows (generic — applies to any project using this diff --git a/tools/ponymail/operations.md b/tools/ponymail/operations.md index 41002a3d..4f95cab0 100644 --- a/tools/ponymail/operations.md +++ b/tools/ponymail/operations.md @@ -175,8 +175,8 @@ specific messages it cares about. Returned records are **summaries** (mid, subject, from, date, tid) — not full bodies. Fetch the body via -[`get_email`](#get-email) or the whole thread via -[`get_thread`](#get-thread) when needed. +[`get_email`](#get-an-email) or the whole thread via +[`get_thread`](#get-a-thread) when needed. ### Get a thread