diff --git a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py index 53872884..7bd658c1 100644 --- a/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py +++ b/tools/vulnogram/generate-cve-json/src/generate_cve_json/cve_json.py @@ -809,6 +809,10 @@ def classify_reference(url: str) -> list[str]: ``["patch"]``. * ``lists.apache.org/...`` and ``security.apache.org/...`` → ``["vendor-advisory"]``. + * ``cve.org/CVERecord?id=...`` and ``nvd.nist.gov/vuln/detail/...`` → + ``["related"]`` (links to other CVE records on the public CVE + databases — used for incomplete-fix / sibling-CVE cross-references + per ASF Security's request). * Anything else → no tags (empty list). """ if re.search(r"github\.com/[^/]+/[^/]+/(pull|commit)/", url): @@ -822,9 +826,59 @@ def classify_reference(url: str) -> list[str]: return [] if host in ("lists.apache.org", "security.apache.org"): return ["vendor-advisory"] + if host in ("cve.org", "www.cve.org", "nvd.nist.gov"): + return ["related"] return [] +# Match a complete CVE-YYYY-NNNNN identifier with word boundaries so +# IDs fused to adjacent word characters (e.g. ``CVE-2026-12345x``) do +# not match; ``-`` is a boundary, so ``CVE-2026-12345-extra`` still matches. +_CVE_ID_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b") + + +def extract_related_cve_ids(text: str, current_cve_id: str | None = None) -> list[str]: + """Extract distinct CVE identifiers cited in ``text``, in order of + first appearance. + + ``current_cve_id`` is excluded from the result so the generator + never emits a self-reference (its case is ignored; ``text`` is matched case-sensitively). + + Typical inputs: + + * The tracker's *Short public summary for publish* body field — + where Gate #3 (incomplete-fix cross-CVE clause) places a prior + CVE identifier the current CVE is a follow-up to. + * The tracker's *Security mailing list thread* field — when the + report references a prior CVE in its body for context. 
+ + Output is a list (not a set) to preserve first-appearance order + so the emitted references list is deterministic across runs. + """ + seen: set[str] = set() + ordered: list[str] = [] + current_upper = (current_cve_id or "").upper() + for match in _CVE_ID_RE.finditer(text): + cve_id = match.group(0).upper() + if cve_id == current_upper: + continue + if cve_id in seen: + continue + seen.add(cve_id) + ordered.append(cve_id) + return ordered + + +def related_cve_url(cve_id: str) -> str: + """Return the public ``cve.org`` record URL for a CVE identifier. + + Format matches ASF Security's preference per Arnout Engelen's + 2026-05-29 review comment on CVE-2026-49298 (``CVERecord?id=`` form); + the returned URL is ``https://www.cve.org/CVERecord?id=<cve_id>``. + """ + return f"https://www.cve.org/CVERecord?id={cve_id}" + + def build_references( mailing_list_field: str, pr_field: str, @@ -1111,7 +1165,18 @@ def build_cna_container( remediation_developers: list[str], advisory_urls: list[str] | None = None, product_overrides: dict[str, str] | None = None, + current_cve_id: str | None = None, ) -> dict: + # Sibling-CVE cross-references — extract every distinct CVE-YYYY-NNNNN + # mentioned in the description (the short public summary) and emit a + # ``cve.org/CVERecord?id=`` reference for each, tagged ``related`` + # by :func:`classify_reference`. This satisfies ASF Security's request + # (Arnout Engelen, 2026-05-29 review on CVE-2026-49298) that incomplete- + # fix follow-ups carry a structured ``references[]`` link back to the + # prior CVE. The current record's own CVE ID is excluded so the + # generator never emits a self-reference. 
+ related_cve_urls = [related_cve_url(cid) for cid in extract_related_cve_ids(description, current_cve_id)] + extra_urls = list(advisory_urls or []) + related_cve_urls cna: dict = { "affected": build_affected( affected_versions_value, @@ -1127,7 +1192,7 @@ def build_cna_container( "metrics": build_metrics(severity_value), "problemTypes": build_problem_types(cwe_value), "providerMetadata": {"orgId": org_id}, - "references": build_references(mailing_list_value, pr_value, extra_urls=advisory_urls), + "references": build_references(mailing_list_value, pr_value, extra_urls=extra_urls), "source": {"discovery": discovery}, "title": title, "x_generator": {"engine": GENERATOR_TAG}, @@ -2094,6 +2159,7 @@ def main(argv: list[str] | None = None) -> int: remediation_developers=combined_remediation_developers, advisory_urls=combined_advisory_urls, product_overrides=product_overrides, + current_cve_id=cve_id, ) except ValueError as exc: # parse_affected_versions (and any future fail-loud parser diff --git a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py index 6a07d36e..b9d4b6d9 100644 --- a/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py +++ b/tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py @@ -801,6 +801,93 @@ def test_subdomain_is_not_treated_as_apache(self): def test_malformed_url_returns_no_tags(self): assert classify_reference("not a url") == [] + def test_cve_org_record_tagged_as_related(self): + # ASF Security's preferred URL form for sibling/incomplete-fix CVE + # cross-references (Arnout Engelen, 2026-05-29 review on + # CVE-2026-49298). 
+ assert classify_reference("https://www.cve.org/CVERecord?id=CVE-2026-27173") == ["related"] + assert classify_reference("https://cve.org/CVERecord?id=CVE-2025-68438") == ["related"] + + def test_nvd_record_tagged_as_related(self): + # NVD is the same CVE database under a different URL form; treat + # as related too. + assert classify_reference("https://nvd.nist.gov/vuln/detail/CVE-2026-27173") == ["related"] + + +class TestExtractRelatedCveIds: + def test_extracts_single_prior_cve(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + summary = ( + "This is a variant of CWE-200 previously addressed in CVE-2025-68438; " + "that fix did not cover the nested sensitive-keyword allowlist." + ) + assert extract_related_cve_ids(summary) == ["CVE-2025-68438"] + + def test_extracts_multiple_distinct_in_order(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + summary = "Fix-bypass of CVE-2026-33858. Also related to CVE-2025-50213 and CVE-2025-27018." + assert extract_related_cve_ids(summary) == [ + "CVE-2026-33858", + "CVE-2025-50213", + "CVE-2025-27018", + ] + + def test_excludes_current_cve_id(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + summary = "CVE-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858." + assert extract_related_cve_ids(summary, current_cve_id="CVE-2026-42359") == [ + "CVE-2026-33858", + ] + + def test_current_cve_id_match_is_case_insensitive(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + summary = "cve-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858." + assert extract_related_cve_ids(summary, current_cve_id="CVE-2026-42359") == [ + "CVE-2026-33858", + ] + + def test_deduplicates_repeated_mentions(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + summary = "CVE-2025-68438 was incomplete; this CVE follows CVE-2025-68438." 
+ assert extract_related_cve_ids(summary) == ["CVE-2025-68438"] + + def test_substring_in_larger_token_does_not_match(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + # Word-boundary regex must not match identifiers embedded in + # larger tokens (defensive against accidental hits). + assert extract_related_cve_ids("seeCVE-2026-33858trailing") == [] + assert extract_related_cve_ids("CVE-2026-33858x") == [] + + def test_short_form_required_at_least_four_digits(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + # CVE-YYYY-NNNN minimum: CVE IDs carry at least four sequence digits (the regex accepts 4-7). + assert extract_related_cve_ids("CVE-2026-123") == [] + assert extract_related_cve_ids("CVE-2026-1234") == ["CVE-2026-1234"] + + def test_empty_string_returns_empty_list(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + assert extract_related_cve_ids("") == [] + + def test_no_cve_id_in_text_returns_empty_list(self): + from generate_cve_json.cve_json import extract_related_cve_ids + + assert extract_related_cve_ids("no CVE here, just narrative.") == [] + + +class TestRelatedCveUrl: + def test_url_format_matches_cve_org(self): + from generate_cve_json.cve_json import related_cve_url + + assert related_cve_url("CVE-2026-27173") == "https://www.cve.org/CVERecord?id=CVE-2026-27173" + class TestBuildReferences: def test_mailing_list_field_urls_are_not_auto_included(self):