Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,10 @@ def classify_reference(url: str) -> list[str]:
``["patch"]``.
* ``lists.apache.org/...`` and ``security.apache.org/...`` →
``["vendor-advisory"]``.
* ``cve.org/CVERecord?id=...`` and ``nvd.nist.gov/vuln/detail/...`` →
``["related"]`` (links to other CVE records on the public CVE
databases — used for incomplete-fix / sibling-CVE cross-references
per ASF Security's request).
* Anything else → no tags (empty list).
"""
if re.search(r"github\.com/[^/]+/[^/]+/(pull|commit)/", url):
Expand All @@ -822,9 +826,59 @@ def classify_reference(url: str) -> list[str]:
return []
if host in ("lists.apache.org", "security.apache.org"):
return ["vendor-advisory"]
if host in ("cve.org", "www.cve.org", "nvd.nist.gov"):
return ["related"]
return []


# Match a complete CVE-YYYY-NNNNN identifier with word boundaries so
# substrings inside larger tokens (e.g. ``CVE-2026-12345-extra``) do
# not match.
_CVE_ID_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b")


def extract_related_cve_ids(text: str, current_cve_id: str | None = None) -> list[str]:
"""Extract distinct CVE identifiers cited in ``text``, in order of
first appearance.

``current_cve_id`` is excluded from the result so the generator
never emits a self-reference. The check is case-insensitive.

Typical inputs:

* The tracker's *Short public summary for publish* body field —
where Gate #3 (incomplete-fix cross-CVE clause) places a prior
CVE identifier the current CVE is a follow-up to.
* The tracker's *Security mailing list thread* field — when the
report references a prior CVE in its body for context.

Output is a list (not a set) to preserve first-appearance order
so the emitted references list is deterministic across runs.
"""
seen: set[str] = set()
ordered: list[str] = []
current_upper = (current_cve_id or "").upper()
for match in _CVE_ID_RE.finditer(text):
cve_id = match.group(0).upper()
if cve_id == current_upper:
continue
if cve_id in seen:
continue
seen.add(cve_id)
ordered.append(cve_id)
return ordered


def related_cve_url(cve_id: str) -> str:
    """Build the public CVE record URL for ``cve_id``.

    The result has the form
    ``https://www.cve.org/CVERecord?id=<CVE-ID>`` — the ``cve.org``
    record link requested by ASF Security (Arnout Engelen's 2026-05-29
    review comment on CVE-2026-49298), emitted on the ``www.`` host.

    ``cve_id`` is interpolated exactly as given; no validation or case
    normalisation happens here.
    """
    return "https://www.cve.org/CVERecord?id={}".format(cve_id)


def build_references(
mailing_list_field: str,
pr_field: str,
Expand Down Expand Up @@ -1111,7 +1165,18 @@ def build_cna_container(
remediation_developers: list[str],
advisory_urls: list[str] | None = None,
product_overrides: dict[str, str] | None = None,
current_cve_id: str | None = None,
) -> dict:
# Sibling-CVE cross-references — extract every distinct CVE-YYYY-NNNNN
# mentioned in the description (the short public summary) and emit a
# ``cve.org/CVERecord?id=<id>`` reference for each, tagged ``related``
# by :func:`classify_reference`. This satisfies ASF Security's request
# (Arnout Engelen, 2026-05-29 review on CVE-2026-49298) that incomplete-
# fix follow-ups carry a structured ``references[]`` link back to the
# prior CVE. The current record's own CVE ID is excluded so the
# generator never emits a self-reference.
related_cve_urls = [related_cve_url(cid) for cid in extract_related_cve_ids(description, current_cve_id)]
extra_urls = list(advisory_urls or []) + related_cve_urls
cna: dict = {
"affected": build_affected(
affected_versions_value,
Expand All @@ -1127,7 +1192,7 @@ def build_cna_container(
"metrics": build_metrics(severity_value),
"problemTypes": build_problem_types(cwe_value),
"providerMetadata": {"orgId": org_id},
"references": build_references(mailing_list_value, pr_value, extra_urls=advisory_urls),
"references": build_references(mailing_list_value, pr_value, extra_urls=extra_urls),
"source": {"discovery": discovery},
"title": title,
"x_generator": {"engine": GENERATOR_TAG},
Expand Down Expand Up @@ -2094,6 +2159,7 @@ def main(argv: list[str] | None = None) -> int:
remediation_developers=combined_remediation_developers,
advisory_urls=combined_advisory_urls,
product_overrides=product_overrides,
current_cve_id=cve_id,
)
except ValueError as exc:
# parse_affected_versions (and any future fail-loud parser
Expand Down
87 changes: 87 additions & 0 deletions tools/vulnogram/generate-cve-json/tests/test_generate_cve_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,93 @@ def test_subdomain_is_not_treated_as_apache(self):
def test_malformed_url_returns_no_tags(self):
    # Input that is not a URL at all is expected to produce an empty
    # tag list.
    tags = classify_reference("not a url")
    assert tags == []

def test_cve_org_record_tagged_as_related(self):
    # ``cve.org`` record links are the URL form ASF Security prefers
    # for sibling / incomplete-fix CVE cross-references (Arnout
    # Engelen, 2026-05-29 review on CVE-2026-49298). Both the ``www.``
    # host and the bare host must be tagged ``related``.
    record_urls = (
        "https://www.cve.org/CVERecord?id=CVE-2026-27173",
        "https://cve.org/CVERecord?id=CVE-2025-68438",
    )
    for url in record_urls:
        assert classify_reference(url) == ["related"]

def test_nvd_record_tagged_as_related(self):
    # NVD detail pages also point at a CVE record, so they get the
    # same ``related`` tag as ``cve.org`` links.
    nvd_url = "https://nvd.nist.gov/vuln/detail/CVE-2026-27173"
    tags = classify_reference(nvd_url)
    assert tags == ["related"]


class TestExtractRelatedCveIds:
    """Behavioural tests for ``extract_related_cve_ids``."""

    def test_extracts_single_prior_cve(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        # ``CWE-200`` in the same sentence must not be mistaken for a CVE.
        summary = (
            "This is a variant of CWE-200 previously addressed in CVE-2025-68438; "
            "that fix did not cover the nested sensitive-keyword allowlist."
        )
        assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]

    def test_extracts_multiple_distinct_in_order(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        # Order of first appearance is preserved, not sorted.
        summary = "Fix-bypass of CVE-2026-33858. Also related to CVE-2025-50213 and CVE-2025-27018."
        assert extract_related_cve_ids(summary) == [
            "CVE-2026-33858",
            "CVE-2025-50213",
            "CVE-2025-27018",
        ]

    def test_excludes_current_cve_id(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        summary = "CVE-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
        assert extract_related_cve_ids(summary, current_cve_id="CVE-2026-42359") == [
            "CVE-2026-33858",
        ]

    def test_current_cve_id_match_is_case_insensitive(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        # A differently-cased ``current_cve_id`` argument must still
        # exclude the upper-case self-mention found in the text.
        summary = "CVE-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
        assert extract_related_cve_ids(summary, current_cve_id="cve-2026-42359") == [
            "CVE-2026-33858",
        ]

        # A lower-case self-mention in the text must never surface as a
        # related reference either.
        lowercase_summary = "cve-2026-42359 fixes a PATCH-path bypass of CVE-2026-33858."
        assert extract_related_cve_ids(lowercase_summary, current_cve_id="CVE-2026-42359") == [
            "CVE-2026-33858",
        ]

    def test_deduplicates_repeated_mentions(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        summary = "CVE-2025-68438 was incomplete; this CVE follows CVE-2025-68438."
        assert extract_related_cve_ids(summary) == ["CVE-2025-68438"]

    def test_substring_in_larger_token_does_not_match(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        # Identifiers glued to adjacent letters or digits are not
        # standalone tokens and must be ignored (defensive against
        # accidental hits).
        assert extract_related_cve_ids("seeCVE-2026-33858trailing") == []
        assert extract_related_cve_ids("CVE-2026-33858x") == []

    def test_short_form_required_at_least_four_digits(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        # The sequence number after the year needs at least four digits.
        assert extract_related_cve_ids("CVE-2026-123") == []
        assert extract_related_cve_ids("CVE-2026-1234") == ["CVE-2026-1234"]

    def test_empty_string_returns_empty_list(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        assert extract_related_cve_ids("") == []

    def test_no_cve_id_in_text_returns_empty_list(self):
        from generate_cve_json.cve_json import extract_related_cve_ids

        assert extract_related_cve_ids("no CVE here, just narrative.") == []


class TestRelatedCveUrl:
    """Pins the URL shape produced by ``related_cve_url``."""

    def test_url_format_matches_cve_org(self):
        from generate_cve_json.cve_json import related_cve_url

        # The generated link is the ``www.cve.org`` record page with the
        # identifier passed through in the ``id`` query parameter.
        expected = "https://www.cve.org/CVERecord?id=CVE-2026-27173"
        assert related_cve_url("CVE-2026-27173") == expected


class TestBuildReferences:
def test_mailing_list_field_urls_are_not_auto_included(self):
Expand Down
Loading