Skip to content

Commit 37de320

Browse files
ROB: Deal with invalid annotations in extract_links (#3659)
Closes #3656.
1 parent 05e6d3c commit 37de320

3 files changed

Lines changed: 83 additions & 4 deletions

File tree

pypdf/generic/_link.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030

3131
from typing import TYPE_CHECKING, Optional, Union, cast
3232

33-
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
33+
from .._utils import logger_warning
34+
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject, is_null_or_none
3435

3536
if TYPE_CHECKING:
3637
from .._page import PageObject
@@ -79,8 +80,23 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> list[tuple[
7980
"""Extracts links from two pages on the assumption that the two pages are
8081
the same. Produces one list of (new link, old link) tuples.
8182
"""
82-
new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])]
83-
old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])]
83+
new_annotations = new_page.get("/Annots", ArrayObject()).get_object()
84+
old_annotations = old_page.get("/Annots", ArrayObject()).get_object()
85+
if is_null_or_none(new_annotations):
86+
new_annotations = ArrayObject()
87+
if is_null_or_none(old_annotations):
88+
old_annotations = ArrayObject()
89+
if not isinstance(new_annotations, ArrayObject) or not isinstance(old_annotations, ArrayObject):
90+
logger_warning(
91+
f"Expected annotation arrays: {old_annotations} {new_annotations}. Ignoring annotations.",
92+
__name__
93+
)
94+
return []
95+
if len(new_annotations) != len(old_annotations):
96+
logger_warning(f"Annotation sizes differ: {old_annotations} vs. {new_annotations}", __name__)
97+
98+
new_links = [_build_link(link, new_page) for link in new_annotations]
99+
old_links = [_build_link(link, old_page) for link in old_annotations]
84100

85101
return [
86102
(new_link, old_link) for (new_link, old_link)
@@ -110,7 +126,7 @@ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional
110126
return None # Nothing to do here
111127

112128

113-
def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]:
129+
def _create_link(reference: PdfObject, source_pdf: "PdfReader") -> Optional[ReferenceLink]:
114130
if isinstance(reference, TextStringObject):
115131
return NamedReferenceLink(reference, source_pdf)
116132
if isinstance(reference, ArrayObject):

tests/generic/test_link.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""Test the pypdf.generic._link module."""
2+
from io import BytesIO
3+
4+
import pytest
5+
6+
from pypdf import PageObject, PdfReader, PdfWriter
7+
from pypdf.generic import ArrayObject, NameObject, NullObject, extract_links
8+
from tests import get_data_from_url
9+
10+
11+
@pytest.mark.enable_socket
12+
def test_extract_links__null_object_in_old_page():
13+
url = "https://github.com/user-attachments/files/25507697/sample.pdf"
14+
name = "issue3656.pdf"
15+
reader = PdfReader(BytesIO(get_data_from_url(url=url, name=name)))
16+
17+
writer = PdfWriter()
18+
writer.append(reader)
19+
20+
21+
def test_extract_links(caplog):
22+
page1 = PageObject()
23+
page2 = PageObject()
24+
25+
# No annotations.
26+
assert extract_links(page1, page2) == []
27+
assert caplog.messages == []
28+
29+
# Only old annotations.
30+
page1[NameObject("/Annots")] = NullObject()
31+
assert extract_links(page1, page2) == []
32+
assert caplog.messages == []
33+
caplog.clear()
34+
35+
page1[NameObject("/Annots")] = ArrayObject([NullObject()])
36+
assert extract_links(page1, page2) == []
37+
assert caplog.messages == ["Annotation sizes differ: [] vs. [NullObject]"]
38+
caplog.clear()
39+
40+
# Both old and new annotations.
41+
page2[NameObject("/Annots")] = ArrayObject([NullObject()])
42+
assert extract_links(page1, page2) == []
43+
assert caplog.messages == [] # Same size.
44+
caplog.clear()
45+
46+
page2[NameObject("/Annots")] = NullObject()
47+
assert extract_links(page1, page2) == []
48+
assert caplog.messages == ["Annotation sizes differ: [] vs. [NullObject]"]
49+
caplog.clear()
50+
51+
# Only new annotations.
52+
del page1[NameObject("/Annots")]
53+
page2[NameObject("/Annots")] = ArrayObject([NullObject()])
54+
assert extract_links(page1, page2) == []
55+
assert caplog.messages == ["Annotation sizes differ: [NullObject] vs. []"]

tests/test_writer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2828,6 +2828,14 @@ def test_insert_filtered_annotations__annotations_are_no_list(caplog):
28282828
writer.append(reader)
28292829
font_file2 = reader.get_object(36).indirect_reference
28302830
assert caplog.messages == [
2831+
(
2832+
f"Expected annotation arrays: {{'/FontFile2': {font_file2!r}, "
2833+
"'/Descent': -269, '/CapHeight': 714, '/FontWeight': "
2834+
"300, '/FontName': '/JQJGLF+OpenSans-Light', '/ItalicAngle': 0, '/StemV': "
2835+
"48, '/Type': '/FontDescriptor', '/FontBBox': [-521, -269, 1140, 1048], "
2836+
"'/FontFamily': 'Open Sans Light', '/Flags': 32, '/XHeight': 531, "
2837+
"'/Ascent': 1048, '/FontStretch': '/Normal'} []. Ignoring annotations."
2838+
),
28312839
(
28322840
f"Expected list of annotations, got {{'/FontFile2': {font_file2!r}, "
28332841
"'/Descent': -269, '/CapHeight': 714, '/FontWeight': 300, '/FontName': '/JQJGLF+OpenSans-Light', "

0 commit comments

Comments
 (0)