Skip to content

Commit 648c627

Browse files
SEC: Improve the performance of the ASCIIHexDecode filter (#3666)
This now avoids manual byte-by-byte parsing completely and relies on built-in functionality (`bytes.find`, `bytes.split`, and `binascii.unhexlify`). Tests have shown that this drastically improves performance.
1 parent 1aef6fb commit 648c627

2 files changed

Lines changed: 27 additions & 28 deletions

File tree

pypdf/filters.py

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
__author__ = "Mathieu Fenniak"
3636
__author_email__ = "biziqe@mathieu.fenniak.net"
3737

38+
import binascii
3839
import math
3940
import os
4041
import shutil
@@ -343,34 +344,26 @@ def decode(
343344
"""
344345
if isinstance(data, str):
345346
data = data.encode()
346-
retval = b""
347-
hex_pair = b""
348-
index = 0
349-
while True:
350-
if index >= len(data):
351-
logger_warning(
352-
"missing EOD in ASCIIHexDecode, check if output is OK", __name__
353-
)
354-
break # Reached end of string without an EOD
355-
char = data[index : index + 1]
356-
if char == b">":
357-
break
358-
if char.isspace():
359-
index += 1
360-
continue
361-
hex_pair += char
362-
if len(hex_pair) == 2:
363-
retval += bytes((int(hex_pair, base=16),))
364-
hex_pair = b""
365-
index += 1
366-
# If the filter encounters the EOD marker after reading
367-
# an odd number of hexadecimal digits,
368-
# it shall behave as if a 0 (zero) followed the last digit.
369-
# For every even number of hexadecimal digits, hex_pair is reset to b"".
370-
if hex_pair != b"":
371-
hex_pair += b"0"
372-
retval += bytes((int(hex_pair, base=16),))
373-
return retval
347+
348+
# Stop at EOD
349+
eod = data.find(b">")
350+
if eod == -1:
351+
logger_warning(
352+
"missing EOD in ASCIIHexDecode, check if output is OK",
353+
__name__,
354+
)
355+
hex_data = data
356+
else:
357+
hex_data = data[:eod]
358+
359+
# Remove whitespace
360+
hex_data = b"".join(hex_data.split())
361+
362+
# Pad if odd length: an odd number of hex digits before EOD shall be treated as if a 0 (zero) followed the last digit
363+
if len(hex_data) % 2 == 1:
364+
hex_data += b"0"
365+
366+
return binascii.unhexlify(hex_data)
374367

375368

376369
class RunLengthDecode:

tests/test_filters.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,3 +1043,9 @@ def test_runlengthdecode__decode_limit():
10431043
# Use a very low limit for this exact comparison, otherwise *pytest* takes ages to render a failure diff.
10441044
with mock.patch("pypdf.filters.RUN_LENGTH_MAX_OUTPUT_LENGTH", uncompressed_size):
10451045
assert RunLengthDecode.decode(encoded) == b"A" * uncompressed_size
1046+
1047+
1048+
@pytest.mark.timeout(10)
1049+
def test_asciihexdecode__speed():
1050+
encoded = (b"41" * 1_200_000) + b">"
1051+
ASCIIHexDecode.decode(encoded)

0 commit comments

Comments
 (0)