diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 477d591..837766b 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -5,40 +5,38 @@ from urllib.parse import unquote from ..enums.charset import Charset -from .str_utils import code_unit_at class DecodeUtils: """A collection of decode utility methods used by the library.""" + # Compile a pattern that matches either a %uXXXX sequence or a %XX sequence. + UNESCAPE_PATTERN: t.Pattern[str] = re.compile( + r"%u(?P<unicode>[0-9A-Fa-f]{4})|%(?P<hex>[0-9A-Fa-f]{2})", + re.IGNORECASE, + ) + @classmethod def unescape(cls, string: str) -> str: - """A Python representation the deprecated JavaScript unescape function. - - https://developer.mozilla.org/en-US/docs/web/javascript/reference/global_objects/unescape """ - buffer: t.List[str] = [] - - i: int = 0 - while i < len(string): - c: int = code_unit_at(string, i) + A Python representation of the deprecated JavaScript unescape function. - if c == 0x25: - if string[i + 1] == "u": - buffer.append( - chr(int(string[i + 2 : i + 6], 16)), - ) - i += 6 - continue + This method replaces both "%XX" and "%uXXXX" escape sequences with + their corresponding characters. - buffer.append(chr(int(string[i + 1 : i + 3], 16))) - i += 3 - continue + Example: + unescape("%u0041%20%42") -> "A B" + """ - buffer.append(string[i]) - i += 1 + def replacer(match: t.Match[str]) -> str: + if (unicode_val := match.group("unicode")) is not None: + return chr(int(unicode_val, 16)) + elif (hex_val := match.group("hex")) is not None: + return chr(int(hex_val, 16)) + # match.group(0) is always non-None, so cast it to str for mypy. 
+ return t.cast(str, match.group(0)) - return "".join(buffer) + return cls.UNESCAPE_PATTERN.sub(replacer, string) @classmethod def decode( @@ -46,18 +44,24 @@ def decode( string: t.Optional[str], charset: t.Optional[Charset] = Charset.UTF8, ) -> t.Optional[str]: - """Decode a URL-encoded string.""" + """Decode a URL-encoded string. + + For non-UTF8 charsets (specifically Charset.LATIN1), it replaces plus + signs with spaces and applies a custom unescape for percent-encoded hex + sequences. Otherwise, it defers to urllib.parse.unquote. + """ if string is None: return None + # Replace '+' with ' ' before processing. string_without_plus: str = string.replace("+", " ") if charset == Charset.LATIN1: + # Only process hex escape sequences for Latin1. return re.sub( - r"%[0-9a-f]{2}", + r"%[0-9A-Fa-f]{2}", lambda match: cls.unescape(match.group(0)), string_without_plus, - flags=re.IGNORECASE, ) return unquote(string_without_plus) diff --git a/src/qs_codec/utils/encode_utils.py b/src/qs_codec/utils/encode_utils.py index 372506c..c3b3158 100644 --- a/src/qs_codec/utils/encode_utils.py +++ b/src/qs_codec/utils/encode_utils.py @@ -42,22 +42,33 @@ def escape( https://developer.mozilla.org/en-US/docs/web/javascript/reference/global_objects/escape """ - # Build a set of "safe" code points. + # Convert any non-BMP character into its surrogate pair representation. + string = cls._to_surrogates(string) + safe_points: t.Set[int] = cls.RFC1738_SAFE_POINTS if format == Format.RFC1738 else cls.SAFE_POINTS buffer: t.List[str] = [] - i: int - char: str - for i, char in enumerate(string): - # Use code_unit_at if it does more than ord() + i: int = 0 + while i < len(string): c: int = code_unit_at(string, i) + # If we detect a high surrogate and there is a following low surrogate, encode both. 
+ if 0xD800 <= c <= 0xDBFF and i + 1 < len(string): + next_c: int = code_unit_at(string, i + 1) + if 0xDC00 <= next_c <= 0xDFFF: + buffer.append(f"%u{c:04X}") + buffer.append(f"%u{next_c:04X}") + i += 2 + continue + if c in safe_points: - buffer.append(char) + buffer.append(string[i]) elif c < 256: buffer.append(f"%{c:02X}") else: buffer.append(f"%u{c:04X}") + i += 1 + return "".join(buffer) @classmethod diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py index 52c7f79..f4e85da 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/utils_test.py @@ -1,3 +1,4 @@ +import re import typing as t import pytest @@ -393,45 +394,119 @@ def test_decode_latin1(self, encoded: str, decoded: str) -> None: assert DecodeUtils.decode(encoded, charset=Charset.LATIN1) == decoded @pytest.mark.parametrize( - "unescaped, escaped", + "unescaped, escaped, format", [ - ("abc123", "abc123"), - ("äöü", "%E4%F6%FC"), - ("ć", "%u0107"), - ("@*_+-./", "@*_+-./"), - ("(", "%28"), - (")", "%29"), - (" ", "%20"), - ("~", "%7E"), - ( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@*_+-./", - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@*_+-./", - ), + # Basic alphanumerics (remain unchanged) + ("abc123", "abc123", None), + # Accented characters (Latin-1 range uses %XX) + ("äöü", "%E4%F6%FC", None), + # Non-ASCII that falls outside Latin-1 uses %uXXXX + ("ć", "%u0107", None), + # Characters that are defined as safe + ("@*_+-./", "@*_+-./", None), + # Parentheses: in RFC3986 they are encoded + ("(", "%28", None), + (")", "%29", None), + # Space character + (" ", "%20", None), + # Tilde is not in the safe set, so it is encoded + ("~", "%7E", None), + # Punctuation that is not safe: exclamation and comma + ("!", "%21", None), + (",", "%2C", None), + # Mixed safe and unsafe characters + ("hello world!", "hello%20world%21", None), + # Multiple spaces are each encoded + ("a b c", "a%20b%20c", None), + # A string with various punctuation + ("Hello, World!", "Hello%2C%20World%21", None), + 
# Null character should be encoded + ("\x00", "%00", None), + # Emoji (e.g. 😀 U+1F600) + ("😀", "%uD83D%uDE00", None), + # Test RFC1738 format: Parentheses are safe (left unchanged) + ("(", "(", Format.RFC1738), + (")", ")", Format.RFC1738), + # Mixed test with RFC1738: other unsafe characters are still encoded + ("(hello)!", "(hello)%21", Format.RFC1738), ], ) - def test_escape(self, unescaped: str, escaped: str) -> None: - assert EncodeUtils.escape(unescaped) == escaped + def test_escape(self, unescaped: str, escaped: str, format: t.Optional[Format]) -> None: + assert EncodeUtils.escape(unescaped, format=format) == escaped @pytest.mark.parametrize( "escaped, unescaped", [ + # No escapes. ("abc123", "abc123"), + # Hex escapes with uppercase hex digits. ("%E4%F6%FC", "äöü"), + # Hex escapes with lowercase hex digits. + ("%e4%f6%fc", "äöü"), + # Unicode escape. ("%u0107", "ć"), + # Unicode escape with lowercase digits. + ("%u0061", "a"), + # Characters that do not need escaping. ("@*_+-./", "@*_+-./"), + # Hex escapes for punctuation. ("%28", "("), ("%29", ")"), ("%20", " "), ("%7E", "~"), + # A long string with only safe characters. ( "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@*_+-./", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@*_+-./", ), + # A mix of Unicode and hex escapes. + ("%u0041%20%42", "A B"), + # A mix of literal text and hex escapes. + ("hello%20world", "hello world"), + # A literal percent sign that is not followed by a valid escape remains unchanged. + ("100% sure", "100% sure"), + # Mixed Unicode and hex escapes. + ("%u0041%65", "Ae"), # %u0041 -> "A", %65 -> "e" + # Escaped percent signs that do not form a valid escape remain unchanged. + ("50%% off", "50%% off"), + # Consecutive escapes producing multiple spaces. + ("%20%u0020", " "), + # An invalid escape sequence should remain unchanged. 
+ ("abc%g", "abc%g"), ], ) def test_unescape(self, escaped: str, unescaped: str) -> None: assert DecodeUtils.unescape(escaped) == unescaped + def test_unescape_fallback(self, monkeypatch: pytest.MonkeyPatch) -> None: + """ + Test that the unescape replacer falls back correctly when neither named group is set. + + We override the UNESCAPE_PATTERN to include a fallback alternative that matches a lone '%' + (i.e. a '%' not followed by 'u' or two hex digits). When unescape is called on a string + containing such a '%', the fallback branch in the replacer should return the matched '%' unchanged. + """ + + # Build a new pattern that, in addition to the normal valid escapes, matches a lone '%' + # using a fallback alternative. + new_pattern: t.Pattern[str] = re.compile( + r"%u(?P<unicode>[0-9A-Fa-f]{4})|%(?P<hex>[0-9A-Fa-f]{2})|%(?!u|[0-9A-Fa-f]{2})" + ) + monkeypatch.setattr(DecodeUtils, "UNESCAPE_PATTERN", new_pattern) + + # The input string contains a lone '%' (followed by a space, so it doesn't form a valid escape). + input_string: str = "100% sure" + # We expect the '%' to be left as-is (via the fallback branch). + expected_output: str = "100% sure" + + result: str = DecodeUtils.unescape(input_string) + assert result == expected_output + + # Optionally, you can also check with a string where the fallback alternative is the only match. + input_string2: str = "abc% def" + expected_output2: str = "abc% def" + assert DecodeUtils.unescape(input_string2) == expected_output2 + def test_merges_dict_with_list(self) -> None: assert Utils.merge({"0": "a"}, [Undefined(), "b"]) == {"0": "a", "1": "b"}