From 2b999a81f18d310a41cdb664f5abc7e10e26cbd8 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:53:12 +0100 Subject: [PATCH 01/29] :recycle: refactor DecodeOptions to support legacy decoders and add unified decode methods --- src/qs_codec/models/decode_options.py | 245 +++++++++++++++----------- 1 file changed, 142 insertions(+), 103 deletions(-) diff --git a/src/qs_codec/models/decode_options.py b/src/qs_codec/models/decode_options.py index b45ba97..c550b20 100644 --- a/src/qs_codec/models/decode_options.py +++ b/src/qs_codec/models/decode_options.py @@ -1,9 +1,13 @@ -"""This module contains the ``DecodeOptions`` class that configures the output of ``decode``.""" +"""This module contains the ``DecodeOptions`` class that configures the output of ``decode``. + +Keys are decoded identically to values by the default decoder; whether a decoded ``.`` splits +segments is controlled by parsing options (``allow_dots`` / ``decode_dot_in_keys``) elsewhere. +""" import inspect import typing as t from dataclasses import dataclass -from enum import Enum +from enum import Enum as _EnumBase from functools import wraps from ..enums.charset import Charset @@ -24,7 +28,11 @@ class DecodeOptions: """Set to ``True`` to decode percent‑encoded dots in keys (e.g., ``%2E`` → ``.``). Note: it implies ``allow_dots``, so ``decode`` will error if you set ``decode_dot_in_keys`` to ``True``, and ``allow_dots`` to ``False``. - When ``None`` (default), it defaults to ``False``.""" + When ``None`` (default), it defaults to ``False``. + + Inside bracket segments, percent-decoding naturally yields ``.`` from ``%2E/%2e``. 
This option controls whether + **top‑level** encoded dots are treated as additional split points; it does **not** affect the literal ``.`` produced + by percent-decoding inside bracket segments.""" allow_empty_lists: bool = False """Set to ``True`` to allow empty ``list`` values inside ``dict``\\s in the encoded input.""" @@ -127,6 +135,11 @@ class DecodeOptions: from the decoder uses ``None`` as the scalar value. """ + legacy_decoder: t.Optional[t.Callable[..., t.Optional[str]]] = None + """Back‑compat adapter for legacy decoders of the form ``decoder(value, charset)``. + Prefer ``decoder`` which may optionally accept a ``kind`` argument. When both are supplied, + ``decoder`` takes precedence (mirroring Kotlin/C# behavior).""" + def __post_init__(self) -> None: """Post-initialization.""" # Default `decode_dot_in_keys` first, then mirror into `allow_dots` when unspecified. @@ -139,109 +152,135 @@ def __post_init__(self) -> None: if self.decode_dot_in_keys and not self.allow_dots: raise ValueError("decode_dot_in_keys=True implies allow_dots=True") - # decoder setup + compatibility wrapper - if self.decoder is None: - self.decoder = DecodeUtils.decode - else: - user_dec = self.decoder - - # Precompute dispatch to avoid per-call introspection. 
- try: - sig = inspect.signature(user_dec) - params = sig.parameters - param_list = list(params.values()) - - has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in param_list) - has_var_pos = any(p.kind == inspect.Parameter.VAR_POSITIONAL for p in param_list) - - accepts_charset_pos = False - accepts_charset_kw = False - if "charset" in params: - p = params["charset"] - if p.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD): - accepts_charset_pos = True - if p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY): - accepts_charset_kw = True - if has_var_pos: + # decoder setup + compatibility wrapper (parity with Kotlin/C#): + # precedence is: user `decoder` > `legacy_decoder` > library default. + raw_dec = self.decoder + if raw_dec is None and self.legacy_decoder is not None: + raw_dec = self.legacy_decoder # legacy two-arg form; no kind + if raw_dec is None: + raw_dec = DecodeUtils.decode + + user_dec = raw_dec + + # Precompute dispatch to avoid per-call introspection. 
+ try: + sig = inspect.signature(user_dec) + params = sig.parameters + param_list = list(params.values()) + + has_var_kw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in param_list) + has_var_pos = any(p.kind == inspect.Parameter.VAR_POSITIONAL for p in param_list) + + accepts_charset_pos = False + accepts_charset_kw = False + if "charset" in params: + p = params["charset"] + if p.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD): accepts_charset_pos = True - - has_kind_param = "kind" in params - accepts_kind_kw = False + if p.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY): + accepts_charset_kw = True + if has_var_pos: + accepts_charset_pos = True + + has_kind_param = "kind" in params + accepts_kind_kw = False + accepts_kind_pos = False + if has_kind_param: + k = params["kind"] + accepts_kind_kw = k.kind in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + accepts_kind_pos = k.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + elif has_var_kw: + accepts_kind_kw = True # can pass via **kwargs accepts_kind_pos = False - if has_kind_param: - k = params["kind"] - accepts_kind_kw = k.kind in ( - inspect.Parameter.POSITIONAL_OR_KEYWORD, - inspect.Parameter.KEYWORD_ONLY, - ) - accepts_kind_pos = k.kind in ( - inspect.Parameter.POSITIONAL_ONLY, - inspect.Parameter.POSITIONAL_OR_KEYWORD, - ) - elif has_var_kw: - accepts_kind_kw = True # can pass via **kwargs - accepts_kind_pos = False - - # Decide how to represent `kind`: prefer string for maximum compatibility - pass_kind_as_str = True - if has_kind_param: - ann = params["kind"].annotation - if ann is inspect.Signature.empty: - pass_kind_as_str = True - else: - # If annotation is an Enum subclass (eg, DecodeKind), pass the enum. 
- if isinstance(ann, type): - pass_kind_as_str = not issubclass(ann, Enum) - else: - pass_kind_as_str = True - elif has_var_kw: - # With **kwargs but no explicit parameter, safest is to pass string + + # Decide how to represent `kind`: prefer string for maximum compatibility + pass_kind_as_str = True + if has_kind_param: + ann = params["kind"].annotation + if ann is inspect.Signature.empty: pass_kind_as_str = True + else: + if isinstance(ann, type): + pass_kind_as_str = not issubclass(ann, _EnumBase) + else: + pass_kind_as_str = True + elif has_var_kw: + pass_kind_as_str = True - def dispatch( - s: t.Optional[str], - charset: t.Optional[Charset], - kind: DecodeKind, - ) -> t.Optional[str]: - # Choose enum or string representation for `kind` - kind_arg: t.Union[DecodeKind, str] = kind.value if pass_kind_as_str else kind - args: t.List[t.Any] = [s] - kwargs: t.Dict[str, t.Any] = {} - if accepts_charset_pos: - args.append(charset) - elif accepts_charset_kw or has_var_kw: - kwargs["charset"] = charset - if accepts_kind_kw: - kwargs["kind"] = kind_arg - elif accepts_kind_pos: - args.append(kind_arg) - return user_dec(*args, **kwargs) - - except (TypeError, ValueError): - # Builtins/callables without retrievable signature: try the most compatible forms. - def dispatch( - s: t.Optional[str], - charset: t.Optional[Charset], - kind: DecodeKind, - ) -> t.Optional[str]: - # Mark `kind` as used to satisfy linters; legacy decoders ignore it. 
- _ = kind - try: - return user_dec(s) # type: ignore[misc] - except TypeError as e1: - try: - return user_dec(s, charset) # type: ignore[misc] - except TypeError as exc: - raise e1 from exc - - @wraps(user_dec) - def _adapter( + def dispatch( s: t.Optional[str], - charset: t.Optional[Charset] = Charset.UTF8, - *, - kind: DecodeKind = DecodeKind.VALUE, + charset: t.Optional[Charset], + kind: DecodeKind, ) -> t.Optional[str]: - """Adapter that dispatches based on the user decoder's signature.""" - return dispatch(s, charset, kind) - - self.decoder = _adapter + kind_arg: t.Union[DecodeKind, str] = kind.value if pass_kind_as_str else kind + args: t.List[t.Any] = [s] + kwargs: t.Dict[str, t.Any] = {} + if accepts_charset_pos: + args.append(charset) + elif accepts_charset_kw or has_var_kw: + kwargs["charset"] = charset + if accepts_kind_kw: + kwargs["kind"] = kind_arg + elif accepts_kind_pos: + args.append(kind_arg) + return user_dec(*args, **kwargs) + + except (TypeError, ValueError): + # Builtins/callables without retrievable signature: try the most compatible forms. + def dispatch( + s: t.Optional[str], + charset: t.Optional[Charset], + kind: DecodeKind, + ) -> t.Optional[str]: + _ = kind # ignored by legacy decoders + try: + return user_dec(s) # type: ignore[misc] + except TypeError as e1: + try: + return user_dec(s, charset) # type: ignore[misc] + except TypeError as exc: + raise e1 from exc + + @wraps(user_dec) + def _adapter( + s: t.Optional[str], + charset: t.Optional[Charset] = Charset.UTF8, + *, + kind: DecodeKind = DecodeKind.VALUE, + ) -> t.Optional[str]: + """Adapter that dispatches based on the user decoder's signature.""" + return dispatch(s, charset, kind) + + self.decoder = _adapter + + # --- Convenience methods (parity with Kotlin) --- + def decode( + self, value: t.Optional[str], charset: t.Optional[Charset] = None, *, kind: DecodeKind = DecodeKind.VALUE + ) -> t.Optional[t.Any]: + """Unified scalar decode with key/value context. 
+ + Uses the configured ``decoder`` (or ``legacy_decoder``) when provided; otherwise falls back + to :meth:`DecodeUtils.decode`. The default library behavior decodes keys identically to + values; whether a ``.`` participates in key splitting is decided later by the parser. + """ + # ``self.decoder`` has been normalized to accept (s, charset, *, kind) + d = self.decoder + if d is None: + # Should not happen because we always set an adapter, but keep a safe fallback. + return DecodeUtils.decode(value, charset or self.charset) + return d(value, charset or self.charset, kind=kind) + + def decode_key(self, value: t.Optional[str], charset: t.Optional[Charset] = None) -> t.Optional[str]: + """Decode a key (or key segment). Always returns a string or ``None``.""" + out = self.decode(value, charset, kind=DecodeKind.KEY) + return None if out is None else str(out) + + def decode_value(self, value: t.Optional[str], charset: t.Optional[Charset] = None) -> t.Optional[t.Any]: + """Decode a value token. Returns any scalar or ``None``.""" + return self.decode(value, charset, kind=DecodeKind.VALUE) From 316a61b7b50c4409496a102f799f198c74a69121 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:53:23 +0100 Subject: [PATCH 02/29] :bug: fix top-level dot splitting in keys to preserve encoded dots and handle degenerate cases --- src/qs_codec/utils/decode_utils.py | 106 ++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 25 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 1760141..ec214b1 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -4,6 +4,7 @@ - Decoding handles both UTF‑8 and Latin‑1 code paths. - Key splitting keeps bracket groups *balanced* and optionally treats dots as path separators when ``allow_dots=True``. 
+- Top‑level dot splitting uses a character‑scanner that preserves leading/trailing dots, `.[]` degenerates, and never splits on percent‑encoded dots. """ import re @@ -32,6 +33,77 @@ class DecodeUtils: # "a.b[c]" becomes "a[b][c]" before bracket parsing. DOT_TO_BRACKET: t.Pattern[str] = re.compile(r"\.([^.\[]+)") + @classmethod + def dot_to_bracket_top_level(cls, s: str) -> str: + """Convert top‑level dot segments into bracket groups, preserving dots inside brackets and handling degenerate top‑level dots. + + Rules: + - Only dots at depth == 0 split. Dots inside '[]' are preserved. + - Percent-encoded dots ('%2E'/'%2e') never split here. + - Degenerate cases: + * leading '.' is preserved ('.a' stays '.a') + * '.[' is skipped so 'a.[b]' behaves like 'a[b]' + * 'a..b' preserves the first dot → 'a.[b]' + * trailing '.' is preserved and ignored by the splitter + + Examples: + 'user.email.name' -> 'user[email][name]' + 'a[b].c' -> 'a[b][c]' + 'a[.].c' -> 'a[.][c]' + 'a%2E[b]' -> 'a%2E[b]' + """ + if "." not in s: + return s + sb: t.List[str] = [] + depth = 0 + i = 0 + n = len(s) + while i < n: + ch = s[i] + if ch == "[": + depth += 1 + sb.append(ch) + i += 1 + elif ch == "]": + if depth > 0: + depth -= 1 + sb.append(ch) + i += 1 + elif ch == ".": + if depth == 0: + has_next = i + 1 < n + next_ch = s[i + 1] if has_next else "\0" + if i == 0: + # leading '.' is preserved + sb.append(".") + i += 1 + elif next_ch == "[": + # skip the dot so 'a.[b]' acts like 'a[b]' + i += 1 + elif (not has_next) or next_ch == ".": + # trailing dot, or first of a double dot + sb.append(".") + i += 1 + else: + # normal split: take token until next '.' or '[' + start = i + 1 + j = start + while j < n and s[j] != "." 
and s[j] != "[": + j += 1 + sb.append("[") + sb.append(s[start:j]) + sb.append("]") + i = j + else: + sb.append(".") + i += 1 + else: + # also preserve percent sequences verbatim at top level; + # we don't split on '%2E' here + sb.append(ch) + i += 1 + return "".join(sb) + # Precompiled pattern for %XX hex bytes (Latin-1 path fast path) HEX2_PATTERN: t.Pattern[str] = re.compile(r"%([0-9A-Fa-f]{2})") @@ -69,7 +141,7 @@ def decode( cls, string: t.Optional[str], charset: t.Optional[Charset] = Charset.UTF8, - kind: DecodeKind = DecodeKind.VALUE, + kind: DecodeKind = DecodeKind.VALUE, # pylint: disable=unused-argument ) -> t.Optional[str]: """Decode a URL‑encoded scalar. @@ -77,7 +149,7 @@ def decode( - Replace ``+`` with a literal space *before* decoding. - If ``charset`` is :data:`~qs_codec.enums.charset.Charset.LATIN1`, decode only ``%XX`` byte sequences (no ``%uXXXX``). ``%uXXXX`` sequences are left as‑is to mimic older browser/JS behavior. - Otherwise (UTF‑8), defer to :func:`urllib.parse.unquote`. - - When ``kind=DecodeKind.KEY``, preserve percent-encoded dots (``%2E``/``%2e``) so key splitting honors ``allow_dots``/``decode_dot_in_keys``. Values always decode fully. + - Keys and values are decoded identically; whether a literal ``.`` acts as a key separator is decided later by the key‑splitting logic. Returns ------- @@ -95,22 +167,9 @@ def decode( s = string_without_plus if "%" not in s: return s - if kind is DecodeKind.KEY: - - def _latin1_key_replacer(m: t.Match[str]) -> str: - hx = m.group(1) - if hx.lower() == "2e": # keep %2E/%2e literal in keys - return "%" + hx - return chr(int(hx, 16)) - - return cls.HEX2_PATTERN.sub(_latin1_key_replacer, s) - else: - return cls.HEX2_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), s) + return cls.HEX2_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), s) s = string_without_plus - if kind is DecodeKind.KEY and "%2" in s: - # Protect encoded dots so unquote does not turn them into literal '.' 
for keys - s = s.replace("%2E", "%252E").replace("%2e", "%252e") return s if "%" not in s else unquote(s) @classmethod @@ -123,7 +182,7 @@ def split_key_into_segments( ) -> t.List[str]: """Split a composite key into *balanced* bracket segments. - - If ``allow_dots`` is True, convert dots to bracket groups first (``a.b[c]`` → ``a[b][c]``) while leaving existing brackets intact. + - If ``allow_dots`` is True, convert **top‑level** dots to bracket groups using a character‑scanner (``a.b[c]`` → ``a[b][c]``), preserving dots inside brackets and degenerate cases. - The *parent* (non‑bracket) prefix becomes the first segment, e.g. ``"a[b][c]"`` → ``["a", "[b]", "[c]"]``. - Bracket groups are *balanced* using a counter so nested brackets within a single group (e.g. ``"[with[inner]]"``) are treated as one segment. - When ``max_depth <= 0``, no splitting occurs; the key is returned as a single segment (qs semantics). @@ -131,13 +190,10 @@ def split_key_into_segments( This runs in O(n) time over the key string. """ - if allow_dots and "." in original_key: - key: str = cls.DOT_TO_BRACKET.sub(r"[\g<1>]", original_key) - else: - key = original_key - if max_depth <= 0: - return [key] + return [original_key] + + key: str = cls.dot_to_bracket_top_level(original_key) if allow_dots else original_key segments: t.List[str] = [] @@ -170,7 +226,7 @@ def split_key_into_segments( i += 1 if close < 0: - break # unterminated group; stop collecting + break # unterminated group; stop collecting; remainder handled below # Append the full balanced group, including the surrounding brackets. 
segments.append(key[open_idx : close + 1]) # includes the surrounding [ ] @@ -180,7 +236,7 @@ def split_key_into_segments( if open_idx >= 0: if strict_depth: raise IndexError(f"Input depth exceeded depth option of {max_depth} and strict_depth is True") - # Stash the remainder as a single segment (qs behavior) + # Stash the remainder as a single segment (qs/Kotlin parity) segments.append("[" + key[open_idx:] + "]") return segments From 6c8b384d5f75756ef234d0891a21f762eaa050cd Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:53:29 +0100 Subject: [PATCH 03/29] :bug: normalize percent-encoded dots in bracketed keys when decode_dot_in_keys is enabled --- src/qs_codec/decode.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/qs_codec/decode.py b/src/qs_codec/decode.py index bb08095..c325425 100644 --- a/src/qs_codec/decode.py +++ b/src/qs_codec/decode.py @@ -297,6 +297,13 @@ def _parse_object( ----- - Builds lists when encountering ``[]`` (respecting ``allow_empty_lists`` and null handling). - Converts bracketed numeric segments into list indices when allowed and within ``list_limit``. + - When ``list_limit`` is negative, numeric bracket indices are treated as + *map keys* (list growth is disabled). If ``raise_on_limit_exceeded`` is + True, any list-growth operation (empty brackets, comma-split, nested pushes) + will raise immediately. + - Inside bracket segments, a custom key decoder may leave percent-encoded dots + (``%2E/%2e``). When ``decode_dot_in_keys`` is True, these are normalized to + ``.`` here. Top‑level dot splitting is already handled by the splitter. - When list parsing is disabled and an empty segment is encountered, coerces to ``{"0": leaf}`` to preserve round-trippability with other ports. """ current_list_length: int = 0 @@ -329,7 +336,12 @@ def _parse_object( else: obj = dict() - # Optionally treat `%2E` as a literal dot (when `decode_dot_in_keys` is enabled). 
+ # Map `%2E`/`%2e` to a literal dot *inside bracket segments* when + # `decode_dot_in_keys` is enabled. Even though `_parse_query_string_values` + # typically percent‑decodes the key (default decoder), a custom + # `DecodeOptions.decoder` may return the raw token. In that case, `%2E` can + # still appear here and must be normalized for parity with the Kotlin/C# ports. + # (Top‑level dot splitting is performed earlier by the key splitter.) clean_root: str = root[1:-1] if root.startswith("[") and root.endswith("]") else root if options.decode_dot_in_keys: From 2e0e688597b91e9f0e9d6dfbc937e61beedf68e7 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:53:36 +0100 Subject: [PATCH 04/29] :white_check_mark: add tests for DecodeOptions dot-in-keys and custom decoder behaviors --- tests/unit/decode_options_test.py | 83 +++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tests/unit/decode_options_test.py b/tests/unit/decode_options_test.py index 525b4f7..81ee009 100644 --- a/tests/unit/decode_options_test.py +++ b/tests/unit/decode_options_test.py @@ -128,3 +128,86 @@ def test_parse_lists_toggle_does_not_leak_across_calls(self) -> None: # Second call should still parse lists as lists res2 = decode("a[]=1&a[]=2", opts) assert res2 == {"a": ["1", "2"]} + + +class TestAllowDotsDecodeDotInKeysInterplay: + def test_constructor_invalid_combination_throws(self) -> None: + import pytest + + with pytest.raises((ValueError, AssertionError, TypeError)): + DecodeOptions(decode_dot_in_keys=True, allow_dots=False) + + +class TestDefaultDecodeKeyEncodedDots: + def test_key_maps_2e_inside_brackets_allowdots_true(self) -> None: + for cs in (Charset.UTF8, Charset.LATIN1): + opts = DecodeOptions(allow_dots=True, charset=cs) + assert opts.decoder("a[%2E]", cs, kind=DecodeKind.KEY) == "a[.]" + assert opts.decoder("a[%2e]", cs, kind=DecodeKind.KEY) == "a[.]" + + def test_key_maps_2e_outside_brackets_allowdots_true_independent_of_decodeopt(self) -> 
None: + for cs in (Charset.UTF8, Charset.LATIN1): + opts1 = DecodeOptions(allow_dots=True, decode_dot_in_keys=False, charset=cs) + opts2 = DecodeOptions(allow_dots=True, decode_dot_in_keys=True, charset=cs) + assert opts1.decoder("a%2Eb", cs, kind=DecodeKind.KEY) == "a.b" + assert opts2.decoder("a%2Eb", cs, kind=DecodeKind.KEY) == "a.b" + + def test_non_key_decodes_2e_to_dot_control(self) -> None: + for cs in (Charset.UTF8, Charset.LATIN1): + opts = DecodeOptions(allow_dots=True, charset=cs) + assert opts.decoder("a%2Eb", cs, kind=DecodeKind.VALUE) == "a.b" + + def test_key_maps_2e_inside_brackets_allowdots_false(self) -> None: + for cs in (Charset.UTF8, Charset.LATIN1): + opts = DecodeOptions(allow_dots=False, charset=cs) + assert opts.decoder("a[%2E]", cs, kind=DecodeKind.KEY) == "a[.]" + assert opts.decoder("a[%2e]", cs, kind=DecodeKind.KEY) == "a[.]" + + def test_key_outside_2e_decodes_to_dot_allowdots_false(self) -> None: + for cs in (Charset.UTF8, Charset.LATIN1): + opts = DecodeOptions(allow_dots=False, charset=cs) + assert opts.decoder("a%2Eb", cs, kind=DecodeKind.KEY) == "a.b" + assert opts.decoder("a%2eb", cs, kind=DecodeKind.KEY) == "a.b" + + +class TestCustomDecoderBehavior: + def test_decode_key_decodes_percent_sequences_like_values_when_decode_dot_in_keys_false(self) -> None: + opts = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert opts.decoder("a%2Eb", Charset.UTF8, kind=DecodeKind.KEY) == "a.b" + assert opts.decoder("a%2eb", Charset.UTF8, kind=DecodeKind.KEY) == "a.b" + + def test_decode_value_decodes_percent_sequences_normally(self) -> None: + opts = DecodeOptions() + assert opts.decoder("%2E", Charset.UTF8, kind=DecodeKind.VALUE) == "." 
+ + def test_decoder_is_used_for_key_and_value(self) -> None: + calls: list[tuple[str | None, DecodeKind]] = [] + + def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: # type: ignore[override] + calls.append((s, kind)) + return s + + opts = DecodeOptions(decoder=dec) + assert opts.decoder("x", Charset.UTF8, kind=DecodeKind.KEY) == "x" + assert opts.decoder("y", Charset.UTF8, kind=DecodeKind.VALUE) == "y" + + assert len(calls) == 2 + assert calls[0][1] is DecodeKind.KEY and calls[0][0] == "x" + assert calls[1][1] is DecodeKind.VALUE and calls[1][0] == "y" + + def test_decoder_null_return_is_honored(self) -> None: + def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: # type: ignore[override] + return None + + opts = DecodeOptions(decoder=dec) + assert opts.decoder("foo", Charset.UTF8, kind=DecodeKind.VALUE) is None + assert opts.decoder("bar", Charset.UTF8, kind=DecodeKind.KEY) is None + + def test_single_decoder_acts_like_legacy_when_ignoring_kind(self) -> None: + def dec(s: str | None, *args, **kwargs): # type: ignore[no-untyped-def] + return None if s is None else s.upper() + + opts = DecodeOptions(decoder=dec) + assert opts.decoder("abc", Charset.UTF8, kind=DecodeKind.VALUE) == "ABC" + # For keys, custom decoder gets the raw token; no default percent-decoding happens first. 
+ assert opts.decoder("a%2Eb", Charset.UTF8, kind=DecodeKind.KEY) == "A%2EB" From 8ed48ab794c3dbebdcf6259ef8ea8d0490fd6bab Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:53:40 +0100 Subject: [PATCH 05/29] :white_check_mark: add CSharp parity tests for encoded dot behavior in DecodeOptions --- tests/unit/decode_test.py | 131 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/tests/unit/decode_test.py b/tests/unit/decode_test.py index acb573c..6b57d43 100644 --- a/tests/unit/decode_test.py +++ b/tests/unit/decode_test.py @@ -1430,3 +1430,134 @@ def test_parse_lists_toggle_does_not_leak_across_calls(self) -> None: # Second call should still parse lists as lists res2 = decode("a[]=1&a[]=2", opts) assert res2 == {"a": ["1", "2"]} + + +class TestCSharpParityEncodedDotBehavior: + def test_top_level_allowdots_true_decodedot_true_splits_plain_and_encoded_dot(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a.b=c", opt) == {"a": {"b": "c"}} + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + assert decode("a%2eb=c", opt) == {"a": {"b": "c"}} + + def test_top_level_allowdots_true_decodedot_false_encoded_dot_also_splits(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + assert decode("a%2eb=c", opt) == {"a": {"b": "c"}} + + def test_invalid_allowdots_false_decodedot_true_raises(self) -> None: + with pytest.raises(ValueError): + decode("a%2Eb=c", DecodeOptions(allow_dots=False, decode_dot_in_keys=True)) + + def test_bracket_segment_maps_to_dot_when_decodedot_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a[%2E]=x", opt) == {"a": {".": "x"}} + assert decode("a[%2e]=x", opt) == {"a": {".": "x"}} + + def test_bracket_segment_percent_decoding_inside_brackets_when_decodedot_false(self) -> None: + opt = DecodeOptions(allow_dots=True, 
decode_dot_in_keys=False) + # Note: key-decoder percent-decodes inside brackets, so %2E → "." + assert decode("a[%2E]=x", opt) == {"a": {".": "x"}} + assert decode("a[%2e]=x", opt) == {"a": {".": "x"}} + + def test_value_tokens_always_decode_percent2E_to_dot(self) -> None: + assert decode("x=%2E") == {"x": "."} + + def test_latin1_allowdots_true_decodedot_true_matches_utf8(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True, charset=Charset.LATIN1) + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + assert decode("a[%2E]=x", opt) == {"a": {".": "x"}} + + def test_latin1_allowdots_true_decodedot_false_also_splits_top_level_and_decodes_inside_brackets(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=False, charset=Charset.LATIN1) + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + assert decode("a[%2E]=x", opt) == {"a": {".": "x"}} + + def test_percent_decoding_applies_inside_brackets_when_decoding_keys(self) -> None: + # Kotlin's DecodeOptions.decode(KEY) equivalent behavior exercised via full parse: + # %2E inside a bracket segment becomes '.' 
regardless of allow_dots + o1 = DecodeOptions(allow_dots=False, decode_dot_in_keys=False) + o2 = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert decode("a[%2Eb]=x", o1) == {"a": {".b": "x"}} + assert decode("a[b%2Ec]=x", o1) == {"a": {"b.c": "x"}} + assert decode("a[%2Eb]=x", o2) == {"a": {".b": "x"}} + assert decode("a[b%2Ec]=x", o2) == {"a": {"b.c": "x"}} + + def test_mixed_case_encoded_brackets_plus_encoded_dot_after_brackets(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + # Uppercase + assert decode("a%5Bb%5D%5Bc%5D%2Ed=x", opt) == {"a": {"b": {"c": {"d": "x"}}}} + # Lowercase + assert decode("a%5bb%5d%5bc%5d%2ed=x", opt) == {"a": {"b": {"c": {"d": "x"}}}} + + def test_nested_brackets_inside_a_segment_balanced_as_one_segment(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + # "a[b%5Bc%5D].e=x" → key "b[c]" stays a single segment; then ".e" splits (allow_dots) + assert decode("a[b%5Bc%5D].e=x", opt) == {"a": {"b[c]": {"e": "x"}}} + + def test_mixed_case_encoded_brackets_and_encoded_dot_with_inconsistent_options_raises(self) -> None: + with pytest.raises(ValueError): + decode("a%5Bb%5D%5Bc%5D%2Ed=x", DecodeOptions(allow_dots=False, decode_dot_in_keys=True)) + + def test_top_level_encoded_dot_splits_when_allowdots_true_decodedot_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + + def test_top_level_encoded_dot_also_splits_when_allowdots_true_decodedot_false(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert decode("a%2Eb=c", opt) == {"a": {"b": "c"}} + + def test_top_level_encoded_dot_does_not_split_when_allowdots_false_decodedot_false(self) -> None: + opt = DecodeOptions(allow_dots=False, decode_dot_in_keys=False) + assert decode("a%2Eb=c", opt) == {"a.b": "c"} + + def test_bracket_then_encoded_dot_to_next_segment_with_allowdots_true(self) -> None: + opt = 
DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a[b]%2Ec=x", opt) == {"a": {"b": {"c": "x"}}} + assert decode("a[b]%2ec=x", opt) == {"a": {"b": {"c": "x"}}} + + def test_mixed_case_top_level_encoded_dot_then_bracket_with_allowdots_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a%2E[b]=x", opt) == {"a": {"b": "x"}} + + def test_top_level_lowercase_encoded_dot_splits_when_allowdots_true_decodedot_false(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert decode("a%2eb=c", opt) == {"a": {"b": "c"}} + + def test_dot_before_index_with_allowdots_true_index_remains_index(self) -> None: + opt = DecodeOptions(allow_dots=True) + assert decode("foo[0].baz[0]=15&foo[0].bar=2", opt) == {"foo": [{"baz": ["15"], "bar": "2"}]} + + def test_trailing_dot_ignored_when_allowdots_true(self) -> None: + opt = DecodeOptions(allow_dots=True) + assert decode("user.email.=x", opt) == {"user": {"email": "x"}} + + def test_bracket_segment_encoded_dot_mapped_to_dot_when_decodedot_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a[%2E]=x", opt) == {"a": {".": "x"}} + assert decode("a[%2e]=x", opt) == {"a": {".": "x"}} + + def test_top_level_encoded_dot_before_bracket_lowercase_with_allowdots_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a%2e[b]=x", opt) == {"a": {"b": "x"}} + + def test_plain_dot_before_bracket_with_allowdots_true(self) -> None: + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("a.[b]=x", opt) == {"a": {"b": "x"}} + + def test_kind_aware_decoder_receives_key_for_top_level_and_bracketed_keys(self) -> None: + calls: t.List[t.Tuple[t.Optional[str], DecodeKind]] = [] + + def _decoder(s: t.Optional[str], charset: t.Optional[Charset], *, kind: DecodeKind = DecodeKind.VALUE) -> t.Any: + calls.append((s, kind)) + return s + + opt = 
DecodeOptions(allow_dots=True, decode_dot_in_keys=True, decoder=_decoder) + assert ( + decode("a%2Eb=c&a[b]=d", opt) == {"a": {"b": "c"}, "a": {"b": "d"}} + if False + else decode("a%2Eb=c&a[b]=d", opt) + ) # no-op, ensure call + + # Confirm both KEY invocations observed: raw top-level key and raw bracketed key + assert any(k == DecodeKind.KEY and (s == "a%2Eb" or s == "a[b]") for (s, k) in calls) + assert any(k == DecodeKind.VALUE and (s == "c" or s == "d") for (s, k) in calls) From 9a3ce18817e6cb6ab69f4c2e60cd4b94abd17857 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 17:59:01 +0100 Subject: [PATCH 06/29] :white_check_mark: add tests for decoder precedence over legacy_decoder and non-string decoder results in DecodeOptions --- tests/unit/decode_options_test.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/unit/decode_options_test.py b/tests/unit/decode_options_test.py index 81ee009..fce0615 100644 --- a/tests/unit/decode_options_test.py +++ b/tests/unit/decode_options_test.py @@ -211,3 +211,33 @@ def dec(s: str | None, *args, **kwargs): # type: ignore[no-untyped-def] assert opts.decoder("abc", Charset.UTF8, kind=DecodeKind.VALUE) == "ABC" # For keys, custom decoder gets the raw token; no default percent-decoding happens first. 
assert opts.decoder("a%2Eb", Charset.UTF8, kind=DecodeKind.KEY) == "A%2EB" + + def test_decoder_wins_over_legacy_decoder_when_both_provided(self) -> None: + # decoder must take precedence over legacy_decoder (parity with Kotlin/C#) + def legacy(v: str | None, charset: Charset | None = None) -> str | None: + return f"L:{'null' if v is None else v}" + + def dec( + v: str | None, + charset: Charset | None = None, + *, + kind: DecodeKind = DecodeKind.VALUE, + ) -> str | None: + return f"K:{kind.name}:{'null' if v is None else v}" + + opts = DecodeOptions(decoder=dec, legacy_decoder=legacy) + assert opts.decode_key("x") == "K:KEY:x" + assert opts.decode_value("y") == "K:VALUE:y" + + def test_decode_key_coerces_non_string_decoder_result(self) -> None: + # When the decoder returns a non-string scalar, decode_key coerces it via str() + def dec( + v: str | None, + charset: Charset | None = None, + *, + kind: DecodeKind = DecodeKind.VALUE, + ) -> object | None: + return 42 if v is not None else None + + opts = DecodeOptions(decoder=dec) + assert opts.decode_key("anything") == "42" From a4dbc60073398f5e3c9e82a0a2aaf35e59338189 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 18:23:32 +0100 Subject: [PATCH 07/29] :bug: handle leading dot in keys by converting to bracket segment in dot_to_bracket_top_level --- src/qs_codec/utils/decode_utils.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index ec214b1..d6844d4 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -29,10 +29,6 @@ class DecodeUtils: re.IGNORECASE, ) - # When `allow_dots=True`, convert ".foo" segments into "[foo]" so that - # "a.b[c]" becomes "a[b][c]" before bracket parsing. 
- DOT_TO_BRACKET: t.Pattern[str] = re.compile(r"\.([^.\[]+)") - @classmethod def dot_to_bracket_top_level(cls, s: str) -> str: """Convert top‑level dot segments into bracket groups, preserving dots inside brackets and handling degenerate top‑level dots. @@ -41,7 +37,7 @@ def dot_to_bracket_top_level(cls, s: str) -> str: - Only dots at depth == 0 split. Dots inside '[]' are preserved. - Percent-encoded dots ('%2E'/'%2e') never split here. - Degenerate cases: - * leading '.' is preserved ('.a' stays '.a') + * leading '.' starts a bracket segment ('.a' behaves like '[a]') * '.[' is skipped so 'a.[b]' behaves like 'a[b]' * 'a..b' preserves the first dot → 'a.[b]' * trailing '.' is preserved and ignored by the splitter @@ -73,19 +69,26 @@ def dot_to_bracket_top_level(cls, s: str) -> str: if depth == 0: has_next = i + 1 < n next_ch = s[i + 1] if has_next else "\0" - if i == 0: - # leading '.' is preserved - sb.append(".") - i += 1 - elif next_ch == "[": + if next_ch == "[": # skip the dot so 'a.[b]' acts like 'a[b]' i += 1 + elif i == 0: + # leading '.' starts a bracket segment: ".a" -> "[a]" + start = i + 1 + j = start + while j < n and s[j] != "." and s[j] != "[": + j += 1 + sb.append("[") + sb.append(s[start:j]) + sb.append("]") + i = j elif (not has_next) or next_ch == ".": # trailing dot, or first of a double dot sb.append(".") i += 1 else: - # normal split: take token until next '.' or '[' + # normal split (also handles leading '.'): convert a.b → a[b] + # and '.a' → '[a]' at top level start = i + 1 j = start while j < n and s[j] != "." 
and s[j] != "[": From 8fee561ec7abf69fe5f9f642d6a4d6b5661ada0c Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 18:23:39 +0100 Subject: [PATCH 08/29] :white_check_mark: add tests for dot encoding and decoding parity across DecodeOptions configurations --- tests/unit/decode_test.py | 61 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/unit/decode_test.py b/tests/unit/decode_test.py index 6b57d43..91d3a8d 100644 --- a/tests/unit/decode_test.py +++ b/tests/unit/decode_test.py @@ -1561,3 +1561,64 @@ def _decoder(s: t.Optional[str], charset: t.Optional[Charset], *, kind: DecodeKi # Confirm both KEY invocations observed: raw top-level key and raw bracketed key assert any(k == DecodeKind.KEY and (s == "a%2Eb" or s == "a[b]") for (s, k) in calls) assert any(k == DecodeKind.VALUE and (s == "c" or s == "d") for (s, k) in calls) + + +class TestAdditionalDotEncodingParity: + def test_allowdots_false_decodedot_false_encoded_dots_decode_to_literal_no_split(self) -> None: + # allowDots=false, decodeDotInKeys=false: encoded dots decode to literal '.'; no dot-splitting + opt = DecodeOptions(allow_dots=False, decode_dot_in_keys=False) + assert decode("a%2Eb=c", opt) == {"a.b": "c"} + assert decode("a%2eb=c", opt) == {"a.b": "c"} + + def test_allowdots_true_decodedot_false_double_encoded_preserved_inside_segments(self) -> None: + # allowDots=true, decodeDotInKeys=false: double-encoded dots are preserved inside segments; encoded and plain dots split + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + # Plain dot splits + assert decode("a.b=c", opt) == {"a": {"b": "c"}} + # Encoded dot stays encoded inside first segment (no extra split) + assert decode("name%252Eobj.first=John", opt) == {"name%2Eobj": {"first": "John"}} + # Lowercase variant inside first segment ("a%2eb.c=d") + assert decode("a%2eb.c=d", opt) == {"a": {"b": {"c": "d"}}} + + def 
test_allowdots_true_decodedot_true_encoded_dots_become_literal_inside_segment(self) -> None: + # allowDots=true, decodeDotInKeys=true: encoded dots become literal '.' inside a segment (no extra split) + opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True) + assert decode("name%252Eobj.first=John", opt) == {"name.obj": {"first": "John"}} + # Double-encoded single segment becomes a literal dot after post-split mapping + assert decode("a%252Eb=c", opt) == {"a.b": "c"} + # Lowercase mapping as well in a bracket segment + assert decode("a[%2e]=x", opt) == {"a": {".": "x"}} + + def test_bracket_segment_percent2e_mapped_based_on_decodedotinkeys_case_insensitive(self) -> None: + # When disabled, percent-decoding inside brackets yields '.' (no extra split) + assert decode("a[%2E]=x", DecodeOptions(allow_dots=False, decode_dot_in_keys=False)) == {"a": {".": "x"}} + assert decode("a[%2e]=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=False)) == {"a": {".": "x"}} + # When enabled, convert to '.' 
regardless of case + assert decode("a[%2E]=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=True)) == {"a": {".": "x"}} + # Inconsistent options should raise at construction; mirrored here via the call + with pytest.raises(ValueError): + decode("a[%2e]=x", DecodeOptions(allow_dots=False, decode_dot_in_keys=True)) + + def test_bare_key_behavior_matches_key_decoding_path(self) -> None: + # allowDots=false → %2E decodes to '.'; no splitting because allowDots=false; strict null → None + opt1 = DecodeOptions(allow_dots=False, decode_dot_in_keys=False, strict_null_handling=True) + assert decode("a%2Eb", opt1) == {"a.b": None} + # allowDots=true & decodeDotInKeys=false → %2E is percent-decoded to '.' before dot-splitting, so the key splits into a → b; empty value default + opt2 = DecodeOptions(allow_dots=True, decode_dot_in_keys=False) + assert decode("a%2Eb", opt2) == {"a": {"b": ""}} + + def test_depth_zero_with_allowdots_true_does_not_split_key(self) -> None: + # depth=0 with allowDots=true: do not split key + opt = DecodeOptions(allow_dots=True, depth=0) + assert decode("a.b=c", opt) == {"a.b": "c"} + + def test_top_level_dot_to_bracket_guardrails_leading_trailing_double(self) -> None: + # Leading dot: ".a" should yield {"a": ...} when allowDots=true + assert decode(".a=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=False)) == {"a": "x"} + + # Trailing dot: "a."
should NOT create an empty bracket segment; remains literal + assert decode("a.=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=False)) == {"a.": "x"} + + # Double dots: only the second dot (before a token) causes a split; the empty middle segment is preserved + # as a literal dot in the parent key (no [] is created) + assert decode("a..b=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=False)) == {"a.": {"b": "x"}} From b4cb14a0e8d12c74eeb1e7747b12e78e0de59e45 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 18:43:13 +0100 Subject: [PATCH 09/29] :bulb: update docstring to clarify handling of degenerate cases and unterminated brackets in top-level dot splitting --- src/qs_codec/utils/decode_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index d6844d4..e741bdf 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -4,7 +4,7 @@ - Decoding handles both UTF‑8 and Latin‑1 code paths. - Key splitting keeps bracket groups *balanced* and optionally treats dots as path separators when ``allow_dots=True``. -- Top‑level dot splitting uses a character‑scanner that preserves leading/trailing dots, `.[]` degenerates, and never splits on percent‑encoded dots. +- Top‑level dot splitting uses a character‑scanner that handles degenerate cases (leading '.' starts a bracket segment; '.[' is skipped; double dots preserve the first; trailing '.' is preserved) and never splits on percent‑encoded dots. """ import re @@ -190,6 +190,7 @@ def split_key_into_segments( - Bracket groups are *balanced* using a counter so nested brackets within a single group (e.g. ``"[with[inner]]"``) are treated as one segment. - When ``max_depth <= 0``, no splitting occurs; the key is returned as a single segment (qs semantics). - If there are more groups beyond ``max_depth`` and ``strict_depth`` is True, an ``IndexError`` is raised. 
Otherwise, the remainder is added as one final segment (again mirroring qs). + - Unterminated '[': the remainder after the first unmatched '[' is captured as a single synthetic bracket segment (qs/Kotlin parity). This runs in O(n) time over the key string. """ From d0dd3c71867543b5dc226423d78c1aff9aa934d1 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 18:47:32 +0100 Subject: [PATCH 10/29] :bulb: clarify docstrings for list_limit, comma-splitting, and percent-encoded dot handling in decode logic --- src/qs_codec/decode.py | 47 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/qs_codec/decode.py b/src/qs_codec/decode.py index c325425..a1124f0 100644 --- a/src/qs_codec/decode.py +++ b/src/qs_codec/decode.py @@ -124,14 +124,24 @@ def _interpret_numeric_entities(value: str) -> str: def _parse_array_value(value: t.Any, options: DecodeOptions, current_list_length: int) -> t.Any: - """Post-process a raw scalar for list semantics and enforce `list_limit`. + """Post-process a raw scalar for list semantics and enforce ``list_limit``. Behavior -------- - - If `comma=True` and `value` is a string that contains commas, split into a list. - - Otherwise, enforce the per-list length limit by comparing `current_list_length` to `options.list_limit`. When `raise_on_limit_exceeded=True`, violations raise `ValueError`. + - If ``comma=True`` and ``value`` is a string that contains commas, split into a list. + - Otherwise, enforce the per-list length limit by comparing ``current_list_length`` to ``options.list_limit``. + When ``raise_on_limit_exceeded=True``, violations raise ``ValueError``. 
+ - When ``list_limit`` is negative: + * if ``raise_on_limit_exceeded=True``, **any** list-growth operation here (e.g., comma-splitting) + raises immediately; + * if ``raise_on_limit_exceeded=False`` (default), comma-splitting still returns a list; numeric + bracket indices are handled later by ``_parse_object`` (where negative ``list_limit`` disables + numeric-index parsing only). - Returns either the original value or a list of values, without decoding (that happens later). + Returns + ------- + Any + Either the original value or a list of values, without decoding (that happens later). """ if isinstance(value, str) and value and options.comma and "," in value: split_val: t.List[str] = value.split(",") @@ -155,18 +165,21 @@ def _parse_query_string_values(value: str, options: DecodeOptions) -> t.Dict[str Responsibilities ---------------- - Strip a leading '?' if ``ignore_query_prefix`` is True. - - Normalize percent-encoded square brackets (``%5B/%5D``) so the key splitter can operate. + - Normalize percent-encoded square brackets (``%5B/%5D``) (case-insensitive) so the key splitter can operate. - Split into parts using either a string delimiter or a regex delimiter. - Enforce ``parameter_limit`` (optionally raising). - - Detect the UTF-8/Latin-1 charset via the `utf8=…` sentinel when enabled. + - Detect the UTF-8/Latin-1 charset via the ``utf8=…`` sentinel when enabled. - For each ``key=value`` pair: - * Percent-decode key/value using the selected charset. + * Decode key/value via ``options.decoder`` (default: percent-decoding using the selected ``charset``). + Keys are passed with ``kind=DecodeKind.KEY`` and values with ``kind=DecodeKind.VALUE``; a custom decoder + may return the raw token or ``None``. * Apply list/comma logic to values. * Interpret numeric entities for Latin-1 when requested. - * Handle empty brackets ``[]`` as list markers. + * Handle empty brackets ``[]`` as list markers (wrapping exactly once). 
* Merge duplicate keys according to ``duplicates`` policy. - The output is a *flat* dict (keys are full key-path strings). Higher-level structure is constructed later by ``_parse_keys`` / ``_parse_object``. + The output is a *flat* dict (keys are full key-path strings). Higher-level structure is constructed later by + ``_parse_keys`` / ``_parse_object``. """ obj: t.Dict[str, t.Any] = {} @@ -296,14 +309,14 @@ def _parse_object( Notes ----- - Builds lists when encountering ``[]`` (respecting ``allow_empty_lists`` and null handling). - - Converts bracketed numeric segments into list indices when allowed and within ``list_limit``. - - When ``list_limit`` is negative, numeric bracket indices are treated as - *map keys* (list growth is disabled). If ``raise_on_limit_exceeded`` is - True, any list-growth operation (empty brackets, comma-split, nested pushes) - will raise immediately. - - Inside bracket segments, a custom key decoder may leave percent-encoded dots - (``%2E/%2e``). When ``decode_dot_in_keys`` is True, these are normalized to - ``.`` here. Top‑level dot splitting is already handled by the splitter. + - Converts bracketed **numeric** segments into list indices when allowed and within ``list_limit``. + - When ``list_limit`` is negative, **numeric-indexed bracket segments** are treated as map keys + (i.e., index-based list growth is disabled). Empty brackets (``[]``) still create lists unless + ``raise_on_limit_exceeded`` is True; with ``raise_on_limit_exceeded=True``, any list-growth operation + (empty brackets, comma-split, nested pushes) raises immediately. + - Inside bracket segments, a custom key decoder may leave percent-encoded dots (``%2E/%2e``). When + ``decode_dot_in_keys`` is True, these are normalized to ``.`` here. Top‑level dot splitting is already + handled by the splitter. - When list parsing is disabled and an empty segment is encountered, coerces to ``{"0": leaf}`` to preserve round-trippability with other ports. 
""" current_list_length: int = 0 From f04d9c6a118957ee877a43732cb0fbdcedfe42d3 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 18:54:57 +0100 Subject: [PATCH 11/29] :recycle: update type annotations in decode_options_test for decoder and legacy_decoder signatures --- tests/unit/decode_options_test.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/unit/decode_options_test.py b/tests/unit/decode_options_test.py index fce0615..b95f30d 100644 --- a/tests/unit/decode_options_test.py +++ b/tests/unit/decode_options_test.py @@ -181,9 +181,9 @@ def test_decode_value_decodes_percent_sequences_normally(self) -> None: assert opts.decoder("%2E", Charset.UTF8, kind=DecodeKind.VALUE) == "." def test_decoder_is_used_for_key_and_value(self) -> None: - calls: list[tuple[str | None, DecodeKind]] = [] + calls: t.List[t.Tuple[t.Optional[str], DecodeKind]] = [] - def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: # type: ignore[override] + def dec(s: t.Optional[str], charset: t.Optional[Charset], kind: DecodeKind) -> t.Optional[str]: # type: ignore[override] calls.append((s, kind)) return s @@ -196,7 +196,7 @@ def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: assert calls[1][1] is DecodeKind.VALUE and calls[1][0] == "y" def test_decoder_null_return_is_honored(self) -> None: - def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: # type: ignore[override] + def dec(s: t.Optional[str], charset: t.Optional[Charset], kind: DecodeKind) -> t.Optional[str]: # type: ignore[override] return None opts = DecodeOptions(decoder=dec) @@ -204,7 +204,7 @@ def dec(s: str | None, charset: Charset | None, kind: DecodeKind) -> str | None: assert opts.decoder("bar", Charset.UTF8, kind=DecodeKind.KEY) is None def test_single_decoder_acts_like_legacy_when_ignoring_kind(self) -> None: - def dec(s: str | None, *args, **kwargs): # type: 
ignore[no-untyped-def] + def dec(s: t.Optional[str], *args, **kwargs): # type: ignore[no-untyped-def] return None if s is None else s.upper() opts = DecodeOptions(decoder=dec) @@ -214,15 +214,15 @@ def dec(s: str | None, *args, **kwargs): # type: ignore[no-untyped-def] def test_decoder_wins_over_legacy_decoder_when_both_provided(self) -> None: # decoder must take precedence over legacy_decoder (parity with Kotlin/C#) - def legacy(v: str | None, charset: Charset | None = None) -> str | None: + def legacy(v: t.Optional[str], charset: t.Optional[Charset] = None) -> t.Optional[str]: return f"L:{'null' if v is None else v}" def dec( - v: str | None, - charset: Charset | None = None, + v: t.Optional[str], + charset: t.Optional[Charset] = None, *, kind: DecodeKind = DecodeKind.VALUE, - ) -> str | None: + ) -> t.Optional[str]: return f"K:{kind.name}:{'null' if v is None else v}" opts = DecodeOptions(decoder=dec, legacy_decoder=legacy) @@ -232,11 +232,11 @@ def dec( def test_decode_key_coerces_non_string_decoder_result(self) -> None: # When the decoder returns a non-string scalar, decode_key coerces it via str() def dec( - v: str | None, - charset: Charset | None = None, + v: t.Optional[str], + charset: t.Optional[Charset] = None, *, kind: DecodeKind = DecodeKind.VALUE, - ) -> object | None: + ) -> t.Any: return 42 if v is not None else None opts = DecodeOptions(decoder=dec) From 2cf36123a3169d92b38063e1dd6a08fe7dcaf9f0 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:06:42 +0100 Subject: [PATCH 12/29] :white_check_mark: revise decode test to avoid duplicate dict key assertion and ensure decoder invocation for dot-encoded and bracketed keys --- tests/unit/decode_test.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/unit/decode_test.py b/tests/unit/decode_test.py index 91d3a8d..16a5c16 100644 --- a/tests/unit/decode_test.py +++ b/tests/unit/decode_test.py @@ -1552,11 +1552,8 @@ def _decoder(s: t.Optional[str], charset: 
t.Optional[Charset], *, kind: DecodeKi return s opt = DecodeOptions(allow_dots=True, decode_dot_in_keys=True, decoder=_decoder) - assert ( - decode("a%2Eb=c&a[b]=d", opt) == {"a": {"b": "c"}, "a": {"b": "d"}} - if False - else decode("a%2Eb=c&a[b]=d", opt) - ) # no-op, ensure call + # Ensure the decoder is invoked for both key forms without tripping F601 on duplicate dict keys. + assert bool(decode("a%2Eb=c&a[b]=d", opt)) # no-op: just ensure the call executes # Confirm both KEY invocations observed: raw top-level key and raw bracketed key assert any(k == DecodeKind.KEY and (s == "a%2Eb" or s == "a[b]") for (s, k) in calls) From 9b62792add4504f2757c097874251fd36990f14f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:10:03 +0100 Subject: [PATCH 13/29] :bulb: update comment to clarify top-level dot splitting behavior in decode logic --- src/qs_codec/utils/decode_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index e741bdf..0fb87fc 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -87,8 +87,7 @@ def dot_to_bracket_top_level(cls, s: str) -> str: sb.append(".") i += 1 else: - # normal split (also handles leading '.'): convert a.b → a[b] - # and '.a' → '[a]' at top level + # normal split at top level: convert a.b → a[b] start = i + 1 j = start while j < n and s[j] != "." 
and s[j] != "[": From d416d9dd6a1d2a7ccdda8659783e37cc7e13b024 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:10:34 +0100 Subject: [PATCH 14/29] :bulb: add comment to clarify handling of typing.Literal annotations for kind parameter in decode logic --- src/qs_codec/models/decode_options.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/qs_codec/models/decode_options.py b/src/qs_codec/models/decode_options.py index c550b20..c50afc8 100644 --- a/src/qs_codec/models/decode_options.py +++ b/src/qs_codec/models/decode_options.py @@ -203,6 +203,10 @@ def __post_init__(self) -> None: pass_kind_as_str = True if has_kind_param: ann = params["kind"].annotation + # NOTE: If a user annotates `kind` as a typing.Literal (e.g., Literal["key", "value"]), + # `ann` will NOT be a `type`, so we fall back to passing strings (the default path below). + # This is intentional: it preserves compatibility with callables that prefer plain strings + # while still supporting Enum-typed signatures where we pass the Enum instance instead. if ann is inspect.Signature.empty: pass_kind_as_str = True else: From b8e88e3cd619452136b6cbc20c13409c99ee994f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:13:13 +0100 Subject: [PATCH 15/29] :bulb: clarify comment on comma-split list logic and list_limit enforcement in decode function --- src/qs_codec/decode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qs_codec/decode.py b/src/qs_codec/decode.py index a1124f0..ca17a36 100644 --- a/src/qs_codec/decode.py +++ b/src/qs_codec/decode.py @@ -173,7 +173,7 @@ def _parse_query_string_values(value: str, options: DecodeOptions) -> t.Dict[str * Decode key/value via ``options.decoder`` (default: percent-decoding using the selected ``charset``). Keys are passed with ``kind=DecodeKind.KEY`` and values with ``kind=DecodeKind.VALUE``; a custom decoder may return the raw token or ``None``. - * Apply list/comma logic to values. 
+ * Apply comma-split list logic to values (handled here). Index-based list growth from bracket segments is applied later in ``_parse_object``. When ``list_limit < 0`` and ``raise_on_limit_exceeded=True``, any comma-split that would increase the list length raises immediately; otherwise the split proceeds. * Interpret numeric entities for Latin-1 when requested. * Handle empty brackets ``[]`` as list markers (wrapping exactly once). * Merge duplicate keys according to ``duplicates`` policy. From 089cfa41a2e05434ebdfa3562f771205abc9be93 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:15:04 +0100 Subject: [PATCH 16/29] :bulb: add comment to clarify normalization of percent-encoded brackets in query string prior to splitting --- src/qs_codec/decode.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/qs_codec/decode.py b/src/qs_codec/decode.py index ca17a36..3feb675 100644 --- a/src/qs_codec/decode.py +++ b/src/qs_codec/decode.py @@ -184,6 +184,10 @@ def _parse_query_string_values(value: str, options: DecodeOptions) -> t.Dict[str obj: t.Dict[str, t.Any] = {} clean_str: str = value.replace("?", "", 1) if options.ignore_query_prefix else value + # Normalize %5B/%5D to literal brackets before splitting (case-insensitive). + # Note: this operates on the entire query string (keys *and* values). That’s + # intentional: it keeps the splitter simple, and value tokens are subsequently + # passed through the scalar decoder, so this replacement is safe. clean_str = clean_str.replace("%5B", "[").replace("%5b", "[").replace("%5D", "]").replace("%5d", "]") # Compute an effective parameter limit (None means "no limit"). 
From 9beee76d87af05ad33a682f30cc811780674fdc7 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 19:17:40 +0100 Subject: [PATCH 17/29] :bulb: add comment to clarify conservative heuristic for list length enforcement in decode logic --- src/qs_codec/decode.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/qs_codec/decode.py b/src/qs_codec/decode.py index 3feb675..bdb7e2e 100644 --- a/src/qs_codec/decode.py +++ b/src/qs_codec/decode.py @@ -326,6 +326,19 @@ def _parse_object( current_list_length: int = 0 # If the chain ends with an empty list marker, compute current list length for limit checks. + # Best-effort note: + # This is a conservative heuristic intended to help when we see patterns like `a[0][]=`, + # so `_parse_array_value` can enforce the list limit for the final `[]` push. The segments + # we receive in `chain` include bracket markers (e.g., `["a", "[0]", "[]"]`), so + # `"".join(chain[:-1])` is rarely a pure integer (e.g., `"a[0]"` raises `ValueError`), + # and we typically fall back to `0`. That’s fine: it remains safe and conservative. + # We still: + # • enforce per-list length for already-allocated containers during tokenization in + # `_parse_query_string_values` (where we know the current length), and + # • enforce index-based growth limits inside this function when converting bracketed + # numeric segments into list indices. + # Keeping this lightweight probe matches the other ports and avoids costly look-ahead into + # parent structures while maintaining correct limit behavior. 
if bool(chain) and chain[-1] == "[]": parent_key: t.Optional[int] From 51b3c7583ce604a260de384674e337a9bcb4dca9 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 20:59:57 +0100 Subject: [PATCH 18/29] :bulb: update comment to clarify percent-decoding behavior and handling of encoded dots in dot_to_bracket_top_level --- src/qs_codec/utils/decode_utils.py | 42 +++++++++++++++++++----------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 0fb87fc..2152c2d 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -31,22 +31,34 @@ class DecodeUtils: @classmethod def dot_to_bracket_top_level(cls, s: str) -> str: - """Convert top‑level dot segments into bracket groups, preserving dots inside brackets and handling degenerate top‑level dots. - - Rules: - - Only dots at depth == 0 split. Dots inside '[]' are preserved. - - Percent-encoded dots ('%2E'/'%2e') never split here. + """Convert top-level dot segments into bracket groups *after* percent-decoding. + + Notes + ----- + - In the normal decode path, the key has already been percent-decoded by the upstream + scanner, so sequences like ``%2E``/``%2e`` are already literal ``.`` when this function + runs. As a result, with ``allow_dots=True``, any top-level ``.`` will be treated as a + separator here. This is independent of ``decode_dot_in_keys`` (which only affects how + encoded dots *inside bracket segments* are normalized later during object folding). + - If a custom decoder returns raw tokens (i.e., bypasses percent-decoding), ``%2E``/``%2e`` + may still appear here; those percent sequences are preserved verbatim and are **not** + used as separators. + + Rules + ----- + - Only dots at depth == 0 split. Dots inside ``[]`` are preserved. - Degenerate cases: - * leading '.' 
starts a bracket segment ('.a' behaves like '[a]') - * '.[' is skipped so 'a.[b]' behaves like 'a[b]' - * 'a..b' preserves the first dot → 'a.[b]' - * trailing '.' is preserved and ignored by the splitter - - Examples: - 'user.email.name' -> 'user[email][name]' - 'a[b].c' -> 'a[b][c]' - 'a[.].c' -> 'a[.][c]' - 'a%2E[b]' -> 'a%2E[b]' + * leading ``.`` starts a bracket segment (``.a`` behaves like ``[a]``) + * ``.[`` is skipped so ``a.[b]`` behaves like ``a[b]`` + * ``a..b`` preserves the first dot → ``a.[b]`` + * trailing ``.`` is preserved and ignored by the splitter + + Examples + -------- + 'user.email.name' -> 'user[email][name]' + 'a[b].c' -> 'a[b][c]' + 'a[.].c' -> 'a[.][c]' + 'a%2E[b]' -> 'a%2E[b]' (only if a custom decoder left it encoded) """ if "." not in s: return s From 1b8e41eb20041bc81d396a42996d67fabd714f83 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 21:29:58 +0100 Subject: [PATCH 19/29] :bug: fix strict_depth enforcement to avoid raising on unterminated bracket groups in decode logic --- src/qs_codec/utils/decode_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 2152c2d..4132143 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -222,6 +222,7 @@ def split_key_into_segments( open_idx: int = first depth: int = 0 + unterminated = False while open_idx >= 0 and depth < max_depth: level = 1 i = open_idx + 1 @@ -241,7 +242,8 @@ def split_key_into_segments( i += 1 if close < 0: - break # unterminated group; stop collecting; remainder handled below + unterminated = True # unterminated group; stop collecting; remainder handled below + break # Append the full balanced group, including the surrounding brackets. 
segments.append(key[open_idx : close + 1]) # includes the surrounding [ ] @@ -249,7 +251,10 @@ def split_key_into_segments( open_idx = key.find("[", close + 1) if open_idx >= 0: - if strict_depth: + # We only want to raise for true depth overflow under strict_depth, + # not for unterminated bracket groups. + depth_overflow = (depth >= max_depth) and not unterminated + if strict_depth and depth_overflow: raise IndexError(f"Input depth exceeded depth option of {max_depth} and strict_depth is True") # Stash the remainder as a single segment (qs/Kotlin parity) segments.append("[" + key[open_idx:] + "]") From 5887b7addf77fa119d2c5c725bfc9bdcc2ffb23f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 21:31:55 +0100 Subject: [PATCH 20/29] :bulb: clarify docstring on decode_key to note coercion of non-string decoder outputs via str() --- src/qs_codec/models/decode_options.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/qs_codec/models/decode_options.py b/src/qs_codec/models/decode_options.py index c50afc8..b92fcd1 100644 --- a/src/qs_codec/models/decode_options.py +++ b/src/qs_codec/models/decode_options.py @@ -281,7 +281,10 @@ def decode( return d(value, charset or self.charset, kind=kind) def decode_key(self, value: t.Optional[str], charset: t.Optional[Charset] = None) -> t.Optional[str]: - """Decode a key (or key segment). Always returns a string or ``None``.""" + """Decode a key (or key segment). Always returns a string or ``None``. + + Note: custom decoders returning non-strings for keys are coerced via ``str()``. 
+ """ out = self.decode(value, charset, kind=DecodeKind.KEY) return None if out is None else str(out) From 77e8342c6753575df0b3ba34a3a83243b4c58db6 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 21:58:55 +0100 Subject: [PATCH 21/29] :bug: fix dot-to-bracket decoding to preserve leading dots in consecutive dot sequences --- src/qs_codec/utils/decode_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 4132143..9876c0f 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -85,6 +85,11 @@ def dot_to_bracket_top_level(cls, s: str) -> str: # skip the dot so 'a.[b]' acts like 'a[b]' i += 1 elif i == 0: + # If input starts with '..', preserve the first dot like the 'a..b' case. + if has_next and next_ch == ".": + sb.append(".") + i += 1 + continue # leading '.' starts a bracket segment: ".a" -> "[a]" start = i + 1 j = start From 43c2df81b601879cf70c335a5366ee619c46d942 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 22:01:50 +0100 Subject: [PATCH 22/29] :bulb: add examples to docstring for decode_key to illustrate max_depth and unterminated bracket handling --- src/qs_codec/utils/decode_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 9876c0f..1900f7b 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -208,6 +208,11 @@ def split_key_into_segments( - If there are more groups beyond ``max_depth`` and ``strict_depth`` is True, an ``IndexError`` is raised. Otherwise, the remainder is added as one final segment (again mirroring qs). - Unterminated '[': the remainder after the first unmatched '[' is captured as a single synthetic bracket segment (qs/Kotlin parity). 
+ Examples + -------- + max_depth=2: "a[b][c][d]" -> ["a", "[b]", "[c]", "[[d]]"] + unterminated: "a[b" -> ["a", "[[b]"] + This runs in O(n) time over the key string. """ if max_depth <= 0: From 875abe26aa9af2860f4c52c1289e3079bcad78dc Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 22:04:45 +0100 Subject: [PATCH 23/29] :bulb: document that 'kind' parameter in decode_scalar is ignored and may be removed in future --- src/qs_codec/utils/decode_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 1900f7b..824aa5a 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -164,6 +164,12 @@ def decode( ) -> t.Optional[str]: """Decode a URL‑encoded scalar. + Notes + ----- + The `kind` parameter is accepted for API compatibility but is currently + ignored; keys and values are decoded identically. It may be removed in + a future major release. + Behavior: - Replace ``+`` with a literal space *before* decoding. - If ``charset`` is :data:`~qs_codec.enums.charset.Charset.LATIN1`, decode only ``%XX`` byte sequences (no ``%uXXXX``). ``%uXXXX`` sequences are left as‑is to mimic older browser/JS behavior. From 1d9627f11e69fbcd31df9d93829687b81dfb368f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 22:04:49 +0100 Subject: [PATCH 24/29] :bulb: clarify KEY docstring to note default decoder behavior for percent-encoded dots --- src/qs_codec/enums/decode_kind.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/qs_codec/enums/decode_kind.py b/src/qs_codec/enums/decode_kind.py index f82ee7e..d4e2eff 100644 --- a/src/qs_codec/enums/decode_kind.py +++ b/src/qs_codec/enums/decode_kind.py @@ -16,9 +16,10 @@ class DecodeKind(str, Enum): Attributes ---------- KEY - Decode a *key* (or key segment). 
Implementations typically preserve - percent‑encoded dots (``%2E``/``%2e``) so that dot‑splitting semantics can - be applied later according to parser options. + Decode a *key* (or key segment). Note that the default scalar decoder + (``qs_codec.utils.decode_utils.decode``) ignores `kind` and fully + decodes percent-encoded dots (``%2E``/``%2e``). Dot-splitting behavior is + applied later by higher-level parser options. VALUE Decode a *value*. Implementations typically perform full percent decoding. """ From dba59619781e4dea83bdce282638500b8bb666fa Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 22:41:09 +0100 Subject: [PATCH 25/29] :white_check_mark: add tests for split_key_into_segments remainder handling and strict depth enforcement --- tests/unit/decode_test.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/unit/decode_test.py b/tests/unit/decode_test.py index 16a5c16..5600368 100644 --- a/tests/unit/decode_test.py +++ b/tests/unit/decode_test.py @@ -1619,3 +1619,30 @@ def test_top_level_dot_to_bracket_guardrails_leading_trailing_double(self) -> No # Double dots: only the second dot (before a token) causes a split; the empty middle segment is preserved # as a literal dot in the parent key (no [] is created) assert decode("a..b=x", DecodeOptions(allow_dots=True, decode_dot_in_keys=False)) == {"a.": {"b": "x"}} + + +class TestSplitKeySegmentationRemainder: + def test_no_remainder_when_within_depth(self) -> None: + segs = DecodeUtils.split_key_into_segments("a[b][c]", allow_dots=False, max_depth=3, strict_depth=False) + assert segs == ["a", "[b]", "[c]"] + + def test_double_bracket_remainder_allowdots_depth1(self) -> None: + # Dot → bracket happens first; with max_depth=1, the remainder is wrapped as a single + # synthetic segment using double brackets (opaque to downstream consumers). 
+ segs = DecodeUtils.split_key_into_segments("a.b.c", allow_dots=True, max_depth=1, strict_depth=False) + assert segs == ["a", "[b]", "[[c]]"] + + def test_double_bracket_remainder_for_bracket_input(self) -> None: + # For bracketed input, the remainder beyond depth is also wrapped as one segment + # (e.g. "a[b][c][d]" with max_depth=2 → ["a", "[b]", "[c]", "[[d]]"]). + segs = DecodeUtils.split_key_into_segments("a[b][c][d]", allow_dots=False, max_depth=2, strict_depth=False) + assert segs == ["a", "[b]", "[c]", "[[d]]"] + + def test_strict_depth_overflow_raises_for_well_formed(self) -> None: + # Well-formed keys that exceed max_depth should raise when strict_depth=True. + with pytest.raises(IndexError): + DecodeUtils.split_key_into_segments("a[b][c][d]", allow_dots=False, max_depth=1, strict_depth=True) + + def test_unterminated_group_does_not_raise_under_strict_depth(self) -> None: + segs = DecodeUtils.split_key_into_segments("a[b[c", allow_dots=False, max_depth=5, strict_depth=True) + assert segs == ["a", "[[b[c]"] From b0e17c54e2de9807a10bc5eaaf7468e10ba48d80 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 23:01:39 +0100 Subject: [PATCH 26/29] :bug: fix percent-decoding to handle dot in keys and clarify top-level percent sequence handling --- src/qs_codec/utils/decode_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index 824aa5a..e12178e 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -117,8 +117,8 @@ def dot_to_bracket_top_level(cls, s: str) -> str: sb.append(".") i += 1 else: - # also preserve percent sequences verbatim at top level; - # we don't split on '%2E' here + # No special handling for percent sequences here; characters are appended as-is. + # We never split on '%2E' at this stage.
sb.append(ch) i += 1 return "".join(sb) @@ -192,7 +192,8 @@ def decode( s = string_without_plus if "%" not in s: return s - return cls.HEX2_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), s) + _int, _chr = int, chr + return cls.HEX2_PATTERN.sub(lambda m: _chr(_int(m.group(1), 16)), s) s = string_without_plus return s if "%" not in s else unquote(s) From 877c7eb86e1bbce1c25112f1f3f5a851c77fb483 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 23:36:05 +0100 Subject: [PATCH 27/29] :bug: handle ambiguous '.]' in key decoding and prevent bracket segment overrun on closing brackets --- src/qs_codec/utils/decode_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index e12178e..b00ba89 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -84,6 +84,10 @@ def dot_to_bracket_top_level(cls, s: str) -> str: if next_ch == "[": # skip the dot so 'a.[b]' acts like 'a[b]' i += 1 + elif next_ch == "]": + # preserve ambiguous '.]' as a literal to avoid constructing '[]]' + sb.append(".") + i += 1 elif i == 0: # If input starts with '..', preserve the first dot like the 'a..b' case. if has_next and next_ch == ".": @@ -93,7 +97,7 @@ def dot_to_bracket_top_level(cls, s: str) -> str: # leading '.' starts a bracket segment: ".a" -> "[a]" start = i + 1 j = start - while j < n and s[j] != "." and s[j] != "[": + while j < n and s[j] != "." and s[j] != "[" and s[j] != "]": j += 1 sb.append("[") sb.append(s[start:j]) @@ -107,7 +111,7 @@ def dot_to_bracket_top_level(cls, s: str) -> str: # normal split at top level: convert a.b → a[b] start = i + 1 j = start - while j < n and s[j] != "." and s[j] != "[": + while j < n and s[j] != "." 
and s[j] != "[" and s[j] != "]": j += 1 sb.append("[") sb.append(s[start:j]) From 680bfa5f3c4806fad4b0b96c46c55dc3d837ca64 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sat, 23 Aug 2025 23:37:42 +0100 Subject: [PATCH 28/29] :bulb: update decode_kind docstring to clarify scalar decoder behavior for percent-decoded dots in keys --- src/qs_codec/enums/decode_kind.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/qs_codec/enums/decode_kind.py b/src/qs_codec/enums/decode_kind.py index d4e2eff..2e60a12 100644 --- a/src/qs_codec/enums/decode_kind.py +++ b/src/qs_codec/enums/decode_kind.py @@ -1,10 +1,10 @@ """Decoding context used by the query string parser and utilities. This enum indicates whether a given piece of text is being decoded as a *key* -(or key segment) or as a *value*. The distinction matters for encoded dots -(``%2E``/``%2e``) in keys: when decoding keys, the default behavior is to -*preserve* these so that higher‑level options like ``allow_dots`` and -``decode_dot_in_keys`` can be applied consistently later in the parse. +(or key segment) or as a *value*. Note that the built-in scalar decoder +(`qs_codec.utils.decode_utils.decode`) ignores `kind` and fully percent-decodes +dots; preservation of encoded dots for splitting is applied later by parser +options (`allow_dots`, `decode_dot_in_keys`). """ from enum import Enum From a710b6d372c28b858fe8b4dfd7f17e44dd39020f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 24 Aug 2025 00:08:22 +0100 Subject: [PATCH 29/29] :bulb: clarify docstring to specify dot splitting only occurs on actual '.' 
at depth 0, not percent-encoded sequences --- src/qs_codec/utils/decode_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qs_codec/utils/decode_utils.py b/src/qs_codec/utils/decode_utils.py index b00ba89..d03479b 100644 --- a/src/qs_codec/utils/decode_utils.py +++ b/src/qs_codec/utils/decode_utils.py @@ -4,7 +4,7 @@ - Decoding handles both UTF‑8 and Latin‑1 code paths. - Key splitting keeps bracket groups *balanced* and optionally treats dots as path separators when ``allow_dots=True``. -- Top‑level dot splitting uses a character‑scanner that handles degenerate cases (leading '.' starts a bracket segment; '.[' is skipped; double dots preserve the first; trailing '.' is preserved) and never splits on percent‑encoded dots. +- Top‑level dot splitting uses a character‑scanner that handles degenerate cases (leading '.' starts a bracket segment; '.[' is skipped; double dots preserve the first; trailing '.' is preserved) and never treats literal percent‑encoded sequences (e.g., '%2E') as split points; only actual '.' characters at depth 0 are split. """ import re