From 6eb7f5c2dd9b48fdbe399dd80c80a991bc0a73aa Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 14:35:17 -0400 Subject: [PATCH 01/14] Use a separate function. --- synapse/events/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index e6d040176be2..6f88ca3e95d1 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -253,6 +253,15 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None: sub_out_dict[key_to_move] = sub_dict[key_to_move] +def _split_field(field: str) -> List[str]: + # Convert the field and remove escaping: + # + # 1. "content.body.thing\.with\.dots" + # 2. ["content", "body", "thing\.with\.dots"] + # 3. ["content", "body", "thing.with.dots"] + return [part.replace(r"\.", r".") for part in SPLIT_FIELD_REGEX.split(field)] + + def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: """Return a new dict with only the fields in 'dictionary' which are present in 'fields'. @@ -275,13 +284,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: # for each field, convert it: # ["content.body.thing\.with\.dots"] => [["content", "body", "thing\.with\.dots"]] - split_fields = [SPLIT_FIELD_REGEX.split(f) for f in fields] - - # for each element of the output array of arrays: - # remove escaping so we can use the right key names. - split_fields[:] = [ - [f.replace(r"\.", r".") for f in field_array] for field_array in split_fields - ] + split_fields = [_split_field(f) for f in fields] output: JsonDict = {} for field_array in split_fields: From ca1595f7dc50bcfc3811357418906de5e2d030f4 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:21:12 -0400 Subject: [PATCH 02/14] Port the matrix-js-sdk code. --- synapse/events/utils.py | 55 +++++++++++++++++++++++++++++++------- tests/events/test_utils.py | 36 ++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index 6f88ca3e95d1..f174f08b79e1 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -25,6 +25,7 @@ MutableMapping, Optional, Union, + Match, ) import attr @@ -46,13 +47,6 @@ from synapse.handlers.relations import BundledAggregations -# Split strings on "." but not "\." This uses a negative lookbehind assertion for '\' -# (? List[str]: # 1. "content.body.thing\.with\.dots" # 2. ["content", "body", "thing\.with\.dots"] # 3. ["content", "body", "thing.with.dots"] - return [part.replace(r"\.", r".") for part in SPLIT_FIELD_REGEX.split(field)] + + result = [] + + # The current field and whether the previous character was the escape + # character (a backslash). + part = "" + escaped = False + + # Iterate over each character, and decide whether to append to the current + # part (following the escape rules) or to start a new part (based on the + # field separator). + for c in field: + # If the previous character was the escape character (a backslash) + # then decide what to append to the current part. + if escaped: + if c in ("\\", "."): + # An escaped backslash or dot just gets added. + part += c + else: + # A character that shouldn't be escaped gets the backslash prepended. + part += "\\" + c + # This always resets being escaped. + escaped = False + + # Otherwise, the previous character was not the escape character. + else: + if c == ".": + # The field separator creates a new part. + result.append(part) + part = "" + elif c == "\\": + # A backslash adds no characters, but starts an escape sequence. + escaped = True + else: + # Otherwise, just add the current character. + part += c + + # Ensure the final part is included. If there's an open escape sequence + # it should be included. + if escaped: + part += "\\" + result.append(part) + + return result def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: @@ -269,7 +306,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: If there are no event fields specified then all fields are included. The entries may include '.' characters to indicate sub-fields. So ['content.body'] will include the 'body' field of the 'content' object. - A literal '.' character in a field name may be escaped using a '\'. + A literal '.' or '\' character in a field name may be escaped using a '\'. Args: dictionary: The dictionary to read from. diff --git a/tests/events/test_utils.py b/tests/events/test_utils.py index e40eac2eb064..14d670e1170d 100644 --- a/tests/events/test_utils.py +++ b/tests/events/test_utils.py @@ -16,6 +16,7 @@ from typing import Any, List, Mapping, Optional import attr +from parameterized import parameterized from synapse.api.constants import EventContentFields from synapse.api.room_versions import RoomVersions @@ -26,7 +27,7 @@ copy_and_fixup_power_levels_contents, maybe_upsert_event_field, prune_event, - serialize_event, + serialize_event, _split_field, ) from synapse.types import JsonDict from synapse.util.frozenutils import freeze @@ -794,3 +795,36 @@ def test_invalid_types_raise_type_error(self) -> None: def test_invalid_nesting_raises_type_error(self) -> None: with self.assertRaises(TypeError): copy_and_fixup_power_levels_contents({"a": {"b": {"c": 1}}}) # type: ignore[dict-item] + + +class SplitFieldTestCase(stdlib_unittest.TestCase): + @parameterized.expand([ + # A field with no dots. + ["m", ["m"]], + # Simple dotted fields. + ["m.foo", ["m", "foo"]], + ["m.foo.bar", ["m", "foo", "bar"]], + # Backslash is used as an escape character. + ["m\\.foo", ["m.foo"]], + ["m\\\\.foo", ["m\\", "foo"]], + ["m\\\\\\.foo", ["m\\.foo"]], + ["m\\\\\\\\.foo", ["m\\\\", "foo"]], + ["m\\foo", ["m\\foo"]], + ["m\\\\foo", ["m\\foo"]], + ["m\\\\\\foo", ["m\\\\foo"]], + ["m\\\\\\\\foo", ["m\\\\foo"]], + # Ensure that escapes at the end don't cause issues. + ["m.foo\\", ["m", "foo\\"]], + ["m.foo\\\\", ["m", "foo\\"]], + ["m.foo\\.", ["m", "foo."]], + ["m.foo\\\\.", ["m", "foo\\", ""]], + ["m.foo\\\\\\.", ["m", "foo\\."]], + # Empty parts (corresponding to properties which are an empty string) are allowed. + [".m", ["", "m"]], + ["..m", ["", "", "m"]], + ["m.", ["m", ""]], + ["m..", ["m", "", ""]], + ["m..foo", ["m", "", "foo"]], + ]) + def test_split_field(self, input: str, expected: str) -> None: + self.assertEqual(_split_field(input), expected) From 60718125c733637fb560bdca0fb6df9fe5f7fe52 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:23:44 -0400 Subject: [PATCH 03/14] Additional test and better comments. --- synapse/events/utils.py | 19 ++++++++++++++----- tests/events/test_utils.py | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index f174f08b79e1..4130f31c8b80 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -248,12 +248,21 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None: def _split_field(field: str) -> List[str]: - # Convert the field and remove escaping: - # - # 1. "content.body.thing\.with\.dots" - # 2. ["content", "body", "thing\.with\.dots"] - # 3. ["content", "body", "thing.with.dots"] + """ + Split strings on "." but not "\." and escape backslash escaped characters. + + Convert the field and remove escaping: + 1. "content.body.thing\.with\.dots" + 2. ["content", "body", "thing\.with\.dots"] + 3. ["content", "body", "thing.with.dots"] + + Args: + field: A string representing a path to a field. + + Returns: + A list of nested fields to traverse. + """ result = [] # The current field and whether the previous character was the escape diff --git a/tests/events/test_utils.py b/tests/events/test_utils.py index 14d670e1170d..9deb9a6839fe 100644 --- a/tests/events/test_utils.py +++ b/tests/events/test_utils.py @@ -825,6 +825,8 @@ class SplitFieldTestCase(stdlib_unittest.TestCase): ["m.", ["m", ""]], ["m..", ["m", "", ""]], ["m..foo", ["m", "", "foo"]], + # Invalid escape sequences. + ["\\m", ["\\m"]], ]) def test_split_field(self, input: str, expected: str) -> None: self.assertEqual(_split_field(input), expected) From e74d81600443cc22b3e179567bc37941de6a30e2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:27:57 -0400 Subject: [PATCH 04/14] Actually allow backslashes. --- synapse/api/filtering.py | 12 +----------- tests/api/test_filtering.py | 6 ------ 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/synapse/api/filtering.py b/synapse/api/filtering.py index de7c56bc0fa3..a950eeb89df3 100644 --- a/synapse/api/filtering.py +++ b/synapse/api/filtering.py @@ -130,17 +130,7 @@ "event_format": {"type": "string", "enum": ["client", "federation"]}, "event_fields": { "type": "array", - "items": { - "type": "string", - # Don't allow '\\' in event field filters. This makes matching - # events a lot easier as we can then use a negative lookbehind - # assertion to split '\.' If we allowed \\ then it would - # incorrectly split '\\.' See synapse.events.utils.serialize_event - # - # Note that because this is a regular expression, we have to escape - # each backslash in the pattern. - "pattern": r"^((?!\\\\).)*$", - }, + "items": {"type": "string"}, }, }, "additionalProperties": True, # Allow new fields for forward compatibility diff --git a/tests/api/test_filtering.py b/tests/api/test_filtering.py index 222449baac81..aa6af5ad7bb2 100644 --- a/tests/api/test_filtering.py +++ b/tests/api/test_filtering.py @@ -48,8 +48,6 @@ def test_errors_on_invalid_filters(self) -> None: invalid_filters: List[JsonDict] = [ # `account_data` must be a dictionary {"account_data": "Hello World"}, - # `event_fields` entries must not contain backslashes - {"event_fields": [r"\\foo"]}, # `event_format` must be "client" or "federation" {"event_format": "other"}, # `not_rooms` must contain valid room IDs @@ -114,10 +112,6 @@ def test_valid_filters(self) -> None: "event_format": "client", "event_fields": ["type", "content", "sender"], }, - # a single backslash should be permitted (though it is debatable whether - # it should be permitted before anything other than `.`, and what that - # actually means) - # # (note that event_fields is implemented in # synapse.events.utils.serialize_event, and so whether this actually works # is tested elsewhere. We just want to check that it is allowed through the From b38f887e935c6a233e1bbf447ee95a431425d0b7 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:45:58 -0400 Subject: [PATCH 05/14] Newsfragment --- changelog.d/15607.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/15607.bugfix diff --git a/changelog.d/15607.bugfix b/changelog.d/15607.bugfix new file mode 100644 index 000000000000..a2767adbe2a4 --- /dev/null +++ b/changelog.d/15607.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug where filters with multiple backslashes were rejected. From 3da6c9c9168ea72f3fedca62828e1868e8afa9b9 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:46:59 -0400 Subject: [PATCH 06/14] Formatting. --- synapse/api/filtering.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/synapse/api/filtering.py b/synapse/api/filtering.py index a950eeb89df3..82aeef8d1913 100644 --- a/synapse/api/filtering.py +++ b/synapse/api/filtering.py @@ -128,10 +128,7 @@ "account_data": {"$ref": "#/definitions/filter"}, "room": {"$ref": "#/definitions/room_filter"}, "event_format": {"type": "string", "enum": ["client", "federation"]}, - "event_fields": { - "type": "array", - "items": {"type": "string"}, - }, + "event_fields": {"type": "array", "items": {"type": "string"}}, }, "additionalProperties": True, # Allow new fields for forward compatibility } From b91fa8f2589a3636286f75a77d1e2901644795d6 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:49:22 -0400 Subject: [PATCH 07/14] Lint --- synapse/events/utils.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index 4130f31c8b80..458d6c3e8c31 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import collections.abc -import re from typing import ( TYPE_CHECKING, Any, @@ -25,7 +24,6 @@ MutableMapping, Optional, Union, - Match, ) import attr @@ -249,13 +247,7 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None: def _split_field(field: str) -> List[str]: """ - Split strings on "." but not "\." and escape backslash escaped characters. - - Convert the field and remove escaping: - - 1. "content.body.thing\.with\.dots" - 2. ["content", "body", "thing\.with\.dots"] - 3. ["content", "body", "thing.with.dots"] + Split strings unescaped dots and removes escaping. Args: field: A string representing a path to a field. @@ -263,6 +255,13 @@ def _split_field(field: str) -> List[str]: Returns: A list of nested fields to traverse. """ + + # Convert the field and remove escaping: + # + # 1. "content.body.thing\.with\.dots" + # 2. ["content", "body", "thing\.with\.dots"] + # 3. ["content", "body", "thing.with.dots"] + result = [] # The current field and whether the previous character was the escape From 220641a2c922627296c270c43c5893989933d1bf Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 16 May 2023 15:50:25 -0400 Subject: [PATCH 08/14] More lint. --- tests/events/test_utils.py | 65 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/tests/events/test_utils.py b/tests/events/test_utils.py index 9deb9a6839fe..c487998e0fe5 100644 --- a/tests/events/test_utils.py +++ b/tests/events/test_utils.py @@ -24,10 +24,11 @@ from synapse.events.utils import ( PowerLevelsContent, SerializeEventConfig, + _split_field, copy_and_fixup_power_levels_contents, maybe_upsert_event_field, prune_event, - serialize_event, _split_field, + serialize_event, ) from synapse.types import JsonDict from synapse.util.frozenutils import freeze @@ -798,35 +799,37 @@ def test_invalid_nesting_raises_type_error(self) -> None: class SplitFieldTestCase(stdlib_unittest.TestCase): - @parameterized.expand([ - # A field with no dots. - ["m", ["m"]], - # Simple dotted fields. - ["m.foo", ["m", "foo"]], - ["m.foo.bar", ["m", "foo", "bar"]], - # Backslash is used as an escape character. - ["m\\.foo", ["m.foo"]], - ["m\\\\.foo", ["m\\", "foo"]], - ["m\\\\\\.foo", ["m\\.foo"]], - ["m\\\\\\\\.foo", ["m\\\\", "foo"]], - ["m\\foo", ["m\\foo"]], - ["m\\\\foo", ["m\\foo"]], - ["m\\\\\\foo", ["m\\\\foo"]], - ["m\\\\\\\\foo", ["m\\\\foo"]], - # Ensure that escapes at the end don't cause issues. - ["m.foo\\", ["m", "foo\\"]], - ["m.foo\\\\", ["m", "foo\\"]], - ["m.foo\\.", ["m", "foo."]], - ["m.foo\\\\.", ["m", "foo\\", ""]], - ["m.foo\\\\\\.", ["m", "foo\\."]], - # Empty parts (corresponding to properties which are an empty string) are allowed. - [".m", ["", "m"]], - ["..m", ["", "", "m"]], - ["m.", ["m", ""]], - ["m..", ["m", "", ""]], - ["m..foo", ["m", "", "foo"]], - # Invalid escape sequences. - ["\\m", ["\\m"]], - ]) + @parameterized.expand( + [ + # A field with no dots. + ["m", ["m"]], + # Simple dotted fields. + ["m.foo", ["m", "foo"]], + ["m.foo.bar", ["m", "foo", "bar"]], + # Backslash is used as an escape character. + ["m\\.foo", ["m.foo"]], + ["m\\\\.foo", ["m\\", "foo"]], + ["m\\\\\\.foo", ["m\\.foo"]], + ["m\\\\\\\\.foo", ["m\\\\", "foo"]], + ["m\\foo", ["m\\foo"]], + ["m\\\\foo", ["m\\foo"]], + ["m\\\\\\foo", ["m\\\\foo"]], + ["m\\\\\\\\foo", ["m\\\\foo"]], + # Ensure that escapes at the end don't cause issues. + ["m.foo\\", ["m", "foo\\"]], + ["m.foo\\\\", ["m", "foo\\"]], + ["m.foo\\.", ["m", "foo."]], + ["m.foo\\\\.", ["m", "foo\\", ""]], + ["m.foo\\\\\\.", ["m", "foo\\."]], + # Empty parts (corresponding to properties which are an empty string) are allowed. + [".m", ["", "m"]], + ["..m", ["", "", "m"]], + ["m.", ["m", ""]], + ["m..", ["m", "", ""]], + ["m..foo", ["m", "", "foo"]], + # Invalid escape sequences. + ["\\m", ["\\m"]], + ] + ) def test_split_field(self, input: str, expected: str) -> None: self.assertEqual(_split_field(input), expected) From c930fed7abd5b4d2e6c59a621ec732c6ced8ba94 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 19 May 2023 13:13:57 -0400 Subject: [PATCH 09/14] Clarify comment. Co-authored-by: Sean Quah <8349537+squahtx@users.noreply.github.com> --- synapse/events/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index 458d6c3e8c31..a71acfa7f808 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -247,7 +247,7 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None: def _split_field(field: str) -> List[str]: """ - Split strings unescaped dots and removes escaping. + Splits strings on unescaped dots and removes escaping. Args: field: A string representing a path to a field. From fa473728094c50fcff0ad180b5a0f953681cfc83 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 19 May 2023 13:14:13 -0400 Subject: [PATCH 10/14] Use raw strings. --- tests/events/test_utils.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/events/test_utils.py b/tests/events/test_utils.py index c487998e0fe5..c9a610db9ab9 100644 --- a/tests/events/test_utils.py +++ b/tests/events/test_utils.py @@ -807,20 +807,20 @@ class SplitFieldTestCase(stdlib_unittest.TestCase): ["m.foo", ["m", "foo"]], ["m.foo.bar", ["m", "foo", "bar"]], # Backslash is used as an escape character. - ["m\\.foo", ["m.foo"]], - ["m\\\\.foo", ["m\\", "foo"]], - ["m\\\\\\.foo", ["m\\.foo"]], - ["m\\\\\\\\.foo", ["m\\\\", "foo"]], - ["m\\foo", ["m\\foo"]], - ["m\\\\foo", ["m\\foo"]], - ["m\\\\\\foo", ["m\\\\foo"]], - ["m\\\\\\\\foo", ["m\\\\foo"]], + [r"m\.foo", ["m.foo"]], + [r"m\\.foo", ["m\\", "foo"]], + [r"m\\\.foo", [r"m\.foo"]], + [r"m\\\\.foo", ["m\\\\", "foo"]], + [r"m\foo", [r"m\foo"]], + [r"m\\foo", [r"m\foo"]], + [r"m\\\foo", [r"m\\foo"]], + [r"m\\\\foo", [r"m\\foo"]], # Ensure that escapes at the end don't cause issues. ["m.foo\\", ["m", "foo\\"]], - ["m.foo\\\\", ["m", "foo\\"]], - ["m.foo\\.", ["m", "foo."]], - ["m.foo\\\\.", ["m", "foo\\", ""]], - ["m.foo\\\\\\.", ["m", "foo\\."]], + ["m.foo\\", ["m", "foo\\"]], + [r"m.foo\.", ["m", "foo."]], + [r"m.foo\\.", ["m", "foo\\", ""]], + [r"m.foo\\\.", ["m", r"foo\."]], # Empty parts (corresponding to properties which are an empty string) are allowed. [".m", ["", "m"]], ["..m", ["", "", "m"]], @@ -828,7 +828,7 @@ class SplitFieldTestCase(stdlib_unittest.TestCase): ["m..", ["m", "", ""]], ["m..foo", ["m", "", "foo"]], # Invalid escape sequences. - ["\\m", ["\\m"]], + [r"\m", [r"\m"]], ] ) def test_split_field(self, input: str, expected: str) -> None: From c73876d9d3d1f05c6f9d0df3e7a4278692405f64 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 19 May 2023 13:52:19 -0400 Subject: [PATCH 11/14] Avoid expanding strings. --- synapse/events/utils.py | 75 +++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index a71acfa7f808..7a3d36a9aa87 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import collections.abc +import re from typing import ( TYPE_CHECKING, Any, @@ -21,6 +22,7 @@ Iterable, List, Mapping, + Match, MutableMapping, Optional, Union, @@ -245,6 +247,16 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None: sub_out_dict[key_to_move] = sub_dict[key_to_move] +def _escape_slash(m: Match[str]) -> str: + """ + Replacement function; replace a backslash-backslash or backslash-dot with the + second character. Leaves any other string alone. + """ + if m.group(1) in ("\\", "."): + return m.group(1) + return m.group(0) + + def _split_field(field: str) -> List[str]: """ Splits strings on unescaped dots and removes escaping. @@ -262,49 +274,30 @@ def _split_field(field: str) -> List[str]: # 2. ["content", "body", "thing\.with\.dots"] # 3. ["content", "body", "thing.with.dots"] + # Find all dots (and their preceding backslashes). Check if they're escaped + # and if not, then note them as a spot to split on. + splits = [] + for match in re.finditer(r"\\*\.", field): + # If the match is an *even* number of characters than the dot was escaped. + if len(match.group()) % 2 == 0: + continue + + # The character after the one to keep. + splits.append(match.end()) + + # Iterate through the splits in reverse order to avoid modifying the positions + # as we go. result = [] + previous_split = len(field) + for split in reversed(splits): + result.append(field[split:previous_split]) + previous_split = split - 1 - # The current field and whether the previous character was the escape - # character (a backslash). - part = "" - escaped = False - - # Iterate over each character, and decide whether to append to the current - # part (following the escape rules) or to start a new part (based on the - # field separator). - for c in field: - # If the previous character was the escape character (a backslash) - # then decide what to append to the current part. - if escaped: - if c in ("\\", "."): - # An escaped backslash or dot just gets added. - part += c - else: - # A character that shouldn't be escaped gets the backslash prepended. - part += "\\" + c - # This always resets being escaped. - escaped = False - - # Otherwise, the previous character was not the escape character. - else: - if c == ".": - # The field separator creates a new part. - result.append(part) - part = "" - elif c == "\\": - # A backslash adds no characters, but starts an escape sequence. - escaped = True - else: - # Otherwise, just add the current character. - part += c - - # Ensure the final part is included. If there's an open escape sequence - # it should be included. - if escaped: - part += "\\" - result.append(part) - - return result + # Add the first bit. + result.append(field[:previous_split]) + + # Unescape each part. + return [re.sub(r"\\(.)", _escape_slash, part) for part in reversed(result)] def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: From c8d3f5d499c09cbf967d4a77615596e1529a199c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 19 May 2023 13:58:44 -0400 Subject: [PATCH 12/14] Simplify. --- synapse/events/utils.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index 7a3d36a9aa87..c43ab31ff556 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -274,30 +274,26 @@ def _split_field(field: str) -> List[str]: # 2. ["content", "body", "thing\.with\.dots"] # 3. ["content", "body", "thing.with.dots"] - # Find all dots (and their preceding backslashes). Check if they're escaped - # and if not, then note them as a spot to split on. - splits = [] + # Find all dots (and their preceding backslashes). If the dot is unescaped + # then emit a new field part. + result = [] + prev_start = 0 for match in re.finditer(r"\\*\.", field): # If the match is an *even* number of characters than the dot was escaped. if len(match.group()) % 2 == 0: continue - # The character after the one to keep. - splits.append(match.end()) - - # Iterate through the splits in reverse order to avoid modifying the positions - # as we go. - result = [] - previous_split = len(field) - for split in reversed(splits): - result.append(field[split:previous_split]) - previous_split = split - 1 + # Add a new part (up to the dot, exclusive) after escaping. + result.append( + re.sub(r"\\(.)", _escape_slash, field[prev_start : match.end() - 1]) + ) + prev_start = match.end() - # Add the first bit. - result.append(field[:previous_split]) + # Add any part of the field after the last unescaped dot. (Note that if the + # character is a dot this correctly adds a blank string.) + result.append(re.sub(r"\\(.)", _escape_slash, field[prev_start:])) - # Unescape each part. - return [re.sub(r"\\(.)", _escape_slash, part) for part in reversed(result)] + return result def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: From 9bdb7432c1c26a735d78121d669597d225e7916b Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 22 May 2023 10:37:38 -0400 Subject: [PATCH 13/14] Pre-compile patterns. --- synapse/events/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index c43ab31ff556..99aec2ae1f42 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -47,6 +47,11 @@ from synapse.handlers.relations import BundledAggregations +# Split strings on "." but not "\." (or "\\\."). +SPLIT_FIELD_REGEX = re.compile(r"\\*\.") +# Find escaped characters, e.g. those with a \ in front of them. +ESCAPE_SEQUENCE_PATTERN = re.compile(r"\\(.)") + CANONICALJSON_MAX_INT = (2**53) - 1 CANONICALJSON_MIN_INT = -CANONICALJSON_MAX_INT @@ -278,14 +283,14 @@ def _split_field(field: str) -> List[str]: # then emit a new field part. result = [] prev_start = 0 - for match in re.finditer(r"\\*\.", field): + for match in SPLIT_FIELD_REGEX.finditer(field): # If the match is an *even* number of characters than the dot was escaped. if len(match.group()) % 2 == 0: continue # Add a new part (up to the dot, exclusive) after escaping. result.append( - re.sub(r"\\(.)", _escape_slash, field[prev_start : match.end() - 1]) + ESCAPE_SEQUENCE_PATTERN.sub(_escape_slash, field[prev_start: match.end() - 1]) ) prev_start = match.end() From d8d07f97f6de0367d65020efcec1d26044afb088 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 22 May 2023 10:41:40 -0400 Subject: [PATCH 14/14] Lint --- synapse/events/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synapse/events/utils.py b/synapse/events/utils.py index 99aec2ae1f42..e7b7b78b8454 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -290,7 +290,9 @@ def _split_field(field: str) -> List[str]: # Add a new part (up to the dot, exclusive) after escaping. result.append( - ESCAPE_SEQUENCE_PATTERN.sub(_escape_slash, field[prev_start: match.end() - 1]) + ESCAPE_SEQUENCE_PATTERN.sub( + _escape_slash, field[prev_start : match.end() - 1] + ) ) prev_start = match.end()