Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix 3.8 tests.
  • Loading branch information
orsenthil committed Apr 30, 2021
commit 9a1d1f73202633cd7837588d290f2cbd04369866
14 changes: 12 additions & 2 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ def test_urlsplit_attributes(self):
p.port

def test_urlsplit_remove_unsafe_bytes(self):
# Remove ASCII tabs and newlines from input
# Remove ASCII tabs and newlines from input, for http common case scenario.
url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
Expand All @@ -627,7 +627,7 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag")

# Remove ASCII tabs and newlines from input as bytes.
# Remove ASCII tabs and newlines from input as bytes, for http common case scenario.
url = b"http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
Expand All @@ -641,6 +641,16 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/#frag")

# any scheme
url = "x-new-scheme://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.geturl(), "x-new-scheme://www.python.org/javascript:alert('msg')/#frag")

# Remove ASCII tabs and newlines from input as bytes, any scheme.
url = b"x-new-scheme://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.geturl(), b"x-new-scheme://www.python.org/javascript:alert('msg')/#frag")

def test_attributes_bad_port(self):
"""Check handling of invalid ports."""
for bytes in (False, True):
Expand Down
9 changes: 7 additions & 2 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,11 @@ def _checknetloc(netloc):
raise ValueError("netloc '" + netloc + "' contains invalid " +
"characters under NFKC normalization")

def _remove_unsafe_bytes_from_url(url):
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
return url

def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
Expand Down Expand Up @@ -446,6 +451,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
url = _remove_unsafe_bytes_from_url(url)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to do this up front right after _coerce_args(url, ...) (and should do the same in 3.9 and 3.10).

by this point in 3.8 we've potentially allowed characters to slip through into query and fragment of http: urls.

at a minumum if we weren't do this right after _coerce_args before looking in the cache, this needs to be done right before any splitting happens in this branch of code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test case should be updated to include an invalid character in each of the five portions of the url.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allowed characters to slip through into query and fragment

I had a doubt initially, and I thought (from practice) that newlines and tabs are percent-encoded when in query/fragment than they are removed upfront.

We will have find a spec that will unambiguously state what should be done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've already got that unambiguous spec. https://url.spec.whatwg.org/#concept-basic-url-parser step 3. strip these three characters no matter what before the parsing state machine starts.

v = SplitResult('http', netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
Expand All @@ -460,8 +466,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
# not a port number
scheme, url = url[:i].lower(), rest

for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
url = _remove_unsafe_bytes_from_url(url)

if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
Expand Down