diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 8ad646e2..6963ae4f 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -37,3 +37,4 @@ In chronological order: - Added support for upgrade of plaintext HTTP/1.1 to plaintext HTTP/2. - Added proxy support. + - Improved IPv6 support. diff --git a/NOTICES b/NOTICES index dd28748f..a62c7ba6 100644 --- a/NOTICES +++ b/NOTICES @@ -20,3 +20,19 @@ PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +rfc3986: + +Copyright 2014 Ian Cordasco, Rackspace + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
def to_host_port_tuple(host_port_str, default_port=80):
    """
    Split a ``host[:port]`` string into a ``(host, port)`` tuple.

    The string is handed to ``URIReference`` as the authority component of
    an otherwise empty URI reference, and the host and port are read back
    from it.

    :param host_port_str: String holding a host and, optionally, a port.
    :param default_port: Value returned as the port when the string does
        not carry one.
    :returns: ``(host, port)``. Any ``[`` / ``]`` characters surrounding the
        host (as used for IPv6 literals) are stripped; a port found in the
        string is converted with ``int``.
    """
    reference = URIReference(None, host_port_str, None, None, None)

    # Square brackets only delimit an IP literal inside an authority; they
    # are not part of the address itself.
    host = reference.host.strip('[]')
    port = int(reference.port) if reference.port else default_port

    return host, port
if proxy_host: if proxy_port is None: - try: - self.proxy_host, self.proxy_port = proxy_host.split(':') - except ValueError: - self.proxy_host, self.proxy_port = proxy_host, 8080 - else: - self.proxy_port = int(self.proxy_port) + self.proxy_host, self.proxy_port = to_host_port_tuple(proxy_host, default_port=8080) else: self.proxy_host, self.proxy_port = proxy_host, proxy_port else: diff --git a/hyper/http20/connection.py b/hyper/http20/connection.py index 1f3ee7e1..d5b10b09 100644 --- a/hyper/http20/connection.py +++ b/hyper/http20/connection.py @@ -9,6 +9,7 @@ from ..common.exceptions import ConnectionResetError from ..common.bufsocket import BufferedSocket from ..common.headers import HTTPHeaderMap +from ..common.util import to_host_port_tuple from ..packages.hyperframe.frame import ( FRAMES, DataFrame, HeadersFrame, PushPromiseFrame, RstStreamFrame, SettingsFrame, Frame, WindowUpdateFrame, GoAwayFrame, PingFrame, @@ -67,11 +68,7 @@ def __init__(self, host, port=None, secure=None, window_manager=None, enable_pus Creates an HTTP/2 connection to a specific server. """ if port is None: - try: - self.host, self.port = host.split(':') - self.port = int(self.port) - except ValueError: - self.host, self.port = host, 443 + self.host, self.port = to_host_port_tuple(host, default_port=443) else: self.host, self.port = host, port @@ -88,12 +85,7 @@ def __init__(self, host, port=None, secure=None, window_manager=None, enable_pus # Setup proxy details if applicable. 
if proxy_host: if proxy_port is None: - try: - self.proxy_host, self.proxy_port = proxy_host.split(':') - except ValueError: - self.proxy_host, self.proxy_port = proxy_host, 8080 - else: - self.proxy_port = int(self.proxy_port) + self.proxy_host, self.proxy_port = to_host_port_tuple(proxy_host, default_port=8080) else: self.proxy_host, self.proxy_port = proxy_host, proxy_port else: diff --git a/hyper/packages/rfc3986/LICENSE b/hyper/packages/rfc3986/LICENSE new file mode 100644 index 00000000..72ce24cf --- /dev/null +++ b/hyper/packages/rfc3986/LICENSE @@ -0,0 +1,13 @@ +Copyright 2014 Ian Cordasco, Rackspace + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/hyper/packages/rfc3986/__init__.py b/hyper/packages/rfc3986/__init__.py new file mode 100644 index 00000000..a3aea4c4 --- /dev/null +++ b/hyper/packages/rfc3986/__init__.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2014 Rackspace +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def uri_reference(uri, encoding='utf-8'):
    """Parse a URI string into a URIReference.

    Convenience wrapper around ``URIReference.from_string``.

    :param str uri: The URI which needs to be parsed into a reference.
    :param str encoding: The encoding of the string provided
    :returns: A parsed URI
    :rtype: :class:`URIReference`
    """
    reference = URIReference.from_string(uri, encoding)
    return reference


def is_valid_uri(uri, encoding='utf-8', **kwargs):
    """Determine if the URI given is valid.

    Convenience wrapper: parses ``uri`` with ``URIReference.from_string``
    and forwards every extra keyword argument to ``is_valid`` on the result.

    :param str uri: The URI to be validated.
    :param str encoding: The encoding of the string provided
    :param bool require_scheme: Set to ``True`` if you wish to require the
        presence of the scheme component.
    :param bool require_authority: Set to ``True`` if you wish to require the
        presence of the authority component.
    :param bool require_path: Set to ``True`` if you wish to require the
        presence of the path component.
    :param bool require_query: Set to ``True`` if you wish to require the
        presence of the query component.
    :param bool require_fragment: Set to ``True`` if you wish to require the
        presence of the fragment component.
    :returns: ``True`` if the URI is valid, ``False`` otherwise.
    :rtype: bool
    """
    reference = URIReference.from_string(uri, encoding)
    return reference.is_valid(**kwargs)


def normalize_uri(uri, encoding='utf-8'):
    """Normalize the given URI.

    Convenience wrapper equivalent to
    ``URIReference.from_string(uri).normalize().unsplit()``.

    :param str uri: The URI to be normalized.
    :param str encoding: The encoding of the string provided
    :returns: The normalized URI.
    :rtype: str
    """
    return URIReference.from_string(uri, encoding).normalize().unsplit()


def urlparse(uri, encoding='utf-8'):
    """Parse a given URI and return a ParseResult.

    This is a partial replacement of the standard library's urlparse
    function; parsing is requested with ``strict=False``.

    :param str uri: The URI to be parsed.
    :param str encoding: The encoding of the string provided.
    :returns: A parsed URI
    :rtype: :class:`~rfc3986.parseresult.ParseResult`
    """
    result = ParseResult.from_string(uri, encoding, strict=False)
    return result
# --- rfc3986/compat.py ------------------------------------------------------

if sys.version_info >= (3, 0):
    unicode = str  # Python 3.x


def to_str(b, encoding):
    """Return ``b`` as text, decoding it with ``encoding`` when needed.

    :param b: Value to convert. Anything that has a ``decode`` method and is
        not already ``unicode`` is decoded; every other value (including
        ``None``) is returned untouched.
    :param str encoding: Codec used to decode ``b``.
    :returns: The decoded text, or ``b`` itself when no decoding applies.
    """
    if hasattr(b, 'decode') and not isinstance(b, unicode):
        # Honour the caller's codec; this used to be hard-coded to UTF-8,
        # silently ignoring the ``encoding`` argument.
        b = b.decode(encoding)
    return b


def to_bytes(s, encoding):
    """Return ``s`` as bytes, encoding it with ``encoding`` when needed.

    :param s: Value to convert. Anything that has an ``encode`` method and
        is not already ``bytes`` is encoded; every other value (including
        ``None``) is returned untouched.
    :param str encoding: Codec used to encode ``s``.
    :returns: The encoded bytes, or ``s`` itself when no encoding applies.
    """
    if hasattr(s, 'encode') and not isinstance(s, bytes):
        # Honour the caller's codec; this used to be hard-coded to UTF-8,
        # silently ignoring the ``encoding`` argument.
        s = s.encode(encoding)
    return s


# --- rfc3986/exceptions.py --------------------------------------------------

class RFC3986Exception(Exception):
    """Base class for every exception defined by this package."""
    pass


class InvalidAuthority(RFC3986Exception):
    """Raised for an authority component that is not valid."""

    def __init__(self, authority):
        super(InvalidAuthority, self).__init__(
            "The authority ({0}) is not valid.".format(authority))


class InvalidPort(RFC3986Exception):
    """Raised for a port that is not valid."""

    def __init__(self, port):
        super(InvalidPort, self).__init__(
            'The port ("{0}") is not valid.'.format(port))


class ResolutionError(RFC3986Exception):
    """Raised when a URI that is not absolute is used where one is needed.

    ``uri`` must provide an ``unsplit()`` method; its result is embedded in
    the message.
    """

    def __init__(self, uri):
        super(ResolutionError, self).__init__(
            "{0} is not an absolute URI.".format(uri.unsplit()))
# These are enumerated for the named tuple used as a superclass of
# URIReference
URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']

# NOTE: every pattern fragment that contains a backslash is a raw string so
# that the backslash reaches the regex engine verbatim instead of relying on
# Python passing unknown string escapes through.
important_characters = {
    'generic_delimiters': ":/?#[]@",
    'sub_delimiters': "!$&'()*+,;=",
    # We need to escape the '*' in this case
    're_sub_delimiters': r"!$&'()\*+,;=",
    'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
                         '0123456789._~-'),
    # We need to escape the '-' in this case:
    're_unreserved': r'A-Za-z0-9._~\-',
}
# For details about delimiters and reserved characters, see:
# http://tools.ietf.org/html/rfc3986#section-2.2
GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
SUB_DELIMITERS = set(important_characters['sub_delimiters'])
RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
# For details about unreserved characters, see:
# http://tools.ietf.org/html/rfc3986#section-2.3
UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')

# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
component_pattern_dict = {
    'scheme': '[^:/?#]+',
    'authority': '[^/?#]*',
    'path': '[^?#]*',
    'query': '[^#]*',
    'fragment': '.*',
}

# See http://tools.ietf.org/html/rfc3986#appendix-B
# In this case, we name each of the important matches so we can use
# SRE_Match#groupdict to parse the values out if we so choose. This is also
# modified to ignore other matches that are not important to the parsing of
# the reference so we can also simply use SRE_Match#groups.
# The group names mirror URI_COMPONENTS.
expression = (r'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
              r'(?P<path>{path})(?:\?(?P<query>{query}))?'
              r'(?:#(?P<fragment>{fragment}))?'
              ).format(**component_pattern_dict)

URI_MATCHER = re.compile(expression)

# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# The pattern for a regular name, e.g., www.google.com, api.github.com
reg_name = '(({0})*|[{1}]*)'.format(
    '%[0-9A-Fa-f]{2}',
    important_characters['re_sub_delimiters'] +
    important_characters['re_unreserved']
)
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1.
# The separator must be a literal dot: an unescaped '.' would accept any
# character between the octets (e.g. "1a2b3c4").
ipv4 = r'(\d{1,3}\.){3}\d{1,3}'
# Hexadecimal characters used in each piece of an IPv6 address
hexdig = '[0-9A-Fa-f]{1,4}'
# Least-significant 32 bits of an IPv6 address
ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
# Substitutions into the following patterns for IPv6 patterns defined
# http://tools.ietf.org/html/rfc3986#page-20
subs = {'hex': hexdig, 'ls32': ls32}

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments
variations = [
    #                            6( h16 ":" ) ls32
    '(%(hex)s:){6}%(ls32)s' % subs,
    #                       "::" 5( h16 ":" ) ls32
    '::(%(hex)s:){5}%(ls32)s' % subs,
    # [               h16 ] "::" 4( h16 ":" ) ls32
    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
    # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
    # [ *4( h16 ":" ) h16 ] "::"              ls32
    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
    # [ *5( h16 ":" ) h16 ] "::"              h16
    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
    # [ *6( h16 ":" ) h16 ] "::"
    '((%(hex)s:){0,6}%(hex)s)?::' % subs,
]

# Join *every* variation. A fixed '{0}'..'{7}' template silently dropped the
# ninth (trailing "::") form, so literals such as "[::]" never matched.
ipv6 = '({0})'.format('|'.join('({0})'.format(v) for v in variations))

# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
# The dot is a literal separator and therefore escaped.
ipv_future = r'v[0-9A-Fa-f]+\.[%s]+' % (
    important_characters['re_unreserved'] +
    important_characters['re_sub_delimiters'] +
    ':')

ip_literal = r'\[({0}|{1})\]'.format(ipv6, ipv_future)

# Pattern for matching the host piece of the authority
HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)

# The group names are the keys looked up on the parsed sub-authority
# ('userinfo', 'host', 'port').
SUBAUTHORITY_MATCHER = re.compile((
    r'^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?'  # userinfo
    r'(?P<host>{0}?)'  # host
    r':?(?P<port>\d+)?$'  # port
    ).format(HOST_PATTERN))

IPv4_MATCHER = re.compile('^' + ipv4 + '$')


# ####################
# Path Matcher Section
# ####################

# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
# about the path patterns defined below.

# Percent encoded character values
pct_encoded = '%[A-Fa-f0-9]{2}'
pchar = ('([' + important_characters['re_unreserved']
         + important_characters['re_sub_delimiters']
         + ':@]|%s)' % pct_encoded)
segments = {
    'segment': pchar + '*',
    # Non-zero length segment
    'segment-nz': pchar + '+',
    # Non-zero length segment without ":"
    'segment-nz-nc': pchar.replace(':', '') + '+'
}

# Path types taken from Section 3.3 (linked above)
path_empty = '^$'
path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
path_absolute = '/(%s)?' % path_rootless
path_abempty = '(/%(segment)s)*' % segments

# Matcher used to validate path components
PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
    path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
))


# ##################################
# Query and Fragment Matcher Section
# ##################################

QUERY_MATCHER = re.compile(
    '^([/?:@' + important_characters['re_unreserved']
    + important_characters['re_sub_delimiters']
    + ']|%s)*$' % pct_encoded)

FRAGMENT_MATCHER = QUERY_MATCHER

# Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
SCHEME_MATCHER = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*$')

# Relative reference matcher

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
relative_part = '(//%s%s|%s|%s|%s)' % (
    component_pattern_dict['authority'], path_abempty, path_absolute,
    path_noscheme, path_empty
)

# QUERY_MATCHER / FRAGMENT_MATCHER are anchored with '^' and '$'; strip the
# anchors ([1:-1]) before embedding them mid-pattern, exactly as
# ABSOLUTE_URI_MATCHER below does. With the anchors left in, no reference
# carrying a query or fragment could ever match.
RELATIVE_REF_MATCHER = re.compile(r'^%s(\?%s)?(#%s)?$' % (
    relative_part, QUERY_MATCHER.pattern[1:-1],
    FRAGMENT_MATCHER.pattern[1:-1]
))

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
hier_part = '(//%s%s|%s|%s|%s)' % (
    component_pattern_dict['authority'], path_abempty, path_absolute,
    path_rootless, path_empty
)

# See http://tools.ietf.org/html/rfc3986#section-4.3
ABSOLUTE_URI_MATCHER = re.compile(r'^%s:%s(\?%s)?$' % (
    component_pattern_dict['scheme'], hier_part, QUERY_MATCHER.pattern[1:-1]
))


# Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3
def merge_paths(base_uri, relative_path):
    """Merge a base URI's path with a relative URI's path.

    :param base_uri: Object exposing ``path`` and ``authority`` attributes.
    :param str relative_path: Path of the relative reference.
    :returns: The merged path.
    :rtype: str
    """
    if base_uri.path is None and base_uri.authority is not None:
        return '/' + relative_path

    path = base_uri.path or ''
    index = path.rfind('/')
    if index == -1:
        # No "/" in the base path: RFC 3986 section 5.2.3 says to exclude
        # the entire base path. Slicing with -1 here would instead chop the
        # last character off the base path and keep the rest.
        return relative_path
    return path[:index] + '/' + relative_path
def normalize_scheme(scheme):
    """Return ``scheme`` lower-cased."""
    return scheme.lower()


def normalize_authority(authority):
    """Build a normalized authority string from its three parts.

    :param authority: ``(userinfo, host, port)`` triple. Falsy parts are
        left out. ``port`` is concatenated as-is, so it must already be a
        string.
    :returns: ``[userinfo@]host[:port]`` with the percent-escapes in the
        userinfo upper-cased and the host lower-cased.
    :rtype: str
    """
    userinfo, host, port = authority
    result = ''
    if userinfo:
        result += normalize_percent_characters(userinfo) + '@'
    if host:
        result += host.lower()
    if port:
        result += ':' + port
    return result


def normalize_path(path):
    """Upper-case percent-escapes in ``path`` and remove its dot segments.

    A falsy ``path`` (``None`` or ``''``) is returned unchanged.
    """
    if not path:
        return path

    path = normalize_percent_characters(path)
    return remove_dot_segments(path)


def normalize_query(query):
    """Upper-case percent-escapes in ``query``.

    A falsy ``query`` (``None`` or ``''``) is returned unchanged, mirroring
    ``normalize_path``, instead of failing inside the regex engine.
    """
    if not query:
        return query
    return normalize_percent_characters(query)


def normalize_fragment(fragment):
    """Upper-case percent-escapes in ``fragment``.

    A falsy ``fragment`` (``None`` or ``''``) is returned unchanged,
    mirroring ``normalize_path``, instead of failing inside the regex engine.
    """
    if not fragment:
        return fragment
    return normalize_percent_characters(fragment)


PERCENT_MATCHER = re.compile('%[A-Fa-f0-9]{2}')


def _upper_match(match):
    """Return the text matched by ``match`` in upper case."""
    return match.group(0).upper()


def normalize_percent_characters(s):
    """All percent characters should be upper-cased.

    For example, ``"%3afoo%DF%ab"`` should be turned into ``"%3Afoo%DF%AB"``.
    """
    # One pass over the string; only text matched as a percent-escape is
    # touched.
    return PERCENT_MATCHER.sub(_upper_match, s)


def remove_dot_segments(s):
    """Remove ``.`` and ``..`` segments from the path ``s``."""
    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
    segments = s.split('/')  # Turn the path into a list of segments
    output = []  # Initialize the variable to use to store output

    for segment in segments:
        # '.' is the current directory, so ignore it, it is superfluous
        if segment == '.':
            continue
        # Anything other than '..', should be appended to the output
        elif segment != '..':
            output.append(segment)
        # In this case segment == '..', if we can, we should pop the last
        # element
        elif output:
            output.pop()

    # If the path starts with '/' and the output is empty or the first string
    # is non-empty
    if s.startswith('/') and (not output or output[0]):
        output.insert(0, '')

    # If the path ends with '/.' or '/..' ensure we add one more empty
    # string to add a trailing '/'
    if s.endswith(('/.', '/..')):
        output.append('')

    return '/'.join(output)


def encode_component(uri_component, encoding):
    """Percent-encode every byte of ``uri_component`` that needs it.

    :param uri_component: Component to encode; ``None`` is returned as-is.
    :param str encoding: Codec used to turn the component into bytes and to
        decode the encoded result back into text.
    :returns: The component with each byte that is either >= 128 or not in
        ``NON_PCT_ENCODED`` replaced by ``%XX``.
    """
    if uri_component is None:
        return uri_component

    uri_bytes = to_bytes(uri_component, encoding)

    encoded_uri = bytearray()

    # Iterating a bytearray yields integers on both Python 2 & 3
    for byte_ord in bytearray(uri_bytes):
        if byte_ord < 128 and chr(byte_ord) in NON_PCT_ENCODED:
            encoded_uri.append(byte_ord)
            continue
        # Upper-case hex digits: that is the form
        # normalize_percent_characters produces and RFC 3986 section 2.1
        # recommends, so encoded output is already normalized.
        encoded_uri.extend('%{0:02X}'.format(byte_ord).encode())

    return encoded_uri.decode(encoding)
+# See the License for the specific language governing permissions and +# limitations under the License. +from collections import namedtuple + +from . import compat +from . import exceptions +from . import normalizers +from . import uri + +__all__ = ('ParseResult', 'ParseResultBytes') + +PARSED_COMPONENTS = ('scheme', 'userinfo', 'host', 'port', 'path', 'query', + 'fragment') + + +class ParseResultMixin(object): + def _generate_authority(self, attributes): + # I swear I did not align the comparisons below. That's just how they + # happened to align based on pep8 and attribute lengths. + userinfo, host, port = (attributes[p] + for p in ('userinfo', 'host', 'port')) + if (self.userinfo != userinfo or + self.host != host or + self.port != port): + if port: + port = '{0}'.format(port) + return normalizers.normalize_authority( + (compat.to_str(userinfo, self.encoding), + compat.to_str(host, self.encoding), + port) + ) + return self.authority + + def geturl(self): + """Standard library shim to the unsplit method.""" + return self.unsplit() + + @property + def hostname(self): + """Standard library shim for the host portion of the URI.""" + return self.host + + @property + def netloc(self): + """Standard library shim for the authority portion of the URI.""" + return self.authority + + @property + def params(self): + """Standard library shim for the query portion of the URI.""" + return self.query + + +class ParseResult(namedtuple('ParseResult', PARSED_COMPONENTS), + ParseResultMixin): + slots = () + + def __new__(cls, scheme, userinfo, host, port, path, query, fragment, + uri_ref, encoding='utf-8'): + parse_result = super(ParseResult, cls).__new__( + cls, + scheme or None, + userinfo or None, + host, + port or None, + path or None, + query or None, + fragment or None) + parse_result.encoding = encoding + parse_result.reference = uri_ref + return parse_result + + @classmethod + def from_string(cls, uri_string, encoding='utf-8', strict=True): + """Parse a URI from the given 
unicode URI string. + + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :param bool strict: Parse strictly according to :rfc:`3986` if True. + If False, parse similarly to the standard library's urlparse + function. + :returns: :class:`ParseResult` or subclass thereof + """ + reference = uri.URIReference.from_string(uri_string, encoding) + try: + subauthority = reference.authority_info() + except exceptions.InvalidAuthority: + if strict: + raise + userinfo, host, port = split_authority(reference.authority) + else: + # Thanks to Richard Barrell for this idea: + # https://twitter.com/0x2ba22e11/status/617338811975139328 + userinfo, host, port = (subauthority.get(p) + for p in ('userinfo', 'host', 'port')) + + if port: + try: + port = int(port) + except ValueError: + raise exceptions.InvalidPort(port) + + return cls(scheme=reference.scheme, + userinfo=userinfo, + host=host, + port=port, + path=reference.path, + query=reference.query, + fragment=reference.fragment, + uri_ref=reference, + encoding=encoding) + + @property + def authority(self): + """Normalized authority generated from the subauthority parts.""" + return self.reference.authority + + def copy_with(self, scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment=None): + attributes = zip(PARSED_COMPONENTS, + (scheme, userinfo, host, port, path, query, fragment)) + attrs_dict = {} + for name, value in attributes: + if value is None: + value = getattr(self, name) + attrs_dict[name] = value + authority = self._generate_authority(attrs_dict) + ref = self.reference.copy_with(scheme=attrs_dict['scheme'], + authority=authority, + path=attrs_dict['path'], + query=attrs_dict['query'], + fragment=attrs_dict['fragment']) + return ParseResult(uri_ref=ref, encoding=self.encoding, **attrs_dict) + + def encode(self, encoding=None): + encoding = encoding or self.encoding + attrs = dict( + zip(PARSED_COMPONENTS, + 
(attr.encode(encoding) if hasattr(attr, 'encode') else attr + for attr in self))) + return ParseResultBytes( + uri_ref=self.reference, + encoding=encoding, + **attrs + ) + + def unsplit(self, use_idna=False): + """Create a URI string from the components. + + :returns: The parsed URI reconstituted as a string. + :rtype: str + """ + parse_result = self + if use_idna and self.host: + hostbytes = self.host.encode('idna') + host = hostbytes.decode(self.encoding) + parse_result = self.copy_with(host=host) + return parse_result.reference.unsplit() + + +class ParseResultBytes(namedtuple('ParseResultBytes', PARSED_COMPONENTS), + ParseResultMixin): + def __new__(cls, scheme, userinfo, host, port, path, query, fragment, + uri_ref, encoding='utf-8'): + parse_result = super(ParseResultBytes, cls).__new__( + cls, + scheme or None, + userinfo or None, + host, + port or None, + path or None, + query or None, + fragment or None) + parse_result.encoding = encoding + parse_result.reference = uri_ref + return parse_result + + @classmethod + def from_string(cls, uri_string, encoding='utf-8', strict=True): + """Parse a URI from the given unicode URI string. + + :param str uri_string: Unicode URI to be parsed into a reference. + :param str encoding: The encoding of the string provided + :param bool strict: Parse strictly according to :rfc:`3986` if True. + If False, parse similarly to the standard library's urlparse + function. 
+ :returns: :class:`ParseResultBytes` or subclass thereof + """ + reference = uri.URIReference.from_string(uri_string, encoding) + try: + subauthority = reference.authority_info() + except exceptions.InvalidAuthority: + if strict: + raise + userinfo, host, port = split_authority(reference.authority) + else: + # Thanks to Richard Barrell for this idea: + # https://twitter.com/0x2ba22e11/status/617338811975139328 + userinfo, host, port = (subauthority.get(p) + for p in ('userinfo', 'host', 'port')) + + if port: + try: + port = int(port) + except ValueError: + raise exceptions.InvalidPort(port) + + to_bytes = compat.to_bytes + return cls(scheme=to_bytes(reference.scheme, encoding), + userinfo=to_bytes(userinfo, encoding), + host=to_bytes(host, encoding), + port=port, + path=to_bytes(reference.path, encoding), + query=to_bytes(reference.query, encoding), + fragment=to_bytes(reference.fragment, encoding), + uri_ref=reference, + encoding=encoding) + + @property + def authority(self): + """Normalized authority generated from the subauthority parts.""" + return self.reference.authority.encode(self.encoding) + + def copy_with(self, scheme=None, userinfo=None, host=None, port=None, + path=None, query=None, fragment=None): + attributes = zip(PARSED_COMPONENTS, + (scheme, userinfo, host, port, path, query, fragment)) + attrs_dict = {} + for name, value in attributes: + if value is None: + value = getattr(self, name) + if not isinstance(value, bytes) and hasattr(value, 'encode'): + value = value.encode(self.encoding) + attrs_dict[name] = value + authority = self._generate_authority(attrs_dict) + to_str = compat.to_str + ref = self.reference.copy_with( + scheme=to_str(attrs_dict['scheme'], self.encoding), + authority=authority, + path=to_str(attrs_dict['path'], self.encoding), + query=to_str(attrs_dict['query'], self.encoding), + fragment=to_str(attrs_dict['fragment'], self.encoding) + ) + return ParseResultBytes( + uri_ref=ref, + encoding=self.encoding, + **attrs_dict + ) + + 
def split_authority(authority):
    """Split an authority string into ``(userinfo, host, port)``.

    This is a lenient, non-validating split: everything before the last
    ``@`` is the userinfo, a leading ``[...]`` group is kept (brackets
    included) as the host, and whatever follows the first remaining ``:``
    is the port. Components that are absent are returned as ``None``.

    :param str authority: The authority portion of a URI.
    :returns: A ``(userinfo, host, port)`` tuple; ``port`` stays a string.
    :rtype: tuple
    """
    userinfo = None
    remainder = authority
    if '@' in authority:
        userinfo, remainder = authority.rsplit('@', 1)

    host = None
    if remainder.startswith('['):
        # Bracketed (IPv6-style) host: keep the closing bracket on the host
        # and continue with whatever trails it.
        bracketed, remainder = remainder.split(']', 1)
        host = bracketed + ']'

    if ':' not in remainder:
        # No port separator: the remainder is the host unless a bracketed
        # host was already found.
        if not host and remainder:
            host = remainder
        return userinfo, host, None

    candidate, port = remainder.split(':', 1)
    if candidate and not host:
        host = candidate
    return userinfo, host, port
def __eq__(self, other):
    """Compare this reference with a tuple, a reference or a URI string.

    Tuples are expanded into a new ``URIReference``; anything else is
    parsed with ``URIReference.from_string``. Two references are equal
    when their components match exactly or match after normalization.

    :raises TypeError: If ``other`` cannot be parsed into a reference.
    """
    if isinstance(other, tuple):
        candidate = URIReference(*other)
    elif isinstance(other, URIReference):
        candidate = other
    else:
        try:
            candidate = URIReference.from_string(other)
        except TypeError:
            raise TypeError(
                'Unable to compare URIReference() to {0}()'.format(
                    type(other).__name__))

    # See http://tools.ietf.org/html/rfc3986#section-6.2
    # Cheap component-wise comparison first; only fall back to the
    # normalized comparison when that fails.
    if tuple(self) == tuple(candidate):
        return True
    return self.normalized_equality(candidate)
def authority_info(self):
    """Break the authority into ``userinfo``, ``host`` and ``port``.

    :returns:
        ``{'userinfo': 'username:password', 'host': 'www.example.com',
        'port': '80'}``; every value is ``None`` when there is no
        authority at all.
    :rtype: dict
    :raises InvalidAuthority: If the authority is present but cannot be
        parsed, or its host looks like IPv4 but has out-of-range bytes.
    """
    authority = self.authority
    if not authority:
        return dict.fromkeys(('userinfo', 'host', 'port'))

    parsed = SUBAUTHORITY_MATCHER.match(authority)
    if parsed is None:
        # The URI parser extracted an authority, but it cannot be split
        # any further, so it is not a valid authority.
        raise InvalidAuthority(authority.encode(self.encoding))

    components = parsed.groupdict()
    host = components.get('host')

    # A host that is shaped like an IPv4 address must also carry valid
    # byte values to be accepted.
    looks_like_ipv4 = bool(host) and IPv4_MATCHER.match(host)
    if looks_like_ipv4 and not valid_ipv4_host_address(host):
        raise InvalidAuthority(authority.encode(self.encoding))

    return components
+ :rtype: bool + """ + validators = [ + (self.scheme_is_valid, kwargs.get('require_scheme', False)), + (self.authority_is_valid, kwargs.get('require_authority', False)), + (self.path_is_valid, kwargs.get('require_path', False)), + (self.query_is_valid, kwargs.get('require_query', False)), + (self.fragment_is_valid, kwargs.get('require_fragment', False)), + ] + return all(v(r) for v, r in validators) + + def _is_valid(self, value, matcher, require): + if require: + return (value is not None + and matcher.match(value)) + + # require is False and value is not None + return value is None or matcher.match(value) + + def authority_is_valid(self, require=False): + """Determines if the authority component is valid. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the authority is valid. ``False`` otherwise. + :rtype: bool + """ + try: + self.authority_info() + except InvalidAuthority: + return False + + is_valid = self._is_valid(self.authority, + SUBAUTHORITY_MATCHER, + require) + + # Ensure that IPv4 addresses have valid bytes + if is_valid and self.host and IPv4_MATCHER.match(self.host): + return valid_ipv4_host_address(self.host) + + # Perhaps the host didn't exist or if it did, it wasn't an IPv4-like + # address. In either case, we want to rely on the `_is_valid` check, + # so let's return that. + return is_valid + + def scheme_is_valid(self, require=False): + """Determines if the scheme component is valid. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the scheme is valid. ``False`` otherwise. + :rtype: bool + """ + return self._is_valid(self.scheme, SCHEME_MATCHER, require) + + def path_is_valid(self, require=False): + """Determines if the path component is valid. + + :param str require: Set to ``True`` to require the presence of this + component. + :returns: ``True`` if the path is valid. ``False`` otherwise. 
def resolve_with(self, base_uri, strict=False):
    """Use an absolute URI Reference to resolve this relative reference.

    Assuming this is a relative reference that you would like to resolve,
    use the provided base URI to resolve it.

    See http://tools.ietf.org/html/rfc3986#section-5 for more information.

    :param base_uri: Either a string or URIReference. It must be an
        absolute URI or it will raise an exception.
    :param bool strict: When false, a scheme identical to the base URI's
        scheme is ignored, so the reference is resolved as if it had no
        scheme. When true, the scheme is always honoured.
    :returns: A new URIReference which is the result of resolving this
        reference using ``base_uri``.
    :rtype: :class:`URIReference`
    :raises ResolutionError: If the ``base_uri`` is not an absolute URI.
    """
    if not isinstance(base_uri, URIReference):
        base_uri = URIReference.from_string(base_uri)

    if not base_uri.is_absolute():
        raise ResolutionError(base_uri)

    # This is optional per
    # http://tools.ietf.org/html/rfc3986#section-5.2.1
    base_uri = base_uri.normalize()

    # The reference we're resolving
    resolving = self

    if not strict and resolving.scheme == base_uri.scheme:
        # copy_with() treats None as "keep the current value", so it
        # cannot be used to clear the scheme; replace the field directly.
        resolving = self._replace(scheme=None)
        # _replace() does not run __new__, so carry the encoding over.
        resolving.encoding = getattr(self, 'encoding', 'utf-8')

    # http://tools.ietf.org/html/rfc3986#page-32
    if resolving.scheme is not None:
        # Reference has its own scheme: only the path is normalized.
        return resolving.copy_with(path=normalize_path(resolving.path))

    if resolving.authority is not None:
        # Network-path reference: inherit just the scheme from the base.
        return resolving.copy_with(
            scheme=base_uri.scheme,
            path=normalize_path(resolving.path)
        )

    if resolving.path is None:
        # Empty path: reuse the base path, and the base query too unless
        # the reference supplies its own.
        if resolving.query is not None:
            query = resolving.query
        else:
            query = base_uri.query
        return resolving.copy_with(
            scheme=base_uri.scheme,
            authority=base_uri.authority,
            path=base_uri.path,
            query=query
        )

    if resolving.path.startswith('/'):
        path = normalize_path(resolving.path)
    else:
        # Relative path: merge it onto the base path before normalizing.
        path = normalize_path(merge_paths(base_uri, resolving.path))
    return resolving.copy_with(
        scheme=base_uri.scheme,
        authority=base_uri.authority,
        path=path,
        query=resolving.query
    )
def copy_with(self, scheme=None, authority=None, path=None, query=None,
              fragment=None):
    """Return a copy of this reference with some components replaced.

    Arguments left as ``None`` keep the value already stored on this
    reference (so ``None`` cannot be used to clear a component).

    :returns: A new reference carrying the same ``encoding`` as this one.
    :rtype: URIReference
    """
    candidates = (
        ('scheme', scheme),
        ('authority', authority),
        ('path', path),
        ('query', query),
        ('fragment', fragment),
    )
    replacements = dict(
        (name, value) for name, value in candidates if value is not None)
    updated = self._replace(**replacements)
    # namedtuple's _replace() builds the new tuple without calling this
    # class's __new__, so the ``encoding`` attribute set there would be
    # missing on the copy; propagate it (defaulting as __new__ does).
    updated.encoding = getattr(self, 'encoding', 'utf-8')
    return updated
def test_connections_can_parse_ipv6_hosts_and_ports(self):
    """Bracketed IPv6 literals are accepted for both host and proxy."""
    conn = HTTP20Connection(
        '[abcd:dcba::1234]',
        proxy_host='[ffff:aaaa::1]:8443',
    )

    # The target carries no explicit port, so 443 is expected; the
    # brackets are stripped from the stored host.
    assert conn.host == 'abcd:dcba::1234'
    assert conn.port == 443

    # The proxy's inline port is split off from the bracketed address.
    assert conn.proxy_host == 'ffff:aaaa::1'
    assert conn.proxy_port == 8443