From e1459a019fd088248e0f6b08c15198e1a181d6de Mon Sep 17 00:00:00 2001 From: Danny Hermes Date: Fri, 20 Oct 2017 13:30:25 -0700 Subject: [PATCH 1/2] Add requests/urllib3 work-around for intercepting gzipped bytes. --- google/resumable_media/requests/download.py | 62 ++++++++++++++++++++- nox.py | 15 +++-- tests/system/requests/test_download.py | 5 +- tests/unit/requests/test_download.py | 53 +++++++++++++++++- 4 files changed, 125 insertions(+), 10 deletions(-) diff --git a/google/resumable_media/requests/download.py b/google/resumable_media/requests/download.py index 178678cc..2621dce3 100644 --- a/google/resumable_media/requests/download.py +++ b/google/resumable_media/requests/download.py @@ -18,6 +18,8 @@ import hashlib import logging +import urllib3.response + from google.resumable_media import _download from google.resumable_media import common from google.resumable_media.requests import _helpers @@ -113,11 +115,14 @@ def _write_to_stream(self, response): else: md5_hash = hashlib.md5() with response: + # NOTE: This might "donate" ``md5_hash`` to the decoder and replace + # it with a ``_DoNothingHash``. + local_hash = _add_decoder(response.raw, md5_hash) body_iter = response.iter_content( chunk_size=_SINGLE_GET_CHUNK_SIZE, decode_unicode=False) for chunk in body_iter: self._stream.write(chunk) - md5_hash.update(chunk) + local_hash.update(chunk) if expected_md5_hash is None: return @@ -286,3 +291,58 @@ def update(self, unused_chunk): Args: unused_chunk (bytes): A chunk of data. """ + + +def _add_decoder(response_raw, md5_hash): + """Patch the ``_decoder`` on a ``urllib3`` response. + + This is so that we can intercept the compressed bytes before they are + decoded. + + Only patches if the content encoding is ``gzip``. + + Args: + response_raw (urllib3.response.HTTPResponse): The raw response for + an HTTP request. + md5_hash (Union[_DoNothingHash, hashlib.md5]): A hash function which + will get updated when it encounters compressed bytes. + + Returns: + Union[_DoNothingHash, hashlib.md5]: Either the original ``md5_hash`` + if ``_decoder`` is not patched. Otherwise, returns a ``_DoNothingHash`` + since the caller will no longer need to hash to decoded bytes. + """ + encoding = response_raw.headers.get(u'content-encoding', u'').lower() + if encoding != u'gzip': + return md5_hash + + response_raw._decoder = GzipDecoder(md5_hash) + return _DoNothingHash() + + +class GzipDecoder(urllib3.response.GzipDecoder): + """Custom subclass of ``urllib3`` decoder for ``gzip``-ed bytes. + + Allows an MD5 hash function to see the compressed bytes before they are + decoded. This way the hash of the compressed value can be computed. + + Args: + md5_hash (Union[_DoNothingHash, hashlib.md5]): A hash function which + will get updated when it encounters compressed bytes. + """ + + def __init__(self, md5_hash): + super(GzipDecoder, self).__init__() + self._md5_hash = md5_hash + + def decompress(self, data): + """Decompress the bytes. + + Args: + data (bytes): The compressed bytes to be decompressed. + + Returns: + bytes: The decompressed bytes from ``data``. + """ + self._md5_hash.update(data) + return super(GzipDecoder, self).decompress(data) diff --git a/nox.py b/nox.py index 8b98e969..e0ed0437 100644 --- a/nox.py +++ b/nox.py @@ -22,6 +22,8 @@ 'GOOGLE_RESUMABLE_MEDIA_BUCKET', 'GOOGLE_APPLICATION_CREDENTIALS', ) +REQUESTS = 'requests >= 2.18.0, < 3.0.0dev' +GOOGLE_AUTH = 'google-auth >= 0.10.0' @nox.session @@ -33,7 +35,7 @@ def unit_tests(session, python_version): session.interpreter = 'python{}'.format(python_version) # Install all test dependencies, then install this package in-place. - session.install('mock', 'pytest', 'pytest-cov') + session.install('mock', 'pytest', 'pytest-cov', REQUESTS) session.install('-e', '.') # Run py.test against the unit tests. @@ -63,7 +65,11 @@ def docs(session): # Install Sphinx and other dependencies. session.chdir(os.path.realpath(os.path.dirname(__file__))) session.install( - 'sphinx', 'sphinx_rtd_theme', 'sphinx-docstring-typing >= 0.0.3') + 'sphinx', + 'sphinx_rtd_theme', + 'sphinx-docstring-typing >= 0.0.3', + REQUESTS, + ) session.install('-e', '.') # Build the docs! @@ -82,7 +88,8 @@ def doctest(session): 'sphinx_rtd_theme', 'sphinx-docstring-typing >= 0.0.3', 'mock', - 'google-auth' + GOOGLE_AUTH, + REQUESTS, ) session.install('-e', '.') @@ -142,7 +149,7 @@ def system_tests(session, python_version): # Install all test dependencies, then install this package into the # virutalenv's dist-packages. - session.install('mock', 'pytest', 'requests', 'google-auth >= 0.10.0') + session.install('mock', 'pytest', REQUESTS, GOOGLE_AUTH) session.install('-e', '.') # Run py.test against the system tests. diff --git a/tests/system/requests/test_download.py b/tests/system/requests/test_download.py index 4f37fa6e..53fefb7c 100644 --- a/tests/system/requests/test_download.py +++ b/tests/system/requests/test_download.py @@ -59,8 +59,7 @@ u'path': os.path.realpath(os.path.join(DATA_DIR, u'file.txt.gz')), u'uncompressed': os.path.realpath(os.path.join(DATA_DIR, u'file.txt')), u'content_type': PLAIN_TEXT, - # NOTE: This **should** be u'KHRs/+ZSrc/FuuR4qz/PZQ=='. - u'checksum': u'XHSHAr/SpIeZtZbjgQ4nGw==', + u'checksum': u'KHRs/+ZSrc/FuuR4qz/PZQ==', u'slices': (), u'metadata': { u'contentEncoding': u'gzip', @@ -96,7 +95,7 @@ class CorruptingAuthorizedSession(tr_requests.AuthorizedSession): def request(self, method, url, data=None, headers=None, **kwargs): """Implementation of Requests' request.""" response = tr_requests.AuthorizedSession.request( - self, method, url, data=data, headers=headers) + self, method, url, data=data, headers=headers, **kwargs) response.headers[download_mod._HASH_HEADER] = ( u'md5={}'.format(self.EMPTY_HASH)) return response diff --git a/tests/unit/requests/test_download.py b/tests/unit/requests/test_download.py index 5fc60801..2124a9c5 100644 --- a/tests/unit/requests/test_download.py +++ b/tests/unit/requests/test_download.py @@ -256,8 +256,15 @@ def _mock_response(self, start_byte, end_byte, total_bytes, response_headers = self._response_headers( start_byte, end_byte, total_bytes) return mock.Mock( - content=content, headers=response_headers, status_code=status_code, - spec=[u'content', u'headers', u'status_code']) + content=content, + headers=response_headers, + status_code=status_code, + spec=[ + u'content', + u'headers', + u'status_code', + ], + ) def test_consume_next_chunk_already_finished(self): download = download_mod.ChunkedDownload(EXAMPLE_URL, 512, None) @@ -355,20 +362,62 @@ def test__DoNothingHash(): assert return_value is None +class Test__add_decoder(object): + + def test_non_gzipped(self): + response_raw = mock.Mock(headers={}, spec=[u'headers']) + md5_hash = download_mod._add_decoder( + response_raw, mock.sentinel.md5_hash) + + assert md5_hash is mock.sentinel.md5_hash + + def test_gzipped(self): + headers = {u'content-encoding': u'gzip'} + response_raw = mock.Mock( + headers=headers, spec=[u'headers', u'_decoder']) + md5_hash = download_mod._add_decoder( + response_raw, mock.sentinel.md5_hash) + + assert md5_hash is not mock.sentinel.md5_hash + assert isinstance(md5_hash, download_mod._DoNothingHash) + assert isinstance(response_raw._decoder, download_mod.GzipDecoder) + assert response_raw._decoder._md5_hash is mock.sentinel.md5_hash + + +class TestGzipDecoder(object): + + def test_constructor(self): + decoder = download_mod.GzipDecoder(mock.sentinel.md5_hash) + assert decoder._md5_hash is mock.sentinel.md5_hash + + def test_decompress(self): + md5_hash = mock.Mock(spec=['update']) + decoder = download_mod.GzipDecoder(md5_hash) + + data = b'\x1f\x8b\x08\x08' + result = decoder.decompress(data) + + assert result == b'' + md5_hash.update.assert_called_once_with(data) + + def _mock_response(status_code=http_client.OK, chunks=(), headers=None): if headers is None: headers = {} if chunks: + mock_raw = mock.Mock(headers=headers, spec=[u'headers']) response = mock.MagicMock( headers=headers, status_code=int(status_code), + raw=mock_raw, spec=[ u'__enter__', u'__exit__', u'iter_content', u'status_code', u'headers', + u'raw', ], ) # i.e. context manager returns ``self``. From 0f1a18d85b7591471d3af90788e20d8d2329e32a Mon Sep 17 00:00:00 2001 From: Danny Hermes Date: Fri, 20 Oct 2017 13:36:21 -0700 Subject: [PATCH 2/2] Making our GzipDecoder non-public. --- google/resumable_media/requests/download.py | 8 ++++---- tests/unit/requests/test_download.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/google/resumable_media/requests/download.py b/google/resumable_media/requests/download.py index 2621dce3..d0bcd990 100644 --- a/google/resumable_media/requests/download.py +++ b/google/resumable_media/requests/download.py @@ -316,11 +316,11 @@ def _add_decoder(response_raw, md5_hash): if encoding != u'gzip': return md5_hash - response_raw._decoder = GzipDecoder(md5_hash) + response_raw._decoder = _GzipDecoder(md5_hash) return _DoNothingHash() -class GzipDecoder(urllib3.response.GzipDecoder): +class _GzipDecoder(urllib3.response.GzipDecoder): """Custom subclass of ``urllib3`` decoder for ``gzip``-ed bytes. Allows an MD5 hash function to see the compressed bytes before they are @@ -332,7 +332,7 @@ class GzipDecoder(urllib3.response.GzipDecoder): """ def __init__(self, md5_hash): - super(GzipDecoder, self).__init__() + super(_GzipDecoder, self).__init__() self._md5_hash = md5_hash def decompress(self, data): @@ -345,4 +345,4 @@ def decompress(self, data): bytes: The decompressed bytes from ``data``. """ self._md5_hash.update(data) - return super(GzipDecoder, self).decompress(data) + return super(_GzipDecoder, self).decompress(data) diff --git a/tests/unit/requests/test_download.py b/tests/unit/requests/test_download.py index 2124a9c5..4e1715cd 100644 --- a/tests/unit/requests/test_download.py +++ b/tests/unit/requests/test_download.py @@ -380,19 +380,19 @@ def test_gzipped(self): assert md5_hash is not mock.sentinel.md5_hash assert isinstance(md5_hash, download_mod._DoNothingHash) - assert isinstance(response_raw._decoder, download_mod.GzipDecoder) + assert isinstance(response_raw._decoder, download_mod._GzipDecoder) assert response_raw._decoder._md5_hash is mock.sentinel.md5_hash -class TestGzipDecoder(object): +class Test_GzipDecoder(object): def test_constructor(self): - decoder = download_mod.GzipDecoder(mock.sentinel.md5_hash) + decoder = download_mod._GzipDecoder(mock.sentinel.md5_hash) assert decoder._md5_hash is mock.sentinel.md5_hash def test_decompress(self): md5_hash = mock.Mock(spec=['update']) - decoder = download_mod.GzipDecoder(md5_hash) + decoder = download_mod._GzipDecoder(md5_hash) data = b'\x1f\x8b\x08\x08' result = decoder.decompress(data)