From 37f34fa8d1ec9d51bf079bc75e2cac6608546872 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:52 -0500 Subject: [PATCH 01/64] Add alias to `buffer` on Python 2/3 While `buffer` is not exactly equivalent to `memoryview` on Python 3 and Python 2 already has `memoryview`, the support of the `buffer` type is much better within Python 2 much as the support of `memoryview` is much better on Python 3. Plus this is what a lot of Python 2/3 compatibility guides suggest when moving to Python 3. So this seems like a reasonable alias to add. --- numcodecs/compat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index f3b6e95c..8fae9d4a 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -14,6 +14,7 @@ if PY2: # pragma: py3 no cover + buffer = buffer text_type = unicode binary_type = str integer_types = (int, long) @@ -21,6 +22,7 @@ else: # pragma: py2 no cover + buffer = memoryview text_type = str binary_type = bytes integer_types = int, From a2f667f5ea30deeb85b8120d88d57ddbfc1fc013 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:53 -0500 Subject: [PATCH 02/64] Add `to_buffer` function Provides a compatibility function for converting an object into a buffer or memoryview. Makes sure the Python 2 and Python 3 buffers behave roughly the same. Use the `buffer` type on Python 2 as more things support it and it can easily be coerced to a `memoryview`. On Python 3 use a `memoryview` as there is no `buffer` type and `memoryview`s work well on Python 3. Ensure the Python 3 `memoryview` is flattened and of `unsigned char` type, which is inline with the `memoryview` coerced Python 2 buffer. More importantly we generally want to work with flattened byte types anyways, so this provides us a consistent path to getting there while still providing a view onto the underlying data (without copying on either Python). 
--- numcodecs/compat.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 8fae9d4a..0df4038e 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -29,6 +29,21 @@ from functools import reduce +def to_buffer(v): + """Obtain a `buffer` or `memoryview` for `v`.""" + b = v + if isinstance(b, np.ndarray): + b = b.reshape(-1, order='A') + if b.dtype.kind != 'O': + b = b.view(np.uint8) + + b = buffer(b) + if not PY2: # pragma: py3 no cover + b = b.cast('B') + + return b + + def buffer_tobytes(v): """Obtain a sequence of bytes for the memory buffer used by `v`.""" if isinstance(v, binary_type): From 290581ade2255036623cf1922c68a7cb43d7165c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:53 -0500 Subject: [PATCH 03/64] Use `to_buffer` to implement `buffer_tobytes` Effectively `to_buffer` is a refactoring of the functionality in `buffer_tobytes`. Namely it handles taking a view of the underlying data and ensuring that view is flattened, contiguous, and cast to `unsigned char`. Though it doesn't actually perform the copy yet. Hence `buffer_tobytes` leverages `to_buffer` to ensure the data is ready to copy. All that is needed is coercion to a `memoryview`. Once coerced the `tobytes` method can be called to return the `bytes` needed for the encoding or decoding operation. This causes a copy, but is unavoidable at this stage. 
--- numcodecs/compat.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 0df4038e..ed09eae1 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -46,14 +46,7 @@ def to_buffer(v): def buffer_tobytes(v): """Obtain a sequence of bytes for the memory buffer used by `v`.""" - if isinstance(v, binary_type): - return v - elif isinstance(v, np.ndarray): - return v.tobytes(order='A') - elif PY2 and isinstance(v, array.array): # pragma: py3 no cover - return v.tostring() - else: - return memoryview(v).tobytes() + return memoryview(to_buffer(v)).tobytes() def buffer_copy(buf, out=None): From aa25a3fc25f2dc1b3cafaac3f84fbf750cc9ae16 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:54 -0500 Subject: [PATCH 04/64] Use `to_buffer` in `BZ2` to avoid a copy Make use of `to_buffer` in BZ2's `encode` function to avoid a copy to `bytes` due to non-contiguous data. This ensures the view of the data is a flattened one. Also removes the need for functions like `handle_datetime`, which this drops. 
--- numcodecs/bz2.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/numcodecs/bz2.py b/numcodecs/bz2.py index a74ba3d4..ed08dfb5 100644 --- a/numcodecs/bz2.py +++ b/numcodecs/bz2.py @@ -1,14 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division import bz2 as _bz2 -import array import numpy as np from numcodecs.abc import Codec -from numcodecs.compat import buffer_copy, handle_datetime +from numcodecs.compat import buffer_copy, to_buffer class BZ2(Codec): @@ -28,18 +27,13 @@ def __init__(self, level=1): def encode(self, buf): - # deal with lack of buffer support for datetime64 and timedelta64 - buf = handle_datetime(buf) - if isinstance(buf, np.ndarray): # cannot compress object array if buf.dtype == object: raise ValueError('cannot encode object array') - # if numpy array, can only handle C contiguous directly - if not buf.flags.c_contiguous: - buf = buf.tobytes(order='A') + buf = to_buffer(buf) # do compression return _bz2.compress(buf, self.level) @@ -47,10 +41,7 @@ def encode(self, buf): # noinspection PyMethodMayBeStatic def decode(self, buf, out=None): - # BZ2 cannot handle ndarray directly at all, coerce everything to - # memoryview - if not isinstance(buf, array.array): - buf = memoryview(buf) + buf = to_buffer(buf) # do decompression dec = _bz2.decompress(buf) From f17767de23e747d58a28a5d20845792a45053398 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:54 -0500 Subject: [PATCH 05/64] Use `to_buffer` in `GZip` encoding and decoding Ensures that we are working with a consistent view onto the underlying data with both Python 2 and Python 3. The `buffer` or `memoryview` returned on Python 2 or Python 3 respectively can be comfortably used with `BytesIO` and `GzipFile`'s `write` with the minimal necessary overhead. Also removes the need for functions like `handle_datetime`, as `to_buffer` already ensures an unsigned character view of the data. 
--- numcodecs/gzip.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/numcodecs/gzip.py b/numcodecs/gzip.py index 802d9fcd..df985f44 100644 --- a/numcodecs/gzip.py +++ b/numcodecs/gzip.py @@ -8,7 +8,7 @@ from .abc import Codec -from .compat import buffer_copy, handle_datetime, buffer_tobytes, PY2 +from .compat import buffer_copy, to_buffer class GZip(Codec): @@ -28,22 +28,13 @@ def __init__(self, level=1): def encode(self, buf): - # deal with lack of buffer support for datetime64 and timedelta64 - buf = handle_datetime(buf) - if isinstance(buf, np.ndarray): # cannot compress object array if buf.dtype == object: raise ValueError('cannot encode object array') - # if numpy array, can only handle C contiguous directly - if not buf.flags.c_contiguous: - buf = buf.tobytes(order='A') - - if PY2: # pragma: py3 no cover - # ensure bytes, PY2 cannot handle things like bytearray - buf = buffer_tobytes(buf) + buf = to_buffer(buf) # do compression compressed = io.BytesIO() @@ -58,9 +49,7 @@ def encode(self, buf): # noinspection PyMethodMayBeStatic def decode(self, buf, out=None): - if PY2: # pragma: py3 no cover - # ensure bytes, PY2 cannot handle things like bytearray - buf = buffer_tobytes(buf) + buf = to_buffer(buf) # do decompression buf = io.BytesIO(buf) From 987087f160a6370180d62edeb226ed130143d53e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:55 -0500 Subject: [PATCH 06/64] Use `to_buffer` in `LZMA` to avoid a copy Make use of `to_buffer` in LZMA's `encode` function to avoid a copy to `bytes` due to non-contiguous data. This ensures the view of the data is a flattened one. Also drop the need for `handle_datetime` as `to_buffer` already takes an unsigned character view of the data. 
--- numcodecs/lzma.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/numcodecs/lzma.py b/numcodecs/lzma.py index 5f7521b6..9de6179b 100644 --- a/numcodecs/lzma.py +++ b/numcodecs/lzma.py @@ -16,7 +16,7 @@ import numpy as np from .abc import Codec - from .compat import buffer_copy, handle_datetime + from .compat import buffer_copy, to_buffer # noinspection PyShadowingBuiltins class LZMA(Codec): @@ -48,18 +48,13 @@ def __init__(self, format=1, check=-1, preset=None, filters=None): def encode(self, buf): - # deal with lack of buffer support for datetime64 and timedelta64 - buf = handle_datetime(buf) - if isinstance(buf, np.ndarray): # cannot compress object array if buf.dtype == object: raise ValueError('cannot encode object array') - # if numpy array, can only handle C contiguous directly - if not buf.flags.c_contiguous: - buf = buf.tobytes(order='A') + buf = to_buffer(buf) # do compression return _lzma.compress(buf, format=self.format, check=self.check, @@ -67,6 +62,8 @@ def encode(self, buf): def decode(self, buf, out=None): + buf = to_buffer(buf) + # do decompression dec = _lzma.decompress(buf, format=self.format, filters=self.filters) From c2841afc144f92e2853afc480dd031b5c058d95c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:56 -0500 Subject: [PATCH 07/64] Use `to_buffer` in `Zlib` encoding and decoding Ensures that we are working with a consistent view onto the underlying data with both Python 2 and Python 3. The `buffer` or `memoryview` returned on Python 2 or Python 3 respectively can be comfortably used with `zlib`'s `compress` and `decompress` functions with the minimal necessary overhead. Also drops the need for `handle_datetime` as `to_buffer` already takes an unsigned character view of the data. 
--- numcodecs/zlib.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/numcodecs/zlib.py b/numcodecs/zlib.py index 0ab68d98..fda8af8a 100644 --- a/numcodecs/zlib.py +++ b/numcodecs/zlib.py @@ -7,7 +7,7 @@ from .abc import Codec -from .compat import buffer_copy, handle_datetime, buffer_tobytes, PY2 +from .compat import buffer_copy, to_buffer class Zlib(Codec): @@ -27,22 +27,13 @@ def __init__(self, level=1): def encode(self, buf): - # deal with lack of buffer support for datetime64 and timedelta64 - buf = handle_datetime(buf) - if isinstance(buf, np.ndarray): # cannot compress object array if buf.dtype == object: raise ValueError('cannot encode object array') - # if numpy array, can only handle C contiguous directly - if not buf.flags.c_contiguous: - buf = buf.tobytes(order='A') - - if PY2: # pragma: py3 no cover - # ensure bytes, PY2 cannot handle things like bytearray - buf = buffer_tobytes(buf) + buf = to_buffer(buf) # do compression return _zlib.compress(buf, self.level) @@ -50,9 +41,7 @@ def encode(self, buf): # noinspection PyMethodMayBeStatic def decode(self, buf, out=None): - if PY2: # pragma: py3 no cover - # ensure bytes, PY2 cannot handle things like bytearray - buf = buffer_tobytes(buf) + buf = to_buffer(buf) # do decompression dec = _zlib.decompress(buf) From d50b66444e8cdcef440938dd9c8189c1384e8827 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 02:25:57 -0500 Subject: [PATCH 08/64] Drop `handle_datetime` as `to_buffer` handles this The `to_buffer` function already tries to coerce `ndarray`'s into views of `np.uint8` data. So there is no need to handle this through this function any more. 
--- numcodecs/compat.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index ed09eae1..45772c42 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -108,9 +108,3 @@ def ensure_text(l, encoding='utf-8'): return l else: # pragma: py3 no cover return text_type(l, encoding=encoding) - - -def handle_datetime(buf): - if hasattr(buf, 'dtype') and buf.dtype.kind in 'Mm': - return buf.view('u8') - return buf From 9b51e4fc1d2c056f7be6ab9e4f0beaa67182f15b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 14:15:24 -0500 Subject: [PATCH 09/64] Only cast datetime/timedeltas in `to_buffer` Limits explicit casting to `unsigned char` only for datetime and timedeltas. Preserve types of underlying data in all other cases. --- numcodecs/compat.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 45772c42..f57716c1 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -34,12 +34,14 @@ def to_buffer(v): b = v if isinstance(b, np.ndarray): b = b.reshape(-1, order='A') - if b.dtype.kind != 'O': + if b.dtype.kind in 'Mm': b = b.view(np.uint8) - - b = buffer(b) - if not PY2: # pragma: py3 no cover - b = b.cast('B') + b = buffer(b) + elif not PY2: # pragma: py3 no cover + b = buffer(b) + b = b.cast('B').cast(b.format) + else: + b = buffer(b) return b From f585c6348e9fe5d5b716670e03011e7dfc79c919 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 14:53:12 -0500 Subject: [PATCH 10/64] Flip Python 2/3 cases in `to_buffer`, fix pragmas --- numcodecs/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index f57716c1..00201201 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -37,11 +37,11 @@ def to_buffer(v): if b.dtype.kind in 'Mm': b = b.view(np.uint8) b = buffer(b) - elif not PY2: # pragma: py3 no cover + elif PY2: # pragma: py3 no cover 
b = buffer(b) - b = b.cast('B').cast(b.format) - else: + else: # pragma: py2 no cover b = buffer(b) + b = b.cast('B').cast(b.format) return b From bf5638eab5c181ffc0e11ce5039d47b762d8b9eb Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 15:55:29 -0500 Subject: [PATCH 11/64] Raise in `to_buffer` if `object` array is provided As encoding `object` arrays to `bytes` does not really make sense (one gets address of pointers on the stack), raise if any data provided is of `object` type. This can be done easily with an `ndarray`. In other cases, it can be checked through a `memoryview` of the data. Though there are some edge cases on Python 2, which need special handling. --- numcodecs/compat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 00201201..c96f8467 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -33,14 +33,20 @@ def to_buffer(v): """Obtain a `buffer` or `memoryview` for `v`.""" b = v if isinstance(b, np.ndarray): + if b.dtype.kind is 'O': + raise ValueError('cannot encode object array') b = b.reshape(-1, order='A') if b.dtype.kind in 'Mm': b = b.view(np.uint8) b = buffer(b) elif PY2: # pragma: py3 no cover + if not isinstance(b, array.array) and memoryview(b).format is 'O': + raise ValueError('cannot encode object array') b = buffer(b) else: # pragma: py2 no cover b = buffer(b) + if b.format is 'O': + raise ValueError('cannot encode object array') b = b.cast('B').cast(b.format) return b From 442583366a7035ee006b81aa311c39a74c4fa53f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 16:01:10 -0500 Subject: [PATCH 12/64] Use `buffer_tobytes` on non-`object` types in test When running the `check_backwards_compatibility` test, make sure not to call `buffer_tobytes` on `object` arrays from fixture data. Casting these to bytes doesn't really make sense as it will return representation of pointers in the underlying buffer. 
Not only are the pointers things we do not want to be comparing, but it is also internal spec to NumPy and Python that really isn't reliable either. So make sure to pass through the `object` arrays as is. --- numcodecs/tests/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numcodecs/tests/common.py b/numcodecs/tests/common.py index 0e61e418..2cd90cc3 100644 --- a/numcodecs/tests/common.py +++ b/numcodecs/tests/common.py @@ -187,7 +187,8 @@ def check_backwards_compatibility(codec_id, arrays, codecs, precision=None, pref # setup i = int(arr_fn.split('.')[-2]) arr = np.load(arr_fn) - arr_bytes = buffer_tobytes(arr) + if arr.dtype.kind is not 'O': + arr_bytes = buffer_tobytes(arr) if arr.flags.f_contiguous: order = 'F' else: From d03f4c66220863ea394cbfab6183aba3540c1193 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 17:03:39 -0500 Subject: [PATCH 13/64] Check `buffer_tobytes` raises on `object` arrays Includes a simple check involving an `object` array and a `memoryview` of the `object` array to ensure that these cases are being caught and raised by `buffer_tobytes`. 
--- numcodecs/tests/test_compat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index f4935401..3c664173 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division import array +import pytest import numpy as np @@ -9,6 +10,14 @@ from numcodecs.compat import buffer_tobytes +def test_buffer_tobytes_raises(): + a = np.array([u'Xin chào thế giới'], dtype=object) + with pytest.raises(ValueError): + buffer_tobytes(a) + with pytest.raises(ValueError): + buffer_tobytes(memoryview(a)) + + def test_buffer_tobytes(): bufs = [ b'adsdasdas', From c2ae1c41d44bc293fd6145f2bf4df4bfc72444ca Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 17:36:18 -0500 Subject: [PATCH 14/64] Drop object checks from BZ2, GZip, LZMA, and Zlib As all of these functions coerce their buffers with `to_buffer` and `to_buffer` now checks for `object` type and raises, there is no need for an additional check before encoding data in these codecs. So go ahead and drop this code. 
--- numcodecs/bz2.py | 9 --------- numcodecs/gzip.py | 9 --------- numcodecs/lzma.py | 7 ------- numcodecs/zlib.py | 9 --------- 4 files changed, 34 deletions(-) diff --git a/numcodecs/bz2.py b/numcodecs/bz2.py index ed08dfb5..d59261eb 100644 --- a/numcodecs/bz2.py +++ b/numcodecs/bz2.py @@ -3,9 +3,6 @@ import bz2 as _bz2 -import numpy as np - - from numcodecs.abc import Codec from numcodecs.compat import buffer_copy, to_buffer @@ -27,12 +24,6 @@ def __init__(self, level=1): def encode(self, buf): - if isinstance(buf, np.ndarray): - - # cannot compress object array - if buf.dtype == object: - raise ValueError('cannot encode object array') - buf = to_buffer(buf) # do compression diff --git a/numcodecs/gzip.py b/numcodecs/gzip.py index df985f44..1f4c1138 100644 --- a/numcodecs/gzip.py +++ b/numcodecs/gzip.py @@ -4,9 +4,6 @@ import io -import numpy as np - - from .abc import Codec from .compat import buffer_copy, to_buffer @@ -28,12 +25,6 @@ def __init__(self, level=1): def encode(self, buf): - if isinstance(buf, np.ndarray): - - # cannot compress object array - if buf.dtype == object: - raise ValueError('cannot encode object array') - buf = to_buffer(buf) # do compression diff --git a/numcodecs/lzma.py b/numcodecs/lzma.py index 9de6179b..20521593 100644 --- a/numcodecs/lzma.py +++ b/numcodecs/lzma.py @@ -14,7 +14,6 @@ if _lzma: - import numpy as np from .abc import Codec from .compat import buffer_copy, to_buffer @@ -48,12 +47,6 @@ def __init__(self, format=1, check=-1, preset=None, filters=None): def encode(self, buf): - if isinstance(buf, np.ndarray): - - # cannot compress object array - if buf.dtype == object: - raise ValueError('cannot encode object array') - buf = to_buffer(buf) # do compression diff --git a/numcodecs/zlib.py b/numcodecs/zlib.py index fda8af8a..e56ee3a2 100644 --- a/numcodecs/zlib.py +++ b/numcodecs/zlib.py @@ -3,9 +3,6 @@ import zlib as _zlib -import numpy as np - - from .abc import Codec from .compat import buffer_copy, to_buffer @@ -27,12 
+24,6 @@ def __init__(self, level=1): def encode(self, buf): - if isinstance(buf, np.ndarray): - - # cannot compress object array - if buf.dtype == object: - raise ValueError('cannot encode object array') - buf = to_buffer(buf) # do compression From c7b3f35eadedc60bfbd622b0a76a77806b2baa8c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 17:48:03 -0500 Subject: [PATCH 15/64] Cast datetime/timedelta arrays to uint64 This preserves the itemsize of the underlying type, which could be handy information for compression algorithms to take advantage of (if they are able to). --- numcodecs/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index c96f8467..01f6a5a8 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -37,7 +37,7 @@ def to_buffer(v): raise ValueError('cannot encode object array') b = b.reshape(-1, order='A') if b.dtype.kind in 'Mm': - b = b.view(np.uint8) + b = b.view(np.uint64) b = buffer(b) elif PY2: # pragma: py3 no cover if not isinstance(b, array.array) and memoryview(b).format is 'O': From 105210858272cc0defda0d593038de5dfaf02bb8 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 18:10:57 -0500 Subject: [PATCH 16/64] Revert "Use `buffer_tobytes` on non-`object` types in test" This reverts commit 442583366a7035ee006b81aa311c39a74c4fa53f. 
--- numcodecs/tests/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numcodecs/tests/common.py b/numcodecs/tests/common.py index 2cd90cc3..0e61e418 100644 --- a/numcodecs/tests/common.py +++ b/numcodecs/tests/common.py @@ -187,8 +187,7 @@ def check_backwards_compatibility(codec_id, arrays, codecs, precision=None, pref # setup i = int(arr_fn.split('.')[-2]) arr = np.load(arr_fn) - if arr.dtype.kind is not 'O': - arr_bytes = buffer_tobytes(arr) + arr_bytes = buffer_tobytes(arr) if arr.flags.f_contiguous: order = 'F' else: From 732fac7ab6afdd9f7dc55afca5cd4ba880fa8029 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 20:40:19 -0500 Subject: [PATCH 17/64] Simply catch all non-`memoryview` coercible types When attempting to coerce an object to a `memoryview` to determine the buffer's type on Python 2, just use a `try...except` to ignore all cases that fail. These are generally things like `array` or `mmap`, which cannot hold object types anyways. So they can be presumed safe and ignored. We are only interested in checking for things like `memoryviews` of other things that already contain `object`s or some third-party `ndarray`-like class that holds `object`s. If any of these show up, throw a `ValueError` per usual. 
--- numcodecs/compat.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 01f6a5a8..4be0b7af 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -40,8 +40,11 @@ def to_buffer(v): b = b.view(np.uint64) b = buffer(b) elif PY2: # pragma: py3 no cover - if not isinstance(b, array.array) and memoryview(b).format is 'O': - raise ValueError('cannot encode object array') + try: + if memoryview(b).format is 'O': + raise ValueError('cannot encode object array') + except TypeError: + pass b = buffer(b) else: # pragma: py2 no cover b = buffer(b) From 7368f07bdb93477f4fd082bb758c28c15b42a9a0 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 21:05:17 -0500 Subject: [PATCH 18/64] Test coercing `mmap`'s to `bytes` as well Simply use a small anonymous memory segment from `mmap` to test `buffer_tobytes`. This should be good for exercising a case similar to memory-mapping files that Zarr users may want. Note that CPython automatically cleans this up for us once all reference counts go to zero. 
ref: https://stackoverflow.com/a/50460828 --- numcodecs/tests/test_compat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 3c664173..93d2b782 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division import array +import mmap import pytest @@ -23,7 +24,8 @@ def test_buffer_tobytes(): b'adsdasdas', bytes(20), np.arange(100), - array.array('l', b'qwertyuiqwertyui') + array.array('l', b'qwertyuiqwertyui'), + mmap.mmap(-1, 10) ] for buf in bufs: b = buffer_tobytes(buf) From 923d3541789764485f67d0947dcddd3dc9435b97 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 21:10:39 -0500 Subject: [PATCH 19/64] Tests `to_buffer` alongside `buffer_tobytes` Adds some direct tests of `to_buffer` to make sure it is providing us the expected buffer type for the different pieces of data we give it. Performs these tests alongside the existing `buffer_tobytes` tests. 
--- numcodecs/tests/test_compat.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 93d2b782..419ad40a 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -8,18 +8,18 @@ import numpy as np -from numcodecs.compat import buffer_tobytes +from numcodecs.compat import buffer, to_buffer, buffer_tobytes -def test_buffer_tobytes_raises(): +def test_buffer_coercion_raises(): a = np.array([u'Xin chào thế giới'], dtype=object) - with pytest.raises(ValueError): - buffer_tobytes(a) - with pytest.raises(ValueError): - buffer_tobytes(memoryview(a)) + for e in [a, memoryview(a)]: + for f in [to_buffer, buffer_tobytes]: + with pytest.raises(ValueError): + f(e) -def test_buffer_tobytes(): +def test_buffer_coercion(): bufs = [ b'adsdasdas', bytes(20), @@ -28,5 +28,7 @@ def test_buffer_tobytes(): mmap.mmap(-1, 10) ] for buf in bufs: - b = buffer_tobytes(buf) - assert isinstance(b, bytes) + b1 = to_buffer(buf) + assert isinstance(b1, buffer) + b2 = buffer_tobytes(buf) + assert isinstance(b2, bytes) From 1477dffdbf163be11a7ca058a9a68279f7eb290d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 9 Nov 2018 21:34:22 -0500 Subject: [PATCH 20/64] Breakup array handling, object check, & Python 2/3 To simplify the code a bit, split it up into chunks. First handle the flattening of arrays and coercion of datetime/timedelta types. Second check to see if the data is actually `object` type and raise if it is. Third handle Python 2/3 differences of `buffer`s and `memoryview`s respectively. This cuts down the code and makes the remaining code easier to read and follow. Also should make it less prone to edge case bugs. 
--- numcodecs/compat.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 4be0b7af..7acc60e9 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -32,24 +32,20 @@ def to_buffer(v): """Obtain a `buffer` or `memoryview` for `v`.""" b = v + if isinstance(b, np.ndarray): - if b.dtype.kind is 'O': - raise ValueError('cannot encode object array') b = b.reshape(-1, order='A') if b.dtype.kind in 'Mm': b = b.view(np.uint64) - b = buffer(b) - elif PY2: # pragma: py3 no cover - try: - if memoryview(b).format is 'O': - raise ValueError('cannot encode object array') - except TypeError: - pass - b = buffer(b) - else: # pragma: py2 no cover - b = buffer(b) - if b.format is 'O': + + try: + if memoryview(b).format is 'O': raise ValueError('cannot encode object array') + except TypeError: # pragma: py3 no cover + pass + + b = buffer(b) + if not PY2 and not isinstance(v, np.ndarray): # pragma: py2 no cover b = b.cast('B').cast(b.format) return b From 7fbb4d6a5a41c5b57f93fcc03245c7494ea65de7 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 00:43:40 -0500 Subject: [PATCH 21/64] Use `try...except...else` with `memoryview` Only coerce the buffer to a `memoryview` in the `try` portion of the block. Handle the case where it contains `object`s in the `else`. Avoids suppressing unrelated exceptions unintentionally. 
--- numcodecs/compat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 7acc60e9..0864e649 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -39,10 +39,12 @@ def to_buffer(v): b = b.view(np.uint64) try: - if memoryview(b).format is 'O': - raise ValueError('cannot encode object array') + m = memoryview(b) except TypeError: # pragma: py3 no cover pass + else: + if m.format is 'O': + raise ValueError('cannot encode object array') b = buffer(b) if not PY2 and not isinstance(v, np.ndarray): # pragma: py2 no cover From ddfbaee8a41c97dda38eb973558ea487a37ff904 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 00:58:36 -0500 Subject: [PATCH 22/64] Reshape all `memoryview`s using NumPy Instead of simply reshaping NumPy arrays, take anything that can be coerced into a `memoryview` and view it as a NumPy array so as to reshape it. This way all `memoryview`s (including NumPy arrays) can be consistently reshaped. Not to mention this works on Python 2 and Python 3. By doing this, we are able to drop usage of `memoryview`'s `cast` method. This was restricted to Python 3. So provided no options for handling equivalent cases in Python 2. Also the `cast` method was a bit convoluted for just reshaping. Not to mention it was unclear that this would enforce C or Fortran ordering (though it did seem to raise if those were not achievable). Of course this doesn't work for Python 2 `buffer` objects, but we weren't doing anything special for them anyways. Besides the few cases where Python 2 cannot construct a `memoryview` are for things like `array` and `mmap`, which are contiguous and flat anyways. Given how the old `buffer` object could handle chunks of non-contiguous memory, reshaping or forcing contiguous memory seems unnecessary in that case. 
--- numcodecs/compat.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 0864e649..66ffa177 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -34,7 +34,6 @@ def to_buffer(v): b = v if isinstance(b, np.ndarray): - b = b.reshape(-1, order='A') if b.dtype.kind in 'Mm': b = b.view(np.uint64) @@ -46,9 +45,9 @@ def to_buffer(v): if m.format is 'O': raise ValueError('cannot encode object array') + b = np.array(m, copy=False).reshape(-1, order='A') + b = buffer(b) - if not PY2 and not isinstance(v, np.ndarray): # pragma: py2 no cover - b = b.cast('B').cast(b.format) return b From 8b8d93efd1a9ac926623903925a5eddf4c974624 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 01:14:53 -0500 Subject: [PATCH 23/64] Use `b` instead of `m` --- numcodecs/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 66ffa177..46d7a72f 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -38,14 +38,14 @@ def to_buffer(v): b = b.view(np.uint64) try: - m = memoryview(b) + b = memoryview(b) except TypeError: # pragma: py3 no cover pass else: - if m.format is 'O': + if b.format is 'O': raise ValueError('cannot encode object array') - b = np.array(m, copy=False).reshape(-1, order='A') + b = np.array(b, copy=False).reshape(-1, order='A') b = buffer(b) From 27b6ce396ee6c5a7635b4f7760aa6deb89b4c051 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 02:23:23 -0500 Subject: [PATCH 24/64] Move reshape into `else` --- numcodecs/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 46d7a72f..144ee6c7 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -44,8 +44,8 @@ def to_buffer(v): else: if b.format is 'O': raise ValueError('cannot encode object array') - - b = np.array(b, copy=False).reshape(-1, order='A') + else: + b = 
np.array(b, copy=False).reshape(-1, order='A') b = buffer(b) From 43ba708a7ef3f62a46b67c90374df3f1625ed1d3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 02:24:13 -0500 Subject: [PATCH 25/64] Swap `object` array cases --- numcodecs/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 144ee6c7..27a268e1 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -42,10 +42,10 @@ def to_buffer(v): except TypeError: # pragma: py3 no cover pass else: - if b.format is 'O': - raise ValueError('cannot encode object array') - else: + if b.format is not 'O': b = np.array(b, copy=False).reshape(-1, order='A') + else: + raise ValueError('cannot encode object array') b = buffer(b) From 33bb8a3a610ca86da97244ede218df39a84703eb Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 02:33:08 -0500 Subject: [PATCH 26/64] Mark `to_buffer`'s NumPy array as readonly As the point of using `to_buffer` is to coerce external data into a buffer that matches our constraints and can be used efficiently to feed data into encoders/decoders, the buffer should not ever need to be writable (only readable). Given this, mark the flattened `ndarray` of our data as readonly. By doing this, we ensure that `memoryview`s constructed from this `ndarray` will also be `readonly`. Thus protecting the user's data from accidentally being overwritten. 
--- numcodecs/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 27a268e1..c9f784ea 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -44,6 +44,7 @@ def to_buffer(v): else: if b.format is not 'O': b = np.array(b, copy=False).reshape(-1, order='A') + b.setflags(write=False) else: raise ValueError('cannot encode object array') From f934c851462f57ff5e5299327038c2bd076b03b0 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 10 Nov 2018 02:46:35 -0500 Subject: [PATCH 27/64] Test `to_buffer` returns readonly buffers Includes a simple test of passing in writable array data to `to_buffer` and checks that the buffer returned has its `readonly` attribute enabled. On Python 2 this is always true as `buffer` creates readonly buffers only, but it is not easy to inspect. So we coerce the `buffer` object to a `memoryview`, which has the `readonly` attribute. On Python 3, this extra coercion is cheap since we have a `memoryview` to begin with. So it should just copy the existing `memoryview` on Python 3 (though not the underlying data). --- numcodecs/tests/test_compat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 419ad40a..bc1e4fab 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -19,6 +19,16 @@ def test_buffer_coercion_raises(): f(e) +def test_buffer_readonly(): + a = np.arange(100) + a.setflags(write=True) + + b = to_buffer(a) + m = memoryview(b) + + assert m.readonly + + def test_buffer_coercion(): bufs = [ b'adsdasdas', From ce5bce137bc8a65837ca7538a5565f57febec52d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 14:14:03 -0500 Subject: [PATCH 28/64] Coerce `memoryview`s and `buffer`s to `ndarray`s Instead of returning a `memoryview` or `buffer` from `to_buffer`, return something coercible to either type. 
Namely, return an `ndarray` that views, but does not copy, the underlying data. That way compression algorithms can easily coerce the data into a `memoryview`, `buffer`, or `bytes` depending on which is appropriate and works best for the data. --- numcodecs/compat.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index c9f784ea..db7e1015 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -40,15 +40,15 @@ def to_buffer(v): try: b = memoryview(b) except TypeError: # pragma: py3 no cover - pass - else: - if b.format is not 'O': - b = np.array(b, copy=False).reshape(-1, order='A') - b.setflags(write=False) - else: - raise ValueError('cannot encode object array') + b = buffer(b) + + b = np.array(b, copy=False) + + if b.dtype.kind is 'O': + raise ValueError('cannot encode object array') - b = buffer(b) + b = b.reshape(-1, order='A') + b.setflags(write=False) return b From 32721f61e0c3f6017cf376dbb58b46007dd1bc7f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 14:18:52 -0500 Subject: [PATCH 29/64] Use `np.getbuffer` instead of `buffer` Make use of `np.getbuffer` to coerce data into a `buffer` type in `to_buffer`. This ensures that if the data supports writing, the `buffer` returned will also support writing, unlike the builtin `buffer`, which always returns a read-only `buffer`. 
--- numcodecs/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index db7e1015..95cde7c3 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -40,7 +40,7 @@ def to_buffer(v): try: b = memoryview(b) except TypeError: # pragma: py3 no cover - b = buffer(b) + b = np.getbuffer(b) b = np.array(b, copy=False) From 30b63d2953a9df8530603e3b21207008d73cd2cf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 14:53:00 -0500 Subject: [PATCH 30/64] Fix-up some codecs to handle `ndarray` As `to_buffer` now returns an `ndarray`, update a few codecs that need to cast to something more explicit. --- numcodecs/bz2.py | 2 +- numcodecs/gzip.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/bz2.py b/numcodecs/bz2.py index d59261eb..15500082 100644 --- a/numcodecs/bz2.py +++ b/numcodecs/bz2.py @@ -32,7 +32,7 @@ def encode(self, buf): # noinspection PyMethodMayBeStatic def decode(self, buf, out=None): - buf = to_buffer(buf) + buf = memoryview(to_buffer(buf)) # do decompression dec = _bz2.decompress(buf) diff --git a/numcodecs/gzip.py b/numcodecs/gzip.py index 1f4c1138..98efa8f6 100644 --- a/numcodecs/gzip.py +++ b/numcodecs/gzip.py @@ -25,7 +25,7 @@ def __init__(self, level=1): def encode(self, buf): - buf = to_buffer(buf) + buf = memoryview(to_buffer(buf)) # do compression compressed = io.BytesIO() From 98a7860631ac2ce6d0ad6c16b0e6cad5703dd27b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 14:57:26 -0500 Subject: [PATCH 31/64] Check that `to_buffer` returns an `ndarray` Also ensure that it is coercible to a `buffer` or `memoryview` as needed. 
--- numcodecs/tests/test_compat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index bc1e4fab..06f0e98b 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -39,6 +39,8 @@ def test_buffer_coercion(): ] for buf in bufs: b1 = to_buffer(buf) - assert isinstance(b1, buffer) + buffer(b1) + memoryview(b1) + assert isinstance(b1, np.ndarray) b2 = buffer_tobytes(buf) assert isinstance(b2, bytes) From c046a72c49b67feab32465d5a7d3d01ceb48fc05 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 15:05:18 -0500 Subject: [PATCH 32/64] Ensure `array` has the correct type on Python 2 On Python 2, the `array` type cannot be cast to a `memoryview`. Though it can be cast to a `buffer`. However the old `buffer` interface does not keep track of the type of data it has and merely represents it as `unsigned char`. This behavior is reproduced by `ndarray`s viewing the `buffer` as well. To fix this, we cast the `ndarray` back to the type specified by the `array`. --- numcodecs/compat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 95cde7c3..087ebb72 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -43,6 +43,8 @@ def to_buffer(v): b = np.getbuffer(b) b = np.array(b, copy=False) + if isinstance(v, array.array): + b = b.view(v.typecode) if b.dtype.kind is 'O': raise ValueError('cannot encode object array') From 5fb505d34eff9451886901fc0c70c736195e607a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 15:13:32 -0500 Subject: [PATCH 33/64] State `arange` type in `test_buffer_coercion` Just add typing to `arange` for clarity as Windows and Unix builds of Python and NumPy often handle integers differently. 
--- numcodecs/tests/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 06f0e98b..5206aaba 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -33,7 +33,7 @@ def test_buffer_coercion(): bufs = [ b'adsdasdas', bytes(20), - np.arange(100), + np.arange(100, dtype=np.int64), array.array('l', b'qwertyuiqwertyui'), mmap.mmap(-1, 10) ] From efc7552d9646b4bccff7d299e0991fdfd69f8ecf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 15:18:32 -0500 Subject: [PATCH 34/64] Test that `to_buffer` maintains types --- numcodecs/tests/test_compat.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 5206aaba..09998db3 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -30,17 +30,18 @@ def test_buffer_readonly(): def test_buffer_coercion(): - bufs = [ - b'adsdasdas', - bytes(20), - np.arange(100, dtype=np.int64), - array.array('l', b'qwertyuiqwertyui'), - mmap.mmap(-1, 10) + typed_bufs = [ + ('B', b'adsdasdas'), + ('B', bytes(20)), + ('l', np.arange(100, dtype=np.int64)), + ('l', array.array('l', b'qwertyuiqwertyui')), + ('B', mmap.mmap(-1, 10)) ] - for buf in bufs: + for typ, buf in typed_bufs: b1 = to_buffer(buf) - buffer(b1) - memoryview(b1) assert isinstance(b1, np.ndarray) + buffer(b1) + b1mv = memoryview(b1) + assert b1mv.format is typ b2 = buffer_tobytes(buf) assert isinstance(b2, bytes) From 230da2117a9a22bc21ebc799f00448c8a6ce44b5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:04:10 -0500 Subject: [PATCH 35/64] Drop the cast to `memoryview` in `to_buffer` As `to_buffer` just returns an `ndarray` now, simply call `tobytes` on it directly. No need to coerce it to a `memoryview` first. 
--- numcodecs/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 087ebb72..423ad84c 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -57,7 +57,7 @@ def to_buffer(v): def buffer_tobytes(v): """Obtain a sequence of bytes for the memory buffer used by `v`.""" - return memoryview(to_buffer(v)).tobytes() + return to_buffer(v).tobytes() def buffer_copy(buf, out=None): From af0946d9e85dae9dd0d4f29ee21f5b1c368ff280 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:14:11 -0500 Subject: [PATCH 36/64] Use to_buffer directly instead of buffer_tobytes As `buffer_tobytes` is merely calling `to_buffer` and calling `tobytes`, codecs can do this work for themselves. --- numcodecs/json.py | 6 +++--- numcodecs/msgpacks.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/numcodecs/json.py b/numcodecs/json.py index 3d8d9f13..bb9a99ba 100644 --- a/numcodecs/json.py +++ b/numcodecs/json.py @@ -8,7 +8,7 @@ from .abc import Codec -from .compat import buffer_tobytes +from .compat import to_buffer class JSON(Codec): @@ -63,7 +63,7 @@ def encode(self, buf): return self._encoder.encode(items).encode(self._text_encoding) def decode(self, buf, out=None): - buf = buffer_tobytes(buf) + buf = to_buffer(buf).tobytes() items = self._decoder.decode(buf.decode(self._text_encoding)) dec = np.empty(items[-1], dtype=items[-2]) dec[:] = items[:-2] @@ -125,7 +125,7 @@ def encode(self, buf): return self._encoder.encode(items).encode(self._text_encoding) def decode(self, buf, out=None): - buf = buffer_tobytes(buf) + buf = to_buffer(buf).tobytes() items = self._decoder.decode(buf.decode(self._text_encoding)) dec = np.array(items[:-1], dtype=items[-1]) if out is not None: diff --git a/numcodecs/msgpacks.py b/numcodecs/msgpacks.py index bd195cbd..ff09ff07 100644 --- a/numcodecs/msgpacks.py +++ b/numcodecs/msgpacks.py @@ -7,7 +7,7 @@ from .abc import Codec -from .compat import 
buffer_tobytes +from .compat import to_buffer class MsgPack(Codec): @@ -50,7 +50,7 @@ def encode(self, buf): return msgpack.packb(items, encoding=self.encoding) def decode(self, buf, out=None): - buf = buffer_tobytes(buf) + buf = to_buffer(buf).tobytes() items = msgpack.unpackb(buf, encoding=self.encoding) dec = np.empty(items[-1], dtype=items[-2]) dec[:] = items[:-2] @@ -89,7 +89,7 @@ def encode(self, buf): return msgpack.packb(items, encoding=self.encoding) def decode(self, buf, out=None): - buf = buffer_tobytes(buf) + buf = to_buffer(buf).tobytes() items = msgpack.unpackb(buf, encoding=self.encoding) dec = np.array(items[:-1], dtype=items[-1]) if out is not None: From a0d299dbb6cb5112e942698dee4f2034f970c5fe Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:14:23 -0500 Subject: [PATCH 37/64] Drop `buffer_tobytes` As there is no real need for `buffer_tobytes` any more, go ahead and drop it. --- numcodecs/compat.py | 5 ----- numcodecs/tests/common.py | 6 +++--- numcodecs/tests/test_compat.py | 9 ++++----- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 423ad84c..5fe2fada 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -55,11 +55,6 @@ def to_buffer(v): return b -def buffer_tobytes(v): - """Obtain a sequence of bytes for the memory buffer used by `v`.""" - return to_buffer(v).tobytes() - - def buffer_copy(buf, out=None): """Copy the contents of the memory buffer from `buf` to `out`.""" diff --git a/numcodecs/tests/common.py b/numcodecs/tests/common.py index 6e6a20d1..1b7edacf 100644 --- a/numcodecs/tests/common.py +++ b/numcodecs/tests/common.py @@ -11,7 +11,7 @@ import pytest -from numcodecs.compat import buffer_tobytes, ndarray_from_buffer +from numcodecs.compat import to_buffer, ndarray_from_buffer from numcodecs.registry import get_codec # star import needed for repr tests so eval finds names from numcodecs import * # noqa @@ -86,7 +86,7 @@ def 
check_encode_decode(arr, codec, precision=None): # as well as array.array in PY2 # setup - enc_bytes = buffer_tobytes(enc) + enc_bytes = to_buffer(enc).tobytes() # test decoding of raw bytes dec = codec.decode(enc_bytes) @@ -232,7 +232,7 @@ def check_backwards_compatibility(codec_id, arrays, codecs, precision=None, pref assert_array_items_equal(arr, dec_arr) else: assert_array_equal(arr, dec_arr) - assert buffer_tobytes(arr) == buffer_tobytes(dec) + assert to_buffer(arr).tobytes() == to_buffer(dec).tobytes() def check_err_decode_object_buffer(compressor): diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 09998db3..482c6be2 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -8,15 +8,14 @@ import numpy as np -from numcodecs.compat import buffer, to_buffer, buffer_tobytes +from numcodecs.compat import buffer, to_buffer def test_buffer_coercion_raises(): a = np.array([u'Xin chào thế giới'], dtype=object) for e in [a, memoryview(a)]: - for f in [to_buffer, buffer_tobytes]: - with pytest.raises(ValueError): - f(e) + with pytest.raises(ValueError): + to_buffer(e) def test_buffer_readonly(): @@ -43,5 +42,5 @@ def test_buffer_coercion(): buffer(b1) b1mv = memoryview(b1) assert b1mv.format is typ - b2 = buffer_tobytes(buf) + b2 = b1.tobytes() assert isinstance(b2, bytes) From dc58d209ea5a43293b5dcc168170e8868a543939 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:19:16 -0500 Subject: [PATCH 38/64] Check that the buffer and the ndarray share memory Uses NumPy's `shares_memory` function to check if the `buffer` from the input data and the `ndarray` constructed from it share memory. 
--- numcodecs/tests/test_compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 482c6be2..e8a928f5 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -39,6 +39,7 @@ def test_buffer_coercion(): for typ, buf in typed_bufs: b1 = to_buffer(buf) assert isinstance(b1, np.ndarray) + assert np.shares_memory(b1, buffer(buf)) buffer(b1) b1mv = memoryview(b1) assert b1mv.format is typ From d6bac3514b414ec97d96e2df6eee39519badb3c2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:28:35 -0500 Subject: [PATCH 39/64] Test type matches against `ndarray` --- numcodecs/tests/test_compat.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index e8a928f5..8f2dfc09 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -30,18 +30,18 @@ def test_buffer_readonly(): def test_buffer_coercion(): typed_bufs = [ - ('B', b'adsdasdas'), - ('B', bytes(20)), - ('l', np.arange(100, dtype=np.int64)), - ('l', array.array('l', b'qwertyuiqwertyui')), - ('B', mmap.mmap(-1, 10)) + ('u', b'adsdasdas'), + ('u', bytes(20)), + ('i', np.arange(100, dtype=np.int64)), + ('i', array.array('l', b'qwertyuiqwertyui')), + ('u', mmap.mmap(-1, 10)) ] for typ, buf in typed_bufs: b1 = to_buffer(buf) assert isinstance(b1, np.ndarray) + assert b1.dtype.kind is typ assert np.shares_memory(b1, buffer(buf)) buffer(b1) - b1mv = memoryview(b1) - assert b1mv.format is typ + memoryview(b1) b2 = b1.tobytes() assert isinstance(b2, bytes) From cd550a6189924d5f49ea3ed14ec56db0cf4581e4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:29:28 -0500 Subject: [PATCH 40/64] Drop tests of `ndarray` coercion These are already tested by NumPy. So we don't need to test them explicitly. 
--- numcodecs/tests/test_compat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 8f2dfc09..2aa7b0d0 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -41,7 +41,3 @@ def test_buffer_coercion(): assert isinstance(b1, np.ndarray) assert b1.dtype.kind is typ assert np.shares_memory(b1, buffer(buf)) - buffer(b1) - memoryview(b1) - b2 = b1.tobytes() - assert isinstance(b2, bytes) From ad90e70b14eecee3b36a7bb44e8cc235ef83a5dc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:34:54 -0500 Subject: [PATCH 41/64] Skip coercion of `ndarray`s As `ndarray`s already match what we want to get, skip coercing them. --- numcodecs/compat.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 5fe2fada..6f438ecd 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -36,15 +36,15 @@ def to_buffer(v): if isinstance(b, np.ndarray): if b.dtype.kind in 'Mm': b = b.view(np.uint64) - - try: - b = memoryview(b) - except TypeError: # pragma: py3 no cover - b = np.getbuffer(b) - - b = np.array(b, copy=False) - if isinstance(v, array.array): - b = b.view(v.typecode) + else: + try: + b = memoryview(b) + except TypeError: # pragma: py3 no cover + b = np.getbuffer(b) + + b = np.array(b, copy=False) + if isinstance(v, array.array): + b = b.view(v.typecode) if b.dtype.kind is 'O': raise ValueError('cannot encode object array') From ab159bf2e7a5e0af7bb01392348353a43792ddfd Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:38:07 -0500 Subject: [PATCH 42/64] Join type checks after coercing non-`ndarray`s --- numcodecs/compat.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 6f438ecd..4dd15a29 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -33,10 +33,7 @@ def to_buffer(v): """Obtain 
a `buffer` or `memoryview` for `v`.""" b = v - if isinstance(b, np.ndarray): - if b.dtype.kind in 'Mm': - b = b.view(np.uint64) - else: + if not isinstance(b, np.ndarray): try: b = memoryview(b) except TypeError: # pragma: py3 no cover @@ -48,6 +45,8 @@ def to_buffer(v): if b.dtype.kind is 'O': raise ValueError('cannot encode object array') + elif b.dtype.kind in 'Mm': + b = b.view(np.uint64) b = b.reshape(-1, order='A') b.setflags(write=False) From 5b780d3d3592dad193d8065a181ac2a1b2642888 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 16:42:21 -0500 Subject: [PATCH 43/64] Limit casting of `array` to Python 2 On Python 3, `array` is coercible to a `memoryview`. So it maintains its type information and the `ndarray` will reflect that. While casting the `ndarray` to the type it already is won't hurt, it isn't needed either. So restrict this to Python 2, where `array` can only be coerced to `buffer` and the type information must be restored. --- numcodecs/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 4dd15a29..b5d76f6e 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -40,7 +40,7 @@ def to_buffer(v): b = np.getbuffer(b) b = np.array(b, copy=False) - if isinstance(v, array.array): + if PY2 and isinstance(v, array.array): # pragma: py3 no cover b = b.view(v.typecode) if b.dtype.kind is 'O': From 18f2f33fa8a259ba5225de4157b2bbe07b398c01 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 17:13:53 -0500 Subject: [PATCH 44/64] Drop disabling writing in `to_buffer` As there may be use cases where a writable buffer could be useful, do not limit users of `to_buffer` to only getting a read-only buffer. Update our tests to ensure writable arrays remain writable and read-only arrays remain read-only. 
--- numcodecs/compat.py | 1 - numcodecs/tests/test_compat.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index b5d76f6e..8558fce6 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -49,7 +49,6 @@ def to_buffer(v): b = b.view(np.uint64) b = b.reshape(-1, order='A') - b.setflags(write=False) return b diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 2aa7b0d0..e595a358 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -18,14 +18,15 @@ def test_buffer_coercion_raises(): to_buffer(e) -def test_buffer_readonly(): - a = np.arange(100) - a.setflags(write=True) +def test_buffer_writable(): + for writable in [False, True]: + a = np.arange(100) + a.setflags(write=writable) - b = to_buffer(a) - m = memoryview(b) + b = to_buffer(a) + m = memoryview(b) - assert m.readonly + assert m.readonly != writable def test_buffer_coercion(): From 3aea13fc39bac81426313a6d71ccd44c55e42f07 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 18:43:34 -0500 Subject: [PATCH 45/64] Move array coercion to `ndarray_from_buffer` Pulls out the `ndarray` coercion logic from `to_buffer` and moves it into `ndarray_from_buffer`. As part of this work, soften the `dtype` requirement in `ndarray_from_buffer`. Then rewrite `to_buffer` to call `ndarray_from_buffer` internally to simplify the function and get better use of the `ndarray` coercion code. 
--- numcodecs/compat.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 8558fce6..fa589aaa 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -31,25 +31,14 @@ def to_buffer(v): """Obtain a `buffer` or `memoryview` for `v`.""" - b = v - if not isinstance(b, np.ndarray): - try: - b = memoryview(b) - except TypeError: # pragma: py3 no cover - b = np.getbuffer(b) - - b = np.array(b, copy=False) - if PY2 and isinstance(v, array.array): # pragma: py3 no cover - b = b.view(v.typecode) + b = ndarray_from_buffer(v) if b.dtype.kind is 'O': raise ValueError('cannot encode object array') elif b.dtype.kind in 'Mm': b = b.view(np.uint64) - b = b.reshape(-1, order='A') - return b @@ -99,11 +88,22 @@ def buffer_copy(buf, out=None): return out -def ndarray_from_buffer(buf, dtype): - if isinstance(buf, np.ndarray): - arr = buf.reshape(-1, order='A').view(dtype) - else: - arr = np.frombuffer(buf, dtype=dtype) +def ndarray_from_buffer(buf, dtype=None): + arr = buf + if not isinstance(arr, np.ndarray): + try: + arr = memoryview(arr) + except TypeError: # pragma: py3 no cover + arr = np.getbuffer(arr) + + arr = np.array(arr, copy=False) + if PY2 and isinstance(buf, array.array): # pragma: py3 no cover + arr = arr.view(buf.typecode) + + arr = arr.reshape(-1, order='A') + if dtype is not None: + arr = arr.view(dtype) + return arr From 398a3e95934ecd0fa935fb8ac00eb7e7ed6d54b2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 11 Nov 2018 19:06:39 -0500 Subject: [PATCH 46/64] Handle casting outside of `ndarray_from_buffer` Casting within `ndarray_from_buffer` complicates usage of this rather general function. Besides this can easily be done externally by calling `ndarray`'s `view` method. So rewrite all cases that relied on `ndarray_from_buffer` to handle casting using `view` on the result instead. 
--- numcodecs/astype.py | 4 ++-- numcodecs/categorize.py | 4 ++-- numcodecs/checksum32.py | 4 ++-- numcodecs/delta.py | 4 ++-- numcodecs/fixedscaleoffset.py | 4 ++-- numcodecs/packbits.py | 4 ++-- numcodecs/quantize.py | 4 ++-- numcodecs/tests/common.py | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/numcodecs/astype.py b/numcodecs/astype.py index 0d3a3160..50352f24 100644 --- a/numcodecs/astype.py +++ b/numcodecs/astype.py @@ -50,7 +50,7 @@ def __init__(self, encode_dtype, decode_dtype): def encode(self, buf): # view input data as 1D array - arr = ndarray_from_buffer(buf, self.decode_dtype) + arr = ndarray_from_buffer(buf).view(self.decode_dtype) # convert and copy enc = arr.astype(self.encode_dtype) @@ -60,7 +60,7 @@ def encode(self, buf): def decode(self, buf, out=None): # view encoded data as 1D array - enc = ndarray_from_buffer(buf, self.encode_dtype) + enc = ndarray_from_buffer(buf).view(self.encode_dtype) # convert and copy dec = enc.astype(self.decode_dtype) diff --git a/numcodecs/categorize.py b/numcodecs/categorize.py index 49a899e7..37573a10 100644 --- a/numcodecs/categorize.py +++ b/numcodecs/categorize.py @@ -53,7 +53,7 @@ def __init__(self, labels, dtype, astype='u1'): def encode(self, buf): # view input as ndarray - arr = ndarray_from_buffer(buf, self.dtype) + arr = ndarray_from_buffer(buf).view(self.dtype) # setup output array enc = np.zeros_like(arr, dtype=self.astype) @@ -67,7 +67,7 @@ def encode(self, buf): def decode(self, buf, out=None): # view encoded data as ndarray - enc = ndarray_from_buffer(buf, self.astype) + enc = ndarray_from_buffer(buf).view(self.astype) # setup output if isinstance(out, np.ndarray): diff --git a/numcodecs/checksum32.py b/numcodecs/checksum32.py index bd044621..8306b668 100644 --- a/numcodecs/checksum32.py +++ b/numcodecs/checksum32.py @@ -17,7 +17,7 @@ class Checksum32(Codec): def encode(self, buf): if isinstance(buf, np.ndarray) and buf.dtype == object: raise ValueError('cannot encode object 
array') - arr = ndarray_from_buffer(buf, dtype='u1') + arr = ndarray_from_buffer(buf).view('u1') checksum = self.checksum(arr) & 0xffffffff enc = np.empty(arr.nbytes + 4, dtype='u1') enc[:4].view(' Date: Sun, 11 Nov 2018 19:23:46 -0500 Subject: [PATCH 47/64] Drop `ndarray_from_buffer`'s `dtype` argument As the `dtype` argument of `ndarray_from_buffer` is now no longer used, drop this argument from the function and its use internally. It's trivial for users of this function to perform the needed casting. --- numcodecs/astype.py | 6 ++++-- numcodecs/categorize.py | 6 ++++-- numcodecs/checksum32.py | 6 ++++-- numcodecs/compat.py | 4 +--- numcodecs/delta.py | 6 ++++-- numcodecs/fixedscaleoffset.py | 6 ++++-- numcodecs/packbits.py | 6 ++++-- numcodecs/quantize.py | 6 ++++-- numcodecs/tests/common.py | 6 ++++-- 9 files changed, 33 insertions(+), 19 deletions(-) diff --git a/numcodecs/astype.py b/numcodecs/astype.py index 50352f24..715d4b44 100644 --- a/numcodecs/astype.py +++ b/numcodecs/astype.py @@ -50,7 +50,8 @@ def __init__(self, encode_dtype, decode_dtype): def encode(self, buf): # view input data as 1D array - arr = ndarray_from_buffer(buf).view(self.decode_dtype) + arr = ndarray_from_buffer(buf) + arr = arr.view(self.decode_dtype) # convert and copy enc = arr.astype(self.encode_dtype) @@ -60,7 +61,8 @@ def encode(self, buf): def decode(self, buf, out=None): # view encoded data as 1D array - enc = ndarray_from_buffer(buf).view(self.encode_dtype) + enc = ndarray_from_buffer(buf) + enc = enc.view(self.encode_dtype) # convert and copy dec = enc.astype(self.decode_dtype) diff --git a/numcodecs/categorize.py b/numcodecs/categorize.py index 37573a10..6e4de4b7 100644 --- a/numcodecs/categorize.py +++ b/numcodecs/categorize.py @@ -53,7 +53,8 @@ def __init__(self, labels, dtype, astype='u1'): def encode(self, buf): # view input as ndarray - arr = ndarray_from_buffer(buf).view(self.dtype) + arr = ndarray_from_buffer(buf) + arr = arr.view(self.dtype) # setup output array enc 
= np.zeros_like(arr, dtype=self.astype) @@ -67,7 +68,8 @@ def encode(self, buf): def decode(self, buf, out=None): # view encoded data as ndarray - enc = ndarray_from_buffer(buf).view(self.astype) + enc = ndarray_from_buffer(buf) + enc = enc.view(self.astype) # setup output if isinstance(out, np.ndarray): diff --git a/numcodecs/checksum32.py b/numcodecs/checksum32.py index 8306b668..cd7c114d 100644 --- a/numcodecs/checksum32.py +++ b/numcodecs/checksum32.py @@ -17,7 +17,8 @@ class Checksum32(Codec): def encode(self, buf): if isinstance(buf, np.ndarray) and buf.dtype == object: raise ValueError('cannot encode object array') - arr = ndarray_from_buffer(buf).view('u1') + arr = ndarray_from_buffer(buf) + arr = arr.view('u1') checksum = self.checksum(arr) & 0xffffffff enc = np.empty(arr.nbytes + 4, dtype='u1') enc[:4].view(' Date: Sun, 11 Nov 2018 19:23:47 -0500 Subject: [PATCH 48/64] Handle reshaping outside of `ndarray_from_buffer` Performing reshaping within `ndarray_from_buffer` makes this general function a little harder to use. Especially as the reshaping may cause an unwanted copy. Given this is a trivial thing for users to do externally, drop the reshape from the `ndarray_from_buffer` and rewrite usages of `ndarray_from_buffer` to handle the reshaping themselves. 
--- numcodecs/astype.py | 4 ++-- numcodecs/categorize.py | 4 ++-- numcodecs/checksum32.py | 4 ++-- numcodecs/compat.py | 4 ++-- numcodecs/delta.py | 4 ++-- numcodecs/fixedscaleoffset.py | 4 ++-- numcodecs/packbits.py | 4 ++-- numcodecs/quantize.py | 4 ++-- numcodecs/tests/common.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/numcodecs/astype.py b/numcodecs/astype.py index 715d4b44..f222860f 100644 --- a/numcodecs/astype.py +++ b/numcodecs/astype.py @@ -51,7 +51,7 @@ def encode(self, buf): # view input data as 1D array arr = ndarray_from_buffer(buf) - arr = arr.view(self.decode_dtype) + arr = arr.view(self.decode_dtype).reshape(-1, order='A') # convert and copy enc = arr.astype(self.encode_dtype) @@ -62,7 +62,7 @@ def decode(self, buf, out=None): # view encoded data as 1D array enc = ndarray_from_buffer(buf) - enc = enc.view(self.encode_dtype) + enc = enc.view(self.encode_dtype).reshape(-1, order='A') # convert and copy dec = enc.astype(self.decode_dtype) diff --git a/numcodecs/categorize.py b/numcodecs/categorize.py index 6e4de4b7..adfd28cd 100644 --- a/numcodecs/categorize.py +++ b/numcodecs/categorize.py @@ -54,7 +54,7 @@ def encode(self, buf): # view input as ndarray arr = ndarray_from_buffer(buf) - arr = arr.view(self.dtype) + arr = arr.view(self.dtype).reshape(-1, order='A') # setup output array enc = np.zeros_like(arr, dtype=self.astype) @@ -69,7 +69,7 @@ def decode(self, buf, out=None): # view encoded data as ndarray enc = ndarray_from_buffer(buf) - enc = enc.view(self.astype) + enc = enc.view(self.astype).reshape(-1, order='A') # setup output if isinstance(out, np.ndarray): diff --git a/numcodecs/checksum32.py b/numcodecs/checksum32.py index cd7c114d..574446c6 100644 --- a/numcodecs/checksum32.py +++ b/numcodecs/checksum32.py @@ -18,7 +18,7 @@ def encode(self, buf): if isinstance(buf, np.ndarray) and buf.dtype == object: raise ValueError('cannot encode object array') arr = ndarray_from_buffer(buf) - arr = arr.view('u1') + arr = 
arr.view('u1').reshape(-1, order='A') checksum = self.checksum(arr) & 0xffffffff enc = np.empty(arr.nbytes + 4, dtype='u1') enc[:4].view(' Date: Sun, 11 Nov 2018 19:35:02 -0500 Subject: [PATCH 49/64] Rewrite `buffer_copy` using `to_buffer` Use `to_buffer` to coerce both `buf` and `out` to `ndarray`s in `buffer_copy`. This simplifies the logic in `buffer_copy` making things easier to maintain and understand generally. --- numcodecs/compat.py | 52 ++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index d78d3cd8..6dafbe31 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -51,41 +51,25 @@ def buffer_copy(buf, out=None): # no-op return buf - # handle ndarray destination - if isinstance(out, np.ndarray): - - # view source as destination dtype - if isinstance(buf, np.ndarray): - buf = buf.view(dtype=out.dtype).reshape(-1, order='A') + # coerce to ndarrays + buf = ndarray_from_buffer(buf) + out = ndarray_from_buffer(out) + + # view source as destination dtype + if out.dtype.kind is not 'O': + buf = buf.view(out.dtype) + + # ensure shapes are compatible + buf = buf.reshape(-1, order='A') + if buf.shape != out.shape: + if out.flags.f_contiguous: + order = 'F' else: - buf = np.frombuffer(buf, dtype=out.dtype) - - # ensure shapes are compatible - if buf.shape != out.shape: - if out.flags.f_contiguous: - order = 'F' - else: - order = 'C' - buf = buf.reshape(out.shape, order=order) - - # copy via numpy - np.copyto(out, buf) - - # handle generic buffer destination - else: - - # obtain memoryview of destination - dest = memoryview(out) - - # ensure source is 1D - if isinstance(buf, np.ndarray): - buf = buf.reshape(-1, order='A') - # try to match itemsize - dtype = 'u%s' % dest.itemsize - buf = buf.view(dtype=dtype) - - # try to copy via memoryview - dest[:] = buf + order = 'C' + buf = buf.reshape(out.shape, order=order) + + # copy via numpy + np.copyto(out, buf) return 
out From 7a2b3b7e3db740bd84e27287cbd049ce29ba4fe4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 11:17:43 -0500 Subject: [PATCH 50/64] Raise for unicode `array` in `ndarray_from_buffer` Handling of unicode `array`s is generally a bit broken. NumPy doesn't have native support of the type. Trying to work around their issues in a cross-platform Python 2/3 friendly way is fairly difficult. Also they have been deprecated since Python 3.3. Plus these are likely an exceedingly rare thing to see in use. Given all of these issues, simply raise an error if users provide a unicode array stating they are not supported. If it turns out there are users desperately in need of this feature, we can always revisit and discuss with them. However we will really need their feedback as any implementation here is going to require feedback and likely someone dedicated to solving the problem. For now erroring out is the simplest path forward. ref: https://bugs.python.org/issue13072 ref: https://bugs.python.org/issue15625 --- numcodecs/compat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 6dafbe31..300d1d27 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -77,6 +77,9 @@ def buffer_copy(buf, out=None): def ndarray_from_buffer(buf): arr = buf if not isinstance(arr, np.ndarray): + if isinstance(arr, array.array) and arr.typecode == 'u': + raise ValueError("unicode arrays are not supported") + try: arr = memoryview(arr) except TypeError: # pragma: py3 no cover From 074bf31bce2e970aa056efba78d0cf015bae58ff Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 11:18:41 -0500 Subject: [PATCH 51/64] Check that unicode `array`'s raise an error Add a quick test to just verify that a unicode `array` will cause an error to be raised.
--- numcodecs/tests/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index e595a358..839b04b4 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -13,7 +13,7 @@ def test_buffer_coercion_raises(): a = np.array([u'Xin chào thế giới'], dtype=object) - for e in [a, memoryview(a)]: + for e in [a, memoryview(a), array.array('u', a[0])]: with pytest.raises(ValueError): to_buffer(e) From de0a9477375238599da9a0abdc07c2de14107588 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 11:26:49 -0500 Subject: [PATCH 52/64] Revert "Check that unicode `array`'s raise an error" This reverts commit 074bf31bce2e970aa056efba78d0cf015bae58ff. --- numcodecs/tests/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 839b04b4..e595a358 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -13,7 +13,7 @@ def test_buffer_coercion_raises(): a = np.array([u'Xin chào thế giới'], dtype=object) - for e in [a, memoryview(a), array.array('u', a[0])]: + for e in [a, memoryview(a)]: with pytest.raises(ValueError): to_buffer(e) From eacfdd7588bec2ff84e02f89d9b2beb55e04391a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 11:26:53 -0500 Subject: [PATCH 53/64] Revert "Raise for unicode `array` in `ndarray_from_buffer`" This reverts commit 7a2b3b7e3db740bd84e27287cbd049ce29ba4fe4. 
--- numcodecs/compat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 300d1d27..6dafbe31 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -77,9 +77,6 @@ def buffer_copy(buf, out=None): def ndarray_from_buffer(buf): arr = buf if not isinstance(arr, np.ndarray): - if isinstance(arr, array.array) and arr.typecode == 'u': - raise ValueError("unicode arrays are not supported") - try: arr = memoryview(arr) except TypeError: # pragma: py3 no cover From 9d43a3b12754d4a17547f721404c5d8bab709322 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 11:29:52 -0500 Subject: [PATCH 54/64] Cast around unicode array On Python 3, cast the unicode array to an unsigned char array. This matches what will happen on Python 2 as well. Should bypass NumPy and Windows issues on Python 3 with this type as well because the casting precedes construction of the array. As Python 2 already goes through this path, this is a non-issue there. Afterwards cast the resulting `ndarray` to an `unsigned` type with the same `itemsize` as the original unicode type (as it may be 16-bit or 32-bit depending on the Python build). Handle all other `array` cases the same as before. 
--- numcodecs/compat.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 6dafbe31..a5821f52 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -81,10 +81,16 @@ def ndarray_from_buffer(buf): arr = memoryview(arr) except TypeError: # pragma: py3 no cover arr = np.getbuffer(arr) + else: # pragma: py2 no cover + if isinstance(buf, array.array) and buf.typecode is 'u': + arr = arr.cast('B') arr = np.array(arr, copy=False) - if PY2 and isinstance(buf, array.array): # pragma: py3 no cover - arr = arr.view(buf.typecode) + if isinstance(buf, array.array): + if buf.typecode is 'u': + arr = arr.view('u%i' % buf.itemsize) + elif PY2: # pragma: py3 no cover + arr = arr.view(buf.typecode) return arr From d0b0ef760e4950486e37575c2b720ab890e844b3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 12:01:20 -0500 Subject: [PATCH 55/64] Test handling of unicode `array`s Make sure the unicode `array`s are being cast to unsigned integers of some size. --- numcodecs/tests/test_compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index e595a358..dfd9261d 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -35,6 +35,7 @@ def test_buffer_coercion(): ('u', bytes(20)), ('i', np.arange(100, dtype=np.int64)), ('i', array.array('l', b'qwertyuiqwertyui')), + ('u', array.array('u', u'qwertyuiqwertyui')), ('u', mmap.mmap(-1, 10)) ] for typ, buf in typed_bufs: From a789c4d14d0902b7d9cadd0e68b969fb47d8cd9a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 12:23:14 -0500 Subject: [PATCH 56/64] Add utility `getbuffer` function Provides a Python 2/3 `getbuffer` function that tries to mimic NumPy's `getbuffer` functionality on Python 3 and simply uses NumPy's implementation on Python 2. 
--- numcodecs/compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index a5821f52..45381d55 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -29,6 +29,13 @@ from functools import reduce +def getbuffer(v): + if PY2: + return np.getbuffer(v) + else: + return memoryview(v).cast('B') + + def to_buffer(v): """Obtain a `buffer` or `memoryview` for `v`.""" From e01195cd8b5b6ce29dc573a8f60f451ccf40faeb Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 12:24:44 -0500 Subject: [PATCH 57/64] Use `getbuffer` for checking shared memory Works around some pain points with unicode `array` types on Windows specifically. --- numcodecs/tests/test_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index dfd9261d..9125022d 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import buffer, to_buffer +from numcodecs.compat import getbuffer, to_buffer def test_buffer_coercion_raises(): @@ -42,4 +42,4 @@ def test_buffer_coercion(): b1 = to_buffer(buf) assert isinstance(b1, np.ndarray) assert b1.dtype.kind is typ - assert np.shares_memory(b1, buffer(buf)) + assert np.shares_memory(b1, getbuffer(buf)) From 94fcc379796538966f531c9cf3469947aa5e1f63 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 17:57:39 -0500 Subject: [PATCH 58/64] Preserve `memoryview` of `array`'s `itemsize` Perform both `memoryview` casts for unicode `array`s one after the other. This ensures that the `memoryview` has the expected replacement type before taking the `ndarray` view. The behavior for an end user appears to be essentially identical. However it's arguably nicer to ensure the underlying `memoryview` has the intended type and shape. Updates the `ndarray` casting block at the end to be Python 2 only. 
The unicode `array` case (the only one in need of casting on Python 3) is handled before constructing the `ndarray`, so there is no need for additional casting of `ndarray`s for Python 3. --- numcodecs/compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 45381d55..6cabba7b 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -90,13 +90,13 @@ def ndarray_from_buffer(buf): arr = np.getbuffer(arr) else: # pragma: py2 no cover if isinstance(buf, array.array) and buf.typecode is 'u': - arr = arr.cast('B') + arr = arr.cast('B').cast(np.dtype('u%i' % buf.itemsize).char) arr = np.array(arr, copy=False) - if isinstance(buf, array.array): + if PY2 and isinstance(buf, array.array): # pragma: py3 no cover if buf.typecode is 'u': arr = arr.view('u%i' % buf.itemsize) - elif PY2: # pragma: py3 no cover + else: arr = arr.view(buf.typecode) return arr From 5aa7b897508e59450e942e566cbd944c602fb8ee Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 21:24:23 -0500 Subject: [PATCH 59/64] Re-raise `TypeError` on Python 3 If coercion to a `memoryview` does not work on Python 3, re-raise the original `TypeError` exception.
--- numcodecs/compat.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 6cabba7b..d297db2d 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -86,8 +86,11 @@ def ndarray_from_buffer(buf): if not isinstance(arr, np.ndarray): try: arr = memoryview(arr) - except TypeError: # pragma: py3 no cover - arr = np.getbuffer(arr) + except TypeError: + if PY2: # pragma: py3 no cover + arr = np.getbuffer(arr) + else: + raise else: # pragma: py2 no cover if isinstance(buf, array.array) and buf.typecode is 'u': arr = arr.cast('B').cast(np.dtype('u%i' % buf.itemsize).char) From f0b550830c36b31304b5cd79962739f7a99db6e9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 12 Nov 2018 21:31:35 -0500 Subject: [PATCH 60/64] Test that non-buffer types raise errors If an object is provided that does not implement either buffer protocol, it should raise a `TypeError`. This includes such a test to ensure that `TypeError`s are being raised for non-buffer supporting types. --- numcodecs/tests/test_compat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 9125022d..32137885 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -16,6 +16,8 @@ def test_buffer_coercion_raises(): for e in [a, memoryview(a)]: with pytest.raises(ValueError): to_buffer(e) + with pytest.raises(TypeError): + to_buffer(a.tolist()) def test_buffer_writable(): From 96625e893588083c6b95678cc900408becaf7446 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 22 Nov 2018 15:44:14 -0500 Subject: [PATCH 61/64] Drop PY2K/PY3K `buffer` assignment This is no longer used as we have switched to `np.getbuffer` on Python 2 to ensure that read-write buffers are used where possible like with `memoryview` on Python 3. 
--- numcodecs/compat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index d297db2d..62f2b490 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -14,7 +14,6 @@ if PY2: # pragma: py3 no cover - buffer = buffer text_type = unicode binary_type = str integer_types = (int, long) @@ -22,7 +21,6 @@ else: # pragma: py2 no cover - buffer = memoryview text_type = str binary_type = bytes integer_types = int, From ae0434f542eef4132f272cfcc1d08b6808321fe4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 22 Nov 2018 16:26:22 -0500 Subject: [PATCH 62/64] Inline `getbuffer` This was only being used as a convenience function for one test. As this results in a bit of indirection and can be easily inlined for simplicity. Just inline it and drop the `getbuffer` compatibility function. --- numcodecs/compat.py | 7 ------- numcodecs/tests/test_compat.py | 7 +++++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/numcodecs/compat.py b/numcodecs/compat.py index 62f2b490..722627c8 100644 --- a/numcodecs/compat.py +++ b/numcodecs/compat.py @@ -27,13 +27,6 @@ from functools import reduce -def getbuffer(v): - if PY2: - return np.getbuffer(v) - else: - return memoryview(v).cast('B') - - def to_buffer(v): """Obtain a `buffer` or `memoryview` for `v`.""" diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 32137885..1b1aa2ce 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import getbuffer, to_buffer +from numcodecs.compat import PY2, to_buffer def test_buffer_coercion_raises(): @@ -44,4 +44,7 @@ def test_buffer_coercion(): b1 = to_buffer(buf) assert isinstance(b1, np.ndarray) assert b1.dtype.kind is typ - assert np.shares_memory(b1, getbuffer(buf)) + if PY2: + assert np.shares_memory(b1, np.getbuffer(buf)) + else: + assert np.shares_memory(b1, memoryview(buf)) From 
3ed988b8d6c3c9cad5b21ceddf77ee32dc6e5b63 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 22 Nov 2018 16:40:07 -0500 Subject: [PATCH 63/64] Cast `memoryview` to `uint8` for `shares_memory` As the buffer is coerced by NumPy into an array without copying, it needs to be able to handle the data type. However for Windows the unicode array has UCS2 whereas NumPy only supports UCS4. To fix this issue just cast the `memoryview` to `uint8`. We don't particularly care about the type for this test, only that data is shared. So this should work around the issue and allow the check to occur. --- numcodecs/tests/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 1b1aa2ce..643506df 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -47,4 +47,4 @@ def test_buffer_coercion(): if PY2: assert np.shares_memory(b1, np.getbuffer(buf)) else: - assert np.shares_memory(b1, memoryview(buf)) + assert np.shares_memory(b1, memoryview(buf).cast('B')) From 1f1f9f4b78bacc1a5baac50db6a2d6e4870ce363 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 23 Nov 2018 15:03:11 -0500 Subject: [PATCH 64/64] Add no cover pragmas for shared memory test --- numcodecs/tests/test_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/tests/test_compat.py b/numcodecs/tests/test_compat.py index 643506df..d391f865 100644 --- a/numcodecs/tests/test_compat.py +++ b/numcodecs/tests/test_compat.py @@ -44,7 +44,7 @@ def test_buffer_coercion(): b1 = to_buffer(buf) assert isinstance(b1, np.ndarray) assert b1.dtype.kind is typ - if PY2: + if PY2: # pragma: py3 no cover assert np.shares_memory(b1, np.getbuffer(buf)) - else: + else: # pragma: py2 no cover assert np.shares_memory(b1, memoryview(buf).cast('B'))