-
Notifications
You must be signed in to change notification settings - Fork 111
Re-enable VLenBytes round-trip None test
#736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
07571e5
b97fc62
e7b6b4b
9bcd15c
8e24cce
d6085a8
c05d33b
5b550ae
739dd88
a37b8f9
644f147
8cc95fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,23 +10,10 @@ cimport cython | |
| from libc.stdint cimport uint8_t, uint32_t | ||
| from libc.string cimport memcpy | ||
|
|
||
| from cpython.buffer cimport PyBuffer_IsContiguous | ||
| from cpython.bytearray cimport ( | ||
| PyByteArray_AS_STRING, | ||
| PyByteArray_FromStringAndSize, | ||
| ) | ||
| from cpython.bytes cimport ( | ||
| PyBytes_AS_STRING, | ||
| PyBytes_GET_SIZE, | ||
| PyBytes_Check, | ||
| PyBytes_FromStringAndSize, | ||
| ) | ||
| from cpython.bytearray cimport PyByteArray_FromStringAndSize | ||
| from cpython.bytes cimport PyBytes_FromStringAndSize | ||
| from cpython.memoryview cimport PyMemoryView_GET_BUFFER | ||
| from cpython.unicode cimport ( | ||
| PyUnicode_AsUTF8String, | ||
| PyUnicode_Check, | ||
| PyUnicode_FromStringAndSize, | ||
| ) | ||
| from cpython.unicode cimport PyUnicode_FromStringAndSize | ||
|
|
||
| from numpy cimport ndarray | ||
|
|
||
|
|
@@ -89,15 +76,15 @@ class VLenUTF8(Codec): | |
| @cython.boundscheck(False) | ||
| def encode(self, buf): | ||
| cdef: | ||
| Py_ssize_t i, l, n_items, data_length, total_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
| ndarray[object, ndim=1] input_values | ||
| object[:] encoded_values | ||
| int[:] encoded_lengths | ||
| char* encv | ||
| bytes b | ||
| bytearray out | ||
| char* data | ||
| object u | ||
| object o | ||
| unicode u | ||
|
|
||
| # normalise input | ||
| input_values = np.asarray(buf, dtype=object).reshape(-1, order='A') | ||
|
|
@@ -110,36 +97,33 @@ class VLenUTF8(Codec): | |
| encoded_lengths = np.empty(n_items, dtype=np.intc) | ||
|
|
||
| # first iteration to convert to bytes | ||
| data_length = 0 | ||
| data_length = HEADER_LENGTH | ||
| for i in range(n_items): | ||
| u = input_values[i] | ||
| if u is None or u == 0: # treat these as missing value, normalize | ||
| u = '' | ||
| elif not PyUnicode_Check(u): | ||
| raise TypeError('expected unicode string, found %r' % u) | ||
| b = PyUnicode_AsUTF8String(u) | ||
| l = PyBytes_GET_SIZE(b) | ||
| o = input_values[i] | ||
| # replace missing value and coerce to typed data | ||
| u = "" if o is None or o == 0 else o | ||
| b = u.encode("utf-8") | ||
| L = len(b) | ||
| encoded_values[i] = b | ||
| data_length += l + HEADER_LENGTH | ||
| encoded_lengths[i] = l | ||
| data_length += L + HEADER_LENGTH | ||
| encoded_lengths[i] = L | ||
|
|
||
| # setup output | ||
| total_length = HEADER_LENGTH + data_length | ||
| out = PyByteArray_FromStringAndSize(NULL, total_length) | ||
| out = PyByteArray_FromStringAndSize(NULL, data_length) | ||
|
|
||
| # write header | ||
| data = PyByteArray_AS_STRING(out) | ||
| data = out | ||
| store_le32(<uint8_t*>data, n_items) | ||
|
|
||
| # second iteration, store data | ||
| data += HEADER_LENGTH | ||
| for i in range(n_items): | ||
| l = encoded_lengths[i] | ||
| store_le32(<uint8_t*>data, l) | ||
| L = encoded_lengths[i] | ||
| store_le32(<uint8_t*>data, L) | ||
| data += HEADER_LENGTH | ||
| encv = PyBytes_AS_STRING(encoded_values[i]) | ||
| memcpy(data, encv, l) | ||
| data += l | ||
| b = encoded_values[i] | ||
| memcpy(data, <const char*>b, L) | ||
| data += L | ||
|
|
||
| return out | ||
|
|
||
|
|
@@ -151,16 +135,14 @@ class VLenUTF8(Codec): | |
| const Py_buffer* buf_pb | ||
| const char* data | ||
| const char* data_end | ||
| Py_ssize_t i, l, n_items, data_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
|
|
||
| # obtain memoryview | ||
| buf = ensure_contiguous_ndarray(buf) | ||
| buf_mv = memoryview(buf) | ||
| buf_mv = ensure_continguous_memoryview(buf) | ||
| buf_pb = PyMemoryView_GET_BUFFER(buf_mv) | ||
|
|
||
| # sanity checks | ||
| if not PyBuffer_IsContiguous(buf_pb, b'A'): | ||
| raise BufferError("`buf` must contain contiguous memory") | ||
| if buf_pb.len < HEADER_LENGTH: | ||
| raise ValueError('corrupt buffer, missing or truncated header') | ||
|
|
||
|
|
@@ -184,12 +166,12 @@ class VLenUTF8(Codec): | |
| for i in range(n_items): | ||
| if data + HEADER_LENGTH > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
| l = load_le32(<uint8_t*>data) | ||
| L = load_le32(<uint8_t*>data) | ||
| data += HEADER_LENGTH | ||
| if data + l > data_end: | ||
| if data + L > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
| out[i] = PyUnicode_FromStringAndSize(data, l) | ||
| data += l | ||
| out[i] = PyUnicode_FromStringAndSize(data, L) | ||
| data += L | ||
|
|
||
| return out | ||
|
|
||
|
|
@@ -225,11 +207,12 @@ class VLenBytes(Codec): | |
| @cython.boundscheck(False) | ||
| def encode(self, buf): | ||
| cdef: | ||
| Py_ssize_t i, l, n_items, data_length, total_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
| object[:] values | ||
| object[:] normed_values | ||
| int[:] lengths | ||
| char* encv | ||
| object b | ||
| object o | ||
| bytes b | ||
| bytearray out | ||
| char* data | ||
|
|
||
|
|
@@ -240,37 +223,36 @@ class VLenBytes(Codec): | |
| n_items = values.shape[0] | ||
|
|
||
| # setup intermediates | ||
| normed_values = np.empty(n_items, dtype=object) | ||
| lengths = np.empty(n_items, dtype=np.intc) | ||
|
|
||
| # first iteration to find lengths | ||
| data_length = 0 | ||
| data_length = HEADER_LENGTH | ||
| for i in range(n_items): | ||
| b = values[i] | ||
| if b is None or b == 0: # treat these as missing value, normalize | ||
| b = b'' | ||
| elif not PyBytes_Check(b): | ||
| raise TypeError('expected byte string, found %r' % b) | ||
| l = PyBytes_GET_SIZE(b) | ||
| data_length += l + HEADER_LENGTH | ||
| lengths[i] = l | ||
| o = values[i] | ||
| # replace missing value and coerce to typed data | ||
| b = b"" if o is None or o == 0 else o | ||
| normed_values[i] = b | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now when we loop through and normalize an input, we can store each normalized value for processing later |
||
| L = len(b) | ||
| data_length += HEADER_LENGTH + L | ||
| lengths[i] = L | ||
|
|
||
| # setup output | ||
| total_length = HEADER_LENGTH + data_length | ||
| out = PyByteArray_FromStringAndSize(NULL, total_length) | ||
| out = PyByteArray_FromStringAndSize(NULL, data_length) | ||
|
|
||
| # write header | ||
| data = PyByteArray_AS_STRING(out) | ||
| data = out | ||
| store_le32(<uint8_t*>data, n_items) | ||
|
|
||
| # second iteration, store data | ||
| data += HEADER_LENGTH | ||
| for i in range(n_items): | ||
| l = lengths[i] | ||
| store_le32(<uint8_t*>data, l) | ||
| L = lengths[i] | ||
| store_le32(<uint8_t*>data, L) | ||
| data += HEADER_LENGTH | ||
| encv = PyBytes_AS_STRING(values[i]) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this line was the actual one that failed ( a bit further in than the aforementioned error would suggest: #683 (comment) ) The issue is that |
||
| memcpy(data, encv, l) | ||
| data += l | ||
| b = normed_values[i] | ||
| memcpy(data, <const char*>b, L) | ||
| data += L | ||
|
|
||
| return out | ||
|
|
||
|
|
@@ -282,16 +264,14 @@ class VLenBytes(Codec): | |
| const Py_buffer* buf_pb | ||
| const char* data | ||
| const char* data_end | ||
| Py_ssize_t i, l, n_items, data_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
|
|
||
| # obtain memoryview | ||
| buf = ensure_contiguous_ndarray(buf) | ||
| buf_mv = memoryview(buf) | ||
| buf_mv = ensure_continguous_memoryview(buf) | ||
| buf_pb = PyMemoryView_GET_BUFFER(buf_mv) | ||
|
|
||
| # sanity checks | ||
| if not PyBuffer_IsContiguous(buf_pb, b'A'): | ||
| raise BufferError("`buf` must contain contiguous memory") | ||
| if buf_pb.len < HEADER_LENGTH: | ||
| raise ValueError('corrupt buffer, missing or truncated header') | ||
|
|
||
|
|
@@ -315,12 +295,12 @@ class VLenBytes(Codec): | |
| for i in range(n_items): | ||
| if data + HEADER_LENGTH > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
| l = load_le32(<uint8_t*>data) | ||
| L = load_le32(<uint8_t*>data) | ||
| data += HEADER_LENGTH | ||
| if data + l > data_end: | ||
| if data + L > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
| out[i] = PyBytes_FromStringAndSize(data, l) | ||
| data += l | ||
| out[i] = PyBytes_FromStringAndSize(data, L) | ||
| data += L | ||
|
|
||
| return out | ||
|
|
||
|
|
@@ -369,17 +349,16 @@ class VLenArray(Codec): | |
| @cython.boundscheck(False) | ||
| def encode(self, buf): | ||
| cdef: | ||
| Py_ssize_t i, l, n_items, data_length, total_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
| object[:] values | ||
| object[:] normed_values | ||
| int[:] lengths | ||
| const char* encv | ||
| bytes b | ||
| bytearray out | ||
| char* data | ||
| memoryview value_mv | ||
| const Py_buffer* value_pb | ||
| object v | ||
| object o | ||
|
|
||
| # normalise input | ||
| values = np.asarray(buf, dtype=object).reshape(-1, order='A') | ||
|
|
@@ -392,41 +371,41 @@ class VLenArray(Codec): | |
| lengths = np.empty(n_items, dtype=np.intc) | ||
|
|
||
| # first iteration to convert to bytes | ||
| data_length = 0 | ||
| data_length = HEADER_LENGTH | ||
| for i in range(n_items): | ||
| v = values[i] | ||
| if v is None: | ||
| v = np.array([], dtype=self.dtype) | ||
| else: | ||
| v = np.ascontiguousarray(v, self.dtype) | ||
| if v.ndim != 1: | ||
| raise ValueError('only 1-dimensional arrays are supported') | ||
| l = v.nbytes | ||
| normed_values[i] = v | ||
| data_length += l + HEADER_LENGTH | ||
| lengths[i] = l | ||
| o = values[i] | ||
| # replace missing value and coerce to typed data | ||
| value_mv = ensure_continguous_memoryview( | ||
| np.array([], dtype=self.dtype) if o is None | ||
| else np.ascontiguousarray(o, self.dtype) | ||
| ) | ||
| value_pb = PyMemoryView_GET_BUFFER(value_mv) | ||
| if value_pb.ndim != 1: | ||
| raise ValueError("only 1-dimensional arrays are supported") | ||
| L = value_pb.len | ||
| normed_values[i] = value_mv | ||
| data_length += HEADER_LENGTH + L | ||
| lengths[i] = L | ||
|
|
||
| # setup output | ||
| total_length = HEADER_LENGTH + data_length | ||
| out = PyByteArray_FromStringAndSize(NULL, total_length) | ||
| out = PyByteArray_FromStringAndSize(NULL, data_length) | ||
|
|
||
| # write header | ||
| data = PyByteArray_AS_STRING(out) | ||
| data = out | ||
| store_le32(<uint8_t*>data, n_items) | ||
|
|
||
| # second iteration, store data | ||
| data += HEADER_LENGTH | ||
| for i in range(n_items): | ||
| l = lengths[i] | ||
| store_le32(<uint8_t*>data, l) | ||
| L = lengths[i] | ||
| store_le32(<uint8_t*>data, L) | ||
| data += HEADER_LENGTH | ||
|
|
||
| value_mv = ensure_continguous_memoryview(normed_values[i]) | ||
| value_mv = normed_values[i] | ||
| value_pb = PyMemoryView_GET_BUFFER(value_mv) | ||
| encv = <const char*>value_pb.buf | ||
|
|
||
| memcpy(data, encv, l) | ||
| data += l | ||
| memcpy(data, value_pb.buf, L) | ||
| data += L | ||
|
|
||
| return out | ||
|
|
||
|
|
@@ -441,16 +420,14 @@ class VLenArray(Codec): | |
| object v | ||
| memoryview v_mv | ||
| Py_buffer* v_pb | ||
| Py_ssize_t i, l, n_items, data_length | ||
| Py_ssize_t i, L, n_items, data_length | ||
|
|
||
| # obtain memoryview | ||
| buf = ensure_contiguous_ndarray(buf) | ||
| buf_mv = memoryview(buf) | ||
| buf_mv = ensure_continguous_memoryview(buf) | ||
| buf_pb = PyMemoryView_GET_BUFFER(buf_mv) | ||
|
|
||
| # sanity checks | ||
| if not PyBuffer_IsContiguous(buf_pb, b'A'): | ||
| raise BufferError("`buf` must contain contiguous memory") | ||
| if buf_pb.len < HEADER_LENGTH: | ||
| raise ValueError('corrupt buffer, missing or truncated header') | ||
|
|
||
|
|
@@ -474,18 +451,18 @@ class VLenArray(Codec): | |
| for i in range(n_items): | ||
| if data + HEADER_LENGTH > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
| l = load_le32(<uint8_t*>data) | ||
| L = load_le32(<uint8_t*>data) | ||
| data += HEADER_LENGTH | ||
| if data + l > data_end: | ||
| if data + L > data_end: | ||
| raise ValueError('corrupt buffer, data seem truncated') | ||
|
|
||
| # Create & fill array value | ||
| v = np.empty((l,), dtype="uint8").view(self.dtype) | ||
| v = np.empty((L,), dtype="uint8").view(self.dtype) | ||
| v_mv = memoryview(v) | ||
| v_pb = PyMemoryView_GET_BUFFER(v_mv) | ||
| memcpy(v_pb.buf, data, l) | ||
| memcpy(v_pb.buf, data, L) | ||
|
|
||
| out[i] = v | ||
| data += l | ||
| data += L | ||
|
|
||
| return out | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To fix the error mentioned above, we need to create a place to store the normalized values
So we allocate an empty array to start.
objecttype as we are storing Python objects in it