Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions numcodecs/tests/test_vlen_bytes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import sys
import unittest

import numpy as np
Expand Down Expand Up @@ -85,9 +84,6 @@ def test_decode_errors():
codec.decode(enc, out=np.zeros(10, dtype='i4'))


# TODO: fix this test on GitHub actions somehow...
# See https://github.com/zarr-developers/numcodecs/issues/683
@pytest.mark.skipif(sys.platform == "darwin", reason="Test is failing on macOS on GitHub actions.")
def test_encode_none():
a = np.array([b'foo', None, b'bar'], dtype=object)
codec = VLenBytes()
Expand Down
189 changes: 83 additions & 106 deletions numcodecs/vlen.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,10 @@ cimport cython
from libc.stdint cimport uint8_t, uint32_t
from libc.string cimport memcpy

from cpython.buffer cimport PyBuffer_IsContiguous
from cpython.bytearray cimport (
PyByteArray_AS_STRING,
PyByteArray_FromStringAndSize,
)
from cpython.bytes cimport (
PyBytes_AS_STRING,
PyBytes_GET_SIZE,
PyBytes_Check,
PyBytes_FromStringAndSize,
)
from cpython.bytearray cimport PyByteArray_FromStringAndSize
from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.memoryview cimport PyMemoryView_GET_BUFFER
from cpython.unicode cimport (
PyUnicode_AsUTF8String,
PyUnicode_Check,
PyUnicode_FromStringAndSize,
)
from cpython.unicode cimport PyUnicode_FromStringAndSize

from numpy cimport ndarray

Expand Down Expand Up @@ -89,15 +76,15 @@ class VLenUTF8(Codec):
@cython.boundscheck(False)
def encode(self, buf):
cdef:
Py_ssize_t i, l, n_items, data_length, total_length
Py_ssize_t i, L, n_items, data_length
ndarray[object, ndim=1] input_values
object[:] encoded_values
int[:] encoded_lengths
char* encv
bytes b
bytearray out
char* data
object u
object o
unicode u

# normalise input
input_values = np.asarray(buf, dtype=object).reshape(-1, order='A')
Expand All @@ -110,36 +97,33 @@ class VLenUTF8(Codec):
encoded_lengths = np.empty(n_items, dtype=np.intc)

# first iteration to convert to bytes
data_length = 0
data_length = HEADER_LENGTH
for i in range(n_items):
u = input_values[i]
if u is None or u == 0: # treat these as missing value, normalize
u = ''
elif not PyUnicode_Check(u):
raise TypeError('expected unicode string, found %r' % u)
b = PyUnicode_AsUTF8String(u)
l = PyBytes_GET_SIZE(b)
o = input_values[i]
# replace missing value and coerce to typed data
u = "" if o is None or o == 0 else o
b = u.encode("utf-8")
L = len(b)
encoded_values[i] = b
data_length += l + HEADER_LENGTH
encoded_lengths[i] = l
data_length += L + HEADER_LENGTH
encoded_lengths[i] = L

# setup output
total_length = HEADER_LENGTH + data_length
out = PyByteArray_FromStringAndSize(NULL, total_length)
out = PyByteArray_FromStringAndSize(NULL, data_length)

# write header
data = PyByteArray_AS_STRING(out)
data = out
store_le32(<uint8_t*>data, n_items)

# second iteration, store data
data += HEADER_LENGTH
for i in range(n_items):
l = encoded_lengths[i]
store_le32(<uint8_t*>data, l)
L = encoded_lengths[i]
store_le32(<uint8_t*>data, L)
data += HEADER_LENGTH
encv = PyBytes_AS_STRING(encoded_values[i])
memcpy(data, encv, l)
data += l
b = encoded_values[i]
memcpy(data, <const char*>b, L)
data += L

return out

Expand All @@ -151,16 +135,14 @@ class VLenUTF8(Codec):
const Py_buffer* buf_pb
const char* data
const char* data_end
Py_ssize_t i, l, n_items, data_length
Py_ssize_t i, L, n_items, data_length

# obtain memoryview
buf = ensure_contiguous_ndarray(buf)
buf_mv = memoryview(buf)
buf_mv = ensure_continguous_memoryview(buf)
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)

# sanity checks
if not PyBuffer_IsContiguous(buf_pb, b'A'):
raise BufferError("`buf` must contain contiguous memory")
if buf_pb.len < HEADER_LENGTH:
raise ValueError('corrupt buffer, missing or truncated header')

Expand All @@ -184,12 +166,12 @@ class VLenUTF8(Codec):
for i in range(n_items):
if data + HEADER_LENGTH > data_end:
raise ValueError('corrupt buffer, data seem truncated')
l = load_le32(<uint8_t*>data)
L = load_le32(<uint8_t*>data)
data += HEADER_LENGTH
if data + l > data_end:
if data + L > data_end:
raise ValueError('corrupt buffer, data seem truncated')
out[i] = PyUnicode_FromStringAndSize(data, l)
data += l
out[i] = PyUnicode_FromStringAndSize(data, L)
data += L

return out

Expand Down Expand Up @@ -225,11 +207,12 @@ class VLenBytes(Codec):
@cython.boundscheck(False)
def encode(self, buf):
cdef:
Py_ssize_t i, l, n_items, data_length, total_length
Py_ssize_t i, L, n_items, data_length
object[:] values
object[:] normed_values
int[:] lengths
char* encv
object b
object o
bytes b
bytearray out
char* data

Expand All @@ -240,37 +223,36 @@ class VLenBytes(Codec):
n_items = values.shape[0]

# setup intermediates
normed_values = np.empty(n_items, dtype=object)
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To fix the error mentioned above, we need to create a place to store the normalized values

So we allocate an empty array to start. object type as we are storing Python objects in it

lengths = np.empty(n_items, dtype=np.intc)

# first iteration to find lengths
data_length = 0
data_length = HEADER_LENGTH
for i in range(n_items):
b = values[i]
if b is None or b == 0: # treat these as missing value, normalize
b = b''
elif not PyBytes_Check(b):
raise TypeError('expected byte string, found %r' % b)
l = PyBytes_GET_SIZE(b)
data_length += l + HEADER_LENGTH
lengths[i] = l
o = values[i]
# replace missing value and coerce to typed data
b = b"" if o is None or o == 0 else o
normed_values[i] = b
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now when we loop through and normalize an input, we can store each normalized value for processing later

L = len(b)
data_length += HEADER_LENGTH + L
lengths[i] = L

# setup output
total_length = HEADER_LENGTH + data_length
out = PyByteArray_FromStringAndSize(NULL, total_length)
out = PyByteArray_FromStringAndSize(NULL, data_length)

# write header
data = PyByteArray_AS_STRING(out)
data = out
store_le32(<uint8_t*>data, n_items)

# second iteration, store data
data += HEADER_LENGTH
for i in range(n_items):
l = lengths[i]
store_le32(<uint8_t*>data, l)
L = lengths[i]
store_le32(<uint8_t*>data, L)
data += HEADER_LENGTH
encv = PyBytes_AS_STRING(values[i])
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this line was the actual one that failed ( a bit further in than the aforementioned error would suggest: #683 (comment) )

The issue is that value[i] does not contain the normalized value. So if None shows up here, it errors. Also the error is a nasty segfault (due to attempting to access a field that does not exist) or an assertion error if Python's debug mode is used

memcpy(data, encv, l)
data += l
b = normed_values[i]
memcpy(data, <const char*>b, L)
data += L

return out

Expand All @@ -282,16 +264,14 @@ class VLenBytes(Codec):
const Py_buffer* buf_pb
const char* data
const char* data_end
Py_ssize_t i, l, n_items, data_length
Py_ssize_t i, L, n_items, data_length

# obtain memoryview
buf = ensure_contiguous_ndarray(buf)
buf_mv = memoryview(buf)
buf_mv = ensure_continguous_memoryview(buf)
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)

# sanity checks
if not PyBuffer_IsContiguous(buf_pb, b'A'):
raise BufferError("`buf` must contain contiguous memory")
if buf_pb.len < HEADER_LENGTH:
raise ValueError('corrupt buffer, missing or truncated header')

Expand All @@ -315,12 +295,12 @@ class VLenBytes(Codec):
for i in range(n_items):
if data + HEADER_LENGTH > data_end:
raise ValueError('corrupt buffer, data seem truncated')
l = load_le32(<uint8_t*>data)
L = load_le32(<uint8_t*>data)
data += HEADER_LENGTH
if data + l > data_end:
if data + L > data_end:
raise ValueError('corrupt buffer, data seem truncated')
out[i] = PyBytes_FromStringAndSize(data, l)
data += l
out[i] = PyBytes_FromStringAndSize(data, L)
data += L

return out

Expand Down Expand Up @@ -369,17 +349,16 @@ class VLenArray(Codec):
@cython.boundscheck(False)
def encode(self, buf):
cdef:
Py_ssize_t i, l, n_items, data_length, total_length
Py_ssize_t i, L, n_items, data_length
object[:] values
object[:] normed_values
int[:] lengths
const char* encv
bytes b
bytearray out
char* data
memoryview value_mv
const Py_buffer* value_pb
object v
object o

# normalise input
values = np.asarray(buf, dtype=object).reshape(-1, order='A')
Expand All @@ -392,41 +371,41 @@ class VLenArray(Codec):
lengths = np.empty(n_items, dtype=np.intc)

# first iteration to convert to bytes
data_length = 0
data_length = HEADER_LENGTH
for i in range(n_items):
v = values[i]
if v is None:
v = np.array([], dtype=self.dtype)
else:
v = np.ascontiguousarray(v, self.dtype)
if v.ndim != 1:
raise ValueError('only 1-dimensional arrays are supported')
l = v.nbytes
normed_values[i] = v
data_length += l + HEADER_LENGTH
lengths[i] = l
o = values[i]
# replace missing value and coerce to typed data
value_mv = ensure_continguous_memoryview(
np.array([], dtype=self.dtype) if o is None
else np.ascontiguousarray(o, self.dtype)
)
value_pb = PyMemoryView_GET_BUFFER(value_mv)
if value_pb.ndim != 1:
raise ValueError("only 1-dimensional arrays are supported")
L = value_pb.len
normed_values[i] = value_mv
data_length += HEADER_LENGTH + L
lengths[i] = L

# setup output
total_length = HEADER_LENGTH + data_length
out = PyByteArray_FromStringAndSize(NULL, total_length)
out = PyByteArray_FromStringAndSize(NULL, data_length)

# write header
data = PyByteArray_AS_STRING(out)
data = out
store_le32(<uint8_t*>data, n_items)

# second iteration, store data
data += HEADER_LENGTH
for i in range(n_items):
l = lengths[i]
store_le32(<uint8_t*>data, l)
L = lengths[i]
store_le32(<uint8_t*>data, L)
data += HEADER_LENGTH

value_mv = ensure_continguous_memoryview(normed_values[i])
value_mv = normed_values[i]
value_pb = PyMemoryView_GET_BUFFER(value_mv)
encv = <const char*>value_pb.buf

memcpy(data, encv, l)
data += l
memcpy(data, value_pb.buf, L)
data += L

return out

Expand All @@ -441,16 +420,14 @@ class VLenArray(Codec):
object v
memoryview v_mv
Py_buffer* v_pb
Py_ssize_t i, l, n_items, data_length
Py_ssize_t i, L, n_items, data_length

# obtain memoryview
buf = ensure_contiguous_ndarray(buf)
buf_mv = memoryview(buf)
buf_mv = ensure_continguous_memoryview(buf)
buf_pb = PyMemoryView_GET_BUFFER(buf_mv)

# sanity checks
if not PyBuffer_IsContiguous(buf_pb, b'A'):
raise BufferError("`buf` must contain contiguous memory")
if buf_pb.len < HEADER_LENGTH:
raise ValueError('corrupt buffer, missing or truncated header')

Expand All @@ -474,18 +451,18 @@ class VLenArray(Codec):
for i in range(n_items):
if data + HEADER_LENGTH > data_end:
raise ValueError('corrupt buffer, data seem truncated')
l = load_le32(<uint8_t*>data)
L = load_le32(<uint8_t*>data)
data += HEADER_LENGTH
if data + l > data_end:
if data + L > data_end:
raise ValueError('corrupt buffer, data seem truncated')

# Create & fill array value
v = np.empty((l,), dtype="uint8").view(self.dtype)
v = np.empty((L,), dtype="uint8").view(self.dtype)
v_mv = memoryview(v)
v_pb = PyMemoryView_GET_BUFFER(v_mv)
memcpy(v_pb.buf, data, l)
memcpy(v_pb.buf, data, L)

out[i] = v
data += l
data += L

return out
Loading