Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,15 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)

if extension_type is not None:
result = ExtensionArray.from_storage(extension_type, result)
if isinstance(result, ChunkedArray):
# Handle ChunkedArray case (e.g., when data overflows int32)
chunks = []
for chunk in result.chunks:
ext_chunk = ExtensionArray.from_storage(extension_type, chunk)
chunks.append(ext_chunk)
result = chunked_array(chunks, type=extension_type)
else:
result = ExtensionArray.from_storage(extension_type, result)
return result


Expand Down
77 changes: 77 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -2120,3 +2120,80 @@ def test_json(storage_type, pickle_module):
pa.ArrowInvalid,
match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
pa.json_(storage_type)


class ListExtensionType(pa.ExtensionType):
    """Parameter-free extension type used to exercise int32 list overflow.

    The storage type is a struct with a single ``data`` field holding a
    ``list<uint8>``; the extension is named ``"test_list_ext"``.
    """

    def __init__(self):
        storage_type = pa.struct({"data": pa.list_(pa.uint8())})
        super().__init__(storage_type, "test_list_ext")

    def __arrow_ext_serialize__(self):
        # The type carries no parameters, so the serialized form is empty.
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # Both arguments are ignored: every instance is identical.
        return cls()


@pytest.mark.large_memory
@pytest.mark.numpy
def test_extension_type_list_overflow():
    """
    Test that extension types with list fields handle int32 offset overflow.

    Builds a table whose single extension column holds more list values than
    a 32-bit list offset can address, then checks that the column comes back
    as a multi-chunk ChunkedArray in which every chunk keeps the extension
    type and no row is lost.
    """
    try:
        pa.register_extension_type(ListExtensionType())
    except pa.ArrowKeyError:
        # Already registered
        pass

    ext_type = ListExtensionType()
    schema = pa.schema({"col": ext_type})

    # Create data that exceeds int32 max cumulative values
    # 5 rows × 500M values = 2.5B > int32 max (2,147,483,647)
    # The same numpy buffer is referenced by every row.
    arr = np.zeros(500_000_000, dtype=np.uint8)
    num_rows = 5
    rows = [{"col": {"data": arr}} for _ in range(num_rows)]

    result = pa.Table.from_pylist(rows, schema=schema)

    assert result.num_rows == num_rows
    assert result.num_columns == 1
    assert result.schema[0].type == ext_type

    col = result.column(0)
    assert isinstance(col, pa.ChunkedArray)
    assert col.type == ext_type

    # 2.5B child values cannot fit behind int32 list offsets in one chunk,
    # so the conversion must have split the data. Without this check the
    # per-chunk loop below would not prove the overflow path was exercised.
    assert col.num_chunks > 1
    assert sum(len(chunk) for chunk in col.chunks) == num_rows

    for chunk in col.chunks:
        assert chunk.type == ext_type
Comment on lines +2166 to +2172


@pytest.mark.numpy
def test_extension_type_no_overflow():
    """Test that extension types work normally when there's no overflow.

    Uses three small rows so the column fits in a single chunk, and checks
    both the extension type and the stored values.
    """
    try:
        pa.register_extension_type(ListExtensionType())
    except pa.ArrowKeyError:
        # Already registered
        pass

    ext_type = ListExtensionType()
    schema = pa.schema({"col": ext_type})

    # Small data that won't overflow
    arr = np.array([1, 2, 3], dtype=np.uint8)
    num_rows = 3
    rows = [{"col": {"data": arr}} for _ in range(num_rows)]

    result = pa.Table.from_pylist(rows, schema=schema)

    assert result.num_rows == num_rows
    assert result.num_columns == 1
    assert result.schema[0].type == ext_type

    # The column should be a ChunkedArray with a single chunk
    col = result.column(0)
    assert isinstance(col, pa.ChunkedArray)
    assert col.type == ext_type
    assert col.num_chunks == 1

    # The lone chunk keeps the extension type and round-trips the input.
    chunk = col.chunk(0)
    assert chunk.type == ext_type
    assert chunk.storage.to_pylist() == [{"data": [1, 2, 3]}] * num_rows
Comment on lines +2148 to +2199
Comment on lines +2148 to +2199
Loading