From 8fc69210f9daa0775c853b449e53d051a9d712b4 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Tue, 6 Jan 2026 19:08:56 +0100 Subject: [PATCH 1/9] Initial commit --- .../src/arrow/python/python_to_arrow.cc | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index c70510a48000..a0b894d69c00 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -584,6 +584,14 @@ class PyConverter : public Converter { } }; +// Helper function to unwrap extension scalar to its storage scalar +inline const Scalar& GetStorageScalar(const Scalar& scalar) { + if (scalar.type->id() == Type::EXTENSION) { + return *checked_cast(scalar).value; + } + return scalar; +} + template class PyPrimitiveConverter; @@ -663,7 +671,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -684,7 +693,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -710,7 +720,8 @@ class PyPrimitiveConverter:: } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -747,7 +758,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -791,7 +803,7 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(*scalar, 1); + return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); @@ -810,7 +822,7 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(*scalar, 1); + return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); @@ -983,7 +995,7 @@ class PyStructConverter : public StructConverter } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->struct_builder_->AppendScalar(*scalar); + return this->struct_builder_->AppendScalar(GetStorageScalar(*scalar)); } switch (input_kind_) { case InputKind::DICT: From 3f255f2de14c07beb1d65b85a7aecaf7e48bd860 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Wed, 7 Jan 2026 16:24:19 +0100 Subject: [PATCH 2/9] Adding tests # Conflicts: # python/pyarrow/tests/test_extension_type.py --- python/pyarrow/tests/test_extension_type.py | 63 +++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 66fcfc05568e..e233d1d30c7e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1486,6 +1486,69 @@ def bytes(self): pa.scalar(bad) +def test_array_from_extension_scalars(): + # Test unwrap to various storage types and different converters + import datetime + + builtin_cases = [ + # fixed_size_binary[16] storage + (pa.uuid(), [b"0123456789abcdef"], [ + UUID('30313233-3435-3637-3839-616263646566')]), + # int8 storage + (pa.bool8(), [0, 1], [0, 1]), + # string storage + (pa.json_(pa.string()), ['{"a":1}', '{"b":2}'], ['{"a":1}', '{"b":2}']), + # binary storage + (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"], [b"x", b"y"]), + ] + for ext_type, values, expected in builtin_cases: + scalars = [pa.scalar(v, type=ext_type) for v in values] + result = pa.array(scalars, type=ext_type) + assert result.type == ext_type + # TODO: make `expected` pyarrow array so `to_pylist` isn't used, check GH-48241 + assert result.to_pylist() == expected + + # Custom extension types requiring registration + custom_cases = [ + # int8 storage + (TinyIntType(), [1, 2], [1, 2]), + # int64 storage + (IntegerType(), [100, 200], [100, 200]), + # string storage + (LabelType(), ["a", "b"], ["a", "b"]), + # struct storage + (MyStructType(), [{"left": 1, "right": 2}], [{"left": 1, "right": 2}]), + # timestamp storage + (AnnotatedType(pa.timestamp("us"), "ts"), + [datetime.datetime(2023, 1, 1)], [datetime.datetime(2023, 1, 1)]), + # duration storage + (AnnotatedType(pa.duration("s"), "dur"), + [datetime.timedelta(seconds=100)], [datetime.timedelta(seconds=100)]), + # date storage + (AnnotatedType(pa.date32(), "date"), + [datetime.date(2023, 1, 1)], [datetime.date(2023, 1, 1)]), + # float64 storage + (AnnotatedType(pa.float64(), "f"), [1.5, 2.5], [1.5, 2.5]), + # boolean storage + (AnnotatedType(pa.bool_(), "b"), [True, False], [True, False]), + # binary storage + (AnnotatedType(pa.binary(), "bin"), [b"x", b"y"], [b"x", b"y"]), + ] + for ext_type, values, expected in custom_cases: + with registered_extension_type(ext_type): + scalars = [pa.scalar(v, type=ext_type) for v in values] + result = pa.array(scalars, type=ext_type) + assert result.type == ext_type + # TODO: make `expected` pyarrow array so `to_pylist` isn't used + assert result.to_pylist() == expected + + uuid_type = pa.uuid() + scalars = [pa.scalar(b"0123456789abcdef", type=uuid_type), + pa.scalar(None, type=uuid_type)] + result = pa.array(scalars, type=uuid_type) + assert result[0].is_valid and not result[1].is_valid + + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" From dbf1194abdd52c623652a31c68eac84c5f8ca9cc Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Wed, 4 Mar 2026 20:21:48 +0100 Subject: [PATCH 3/9] Add tests and replace to_pylist with equals() post #48727 --- .../src/arrow/python/python_to_arrow.cc | 2 +- python/pyarrow/tests/test_extension_type.py | 78 ++++++++++--------- 2 files changed, 41 insertions(+), 39 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index a0b894d69c00..e7ce54abcd8f 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -585,7 +585,7 @@ class PyConverter : public Converter { }; // Helper function to unwrap extension scalar to its storage scalar -inline const Scalar& GetStorageScalar(const Scalar& scalar) { +const Scalar& GetStorageScalar(const Scalar& scalar) { if (scalar.type->id() == Type::EXTENSION) { return *checked_cast(scalar).value; } diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index e233d1d30c7e..5daa2c14a097 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1487,67 +1487,69 @@ def bytes(self): def test_array_from_extension_scalars(): - # Test unwrap to various storage types and different converters import datetime + from decimal import Decimal builtin_cases = [ - # fixed_size_binary[16] storage - (pa.uuid(), [b"0123456789abcdef"], [ - UUID('30313233-3435-3637-3839-616263646566')]), - # int8 storage - (pa.bool8(), [0, 1], [0, 1]), - # string storage - (pa.json_(pa.string()), ['{"a":1}', '{"b":2}'], ['{"a":1}', '{"b":2}']), - # binary storage - (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"], [b"x", b"y"]), + (pa.uuid(), [b"0123456789abcdef"]), + (pa.bool8(), [0, 1]), + (pa.json_(pa.string()), ['{"a":1}', '{"b":2}']), + (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"]), ] - for ext_type, values, expected in builtin_cases: + for ext_type, values in builtin_cases: scalars = [pa.scalar(v, type=ext_type) for v in values] result = pa.array(scalars, type=ext_type) - assert result.type == ext_type - # TODO: make `expected` pyarrow array so `to_pylist` isn't used, check GH-48241 - assert result.to_pylist() == expected + expected = pa.array(values, type=ext_type) + assert result.equals(expected) # Custom extension types requiring registration custom_cases = [ - # int8 storage - (TinyIntType(), [1, 2], [1, 2]), - # int64 storage - (IntegerType(), [100, 200], [100, 200]), - # string storage - (LabelType(), ["a", "b"], ["a", "b"]), - # struct storage - (MyStructType(), [{"left": 1, "right": 2}], [{"left": 1, "right": 2}]), - # timestamp storage + (TinyIntType(), [1, 2]), + (IntegerType(), [100, 200]), + (LabelType(), ["a", "b"]), + (MyStructType(), [{"left": 1, "right": 2}]), (AnnotatedType(pa.timestamp("us"), "ts"), - [datetime.datetime(2023, 1, 1)], [datetime.datetime(2023, 1, 1)]), - # duration storage + [datetime.datetime(2023, 1, 1)]), (AnnotatedType(pa.duration("s"), "dur"), - [datetime.timedelta(seconds=100)], [datetime.timedelta(seconds=100)]), - # date storage + [datetime.timedelta(seconds=100)]), (AnnotatedType(pa.date32(), "date"), - [datetime.date(2023, 1, 1)], [datetime.date(2023, 1, 1)]), - # float64 storage - (AnnotatedType(pa.float64(), "f"), [1.5, 2.5], [1.5, 2.5]), - # boolean storage - (AnnotatedType(pa.bool_(), "b"), [True, False], [True, False]), - # binary storage - (AnnotatedType(pa.binary(), "bin"), [b"x", b"y"], [b"x", b"y"]), + [datetime.date(2023, 1, 1)]), + (AnnotatedType(pa.float64(), "f"), [1.5, 2.5]), + (AnnotatedType(pa.bool_(), "b"), [True, False]), + (AnnotatedType(pa.binary(), "bin"), [b"x", b"y"]), + (AnnotatedType(pa.decimal128(10, 2), "dec"), + [Decimal("1.50"), Decimal("2.75")]), + (AnnotatedType(pa.large_string(), "lstr"), ["hello", "world"]), + (AnnotatedType(pa.large_binary(), "lbin"), [b"ab", b"cd"]), ] - for ext_type, values, expected in custom_cases: + for ext_type, values in custom_cases: with registered_extension_type(ext_type): scalars = [pa.scalar(v, type=ext_type) for v in values] result = pa.array(scalars, type=ext_type) - assert result.type == ext_type - # TODO: make `expected` pyarrow array so `to_pylist` isn't used - assert result.to_pylist() == expected + expected = pa.array(values, type=ext_type) + assert result.equals(expected) + # Null handling uuid_type = pa.uuid() scalars = [pa.scalar(b"0123456789abcdef", type=uuid_type), pa.scalar(None, type=uuid_type)] result = pa.array(scalars, type=uuid_type) assert result[0].is_valid and not result[1].is_valid + # Type inference without explicit type + u = uuid4() + scalars = [pa.scalar(u, type=pa.uuid()), None] + result = pa.array(scalars) + assert result.type == pa.uuid() + assert result[0].as_py() == u + assert not result[1].is_valid + + # Mixed extension scalars and raw Python objects + u1, u2 = uuid4(), uuid4() + result = pa.array([pa.scalar(u1, type=pa.uuid()), u2], type=pa.uuid()) + expected = pa.array([u1, u2], type=pa.uuid()) + assert result.equals(expected) + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) From c00e6be13f12d9f0012de2dbaf755bee09210eb0 Mon Sep 17 00:00:00 2001 From: tadeja Date: Wed, 11 Mar 2026 18:20:30 +0100 Subject: [PATCH 4/9] Apply suggestions from code review Co-authored-by: Rok Mihevc --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 13 +++++++------ python/pyarrow/tests/test_extension_type.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index e7ce54abcd8f..d697cc8df883 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -585,13 +585,13 @@ class PyConverter : public Converter { }; // Helper function to unwrap extension scalar to its storage scalar -const Scalar& GetStorageScalar(const Scalar& scalar) { - if (scalar.type->id() == Type::EXTENSION) { - return *checked_cast(scalar).value; +Result GetStorageScalar(const Scalar& scalar) { + if (scalar.type->id() != Type::EXTENSION) { + return &scalar; } - return scalar; + const auto& extension_scalar = checked_cast(scalar); + return extension_scalar.value.get(); } - template class PyPrimitiveConverter; @@ -671,8 +671,9 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); ARROW_RETURN_NOT_OK( - this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); + this->primitive_builder_->AppendScalar(*storage_scalar)); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 5daa2c14a097..e1e66d08d083 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1485,7 +1485,10 @@ def bytes(self): with pytest.raises(RuntimeError, match="broken"): pa.scalar(bad) - +def test_array_from_mismatched_extension_scalar_type(): + scalar = pa.scalar(b"1" * 16, type=ExampleUuidType()) + with pytest.raises(TypeError): + pa.array([scalar], type=ExampleUuidType2()) def test_array_from_extension_scalars(): import datetime from decimal import Decimal @@ -1535,7 +1538,12 @@ def test_array_from_extension_scalars(): pa.scalar(None, type=uuid_type)] result = pa.array(scalars, type=uuid_type) assert result[0].is_valid and not result[1].is_valid - + scalars = [ + pa.ExtensionScalar.from_storage(uuid_type, b"0123456789abcdef"), + pa.ExtensionScalar.from_storage(uuid_type, None), + ] + result = pa.array(scalars, type=uuid_type) + assert result.to_pylist() == [UUID(bytes=b"0123456789abcdef"), None] # Type inference without explicit type u = uuid4() scalars = [pa.scalar(u, type=pa.uuid()), None] From 483e8667f99c760548440a74c6290940f517925e Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Wed, 11 Mar 2026 18:31:36 +0100 Subject: [PATCH 5/9] Adjust all to use Result and remove mismatch test --- .../src/arrow/python/python_to_arrow.cc | 24 ++++++++++--------- python/pyarrow/tests/test_extension_type.py | 5 +--- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index d697cc8df883..ace3a1c5202b 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -672,8 +672,7 @@ class PyPrimitiveConverter< ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - ARROW_RETURN_NOT_OK( - this->primitive_builder_->AppendScalar(*storage_scalar)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -694,8 +693,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK( - this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -721,8 +720,8 @@ class PyPrimitiveConverter:: } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK( - this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -759,8 +758,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_RETURN_NOT_OK( - this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -804,7 +803,8 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + return this->value_builder_->AppendScalar(*storage_scalar, 1); } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); @@ -823,7 +823,8 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + return this->value_builder_->AppendScalar(*storage_scalar, 1); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); @@ -996,7 +997,8 @@ class PyStructConverter : public StructConverter } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - return this->struct_builder_->AppendScalar(GetStorageScalar(*scalar)); + ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); + return this->struct_builder_->AppendScalar(*storage_scalar); } switch (input_kind_) { case InputKind::DICT: diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index e1e66d08d083..bea06642b21c 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1485,10 +1485,7 @@ def bytes(self): with pytest.raises(RuntimeError, match="broken"): pa.scalar(bad) -def test_array_from_mismatched_extension_scalar_type(): - scalar = pa.scalar(b"1" * 16, type=ExampleUuidType()) - with pytest.raises(TypeError): - pa.array([scalar], type=ExampleUuidType2()) + def test_array_from_extension_scalars(): import datetime from decimal import Decimal From b220ebce096381b0b9f55378938c49cbe8a70c22 Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Thu, 12 Mar 2026 20:32:46 +0100 Subject: [PATCH 6/9] Formatting --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 1 + python/pyarrow/tests/test_extension_type.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index ace3a1c5202b..9200202e5bde 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -592,6 +592,7 @@ Result GetStorageScalar(const Scalar& scalar) { const auto& extension_scalar = checked_cast(scalar); return extension_scalar.value.get(); } + template class PyPrimitiveConverter; diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index bea06642b21c..57fae1e99652 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1535,12 +1535,15 @@ def test_array_from_extension_scalars(): pa.scalar(None, type=uuid_type)] result = pa.array(scalars, type=uuid_type) assert result[0].is_valid and not result[1].is_valid + scalars = [ pa.ExtensionScalar.from_storage(uuid_type, b"0123456789abcdef"), pa.ExtensionScalar.from_storage(uuid_type, None), ] result = pa.array(scalars, type=uuid_type) - assert result.to_pylist() == [UUID(bytes=b"0123456789abcdef"), None] + expected = pa.array([b"0123456789abcdef", None], type=uuid_type) + assert result.equals(expected) + # Type inference without explicit type u = uuid4() scalars = [pa.scalar(u, type=pa.uuid()), None] From 02387746bb6235ec73c6ee4f75f616af48b2f50e Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Thu, 12 Mar 2026 22:16:54 +0100 Subject: [PATCH 7/9] Remove redundant tests --- python/pyarrow/tests/test_extension_type.py | 31 +++++---------------- 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 57fae1e99652..775a098d33a3 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1488,46 +1488,29 @@ def bytes(self): def test_array_from_extension_scalars(): import datetime - from decimal import Decimal + # One case per C++ converter: FixedSizeBinary, Binary/String builtin_cases = [ (pa.uuid(), [b"0123456789abcdef"]), - (pa.bool8(), [0, 1]), - (pa.json_(pa.string()), ['{"a":1}', '{"b":2}']), (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"]), ] for ext_type, values in builtin_cases: scalars = [pa.scalar(v, type=ext_type) for v in values] result = pa.array(scalars, type=ext_type) - expected = pa.array(values, type=ext_type) - assert result.equals(expected) + assert result.equals(pa.array(values, type=ext_type)) - # Custom extension types requiring registration + # One case per C++ converter: Numeric, Timestamp/Duration, Struct custom_cases = [ - (TinyIntType(), [1, 2]), (IntegerType(), [100, 200]), - (LabelType(), ["a", "b"]), - (MyStructType(), [{"left": 1, "right": 2}]), (AnnotatedType(pa.timestamp("us"), "ts"), [datetime.datetime(2023, 1, 1)]), - (AnnotatedType(pa.duration("s"), "dur"), - [datetime.timedelta(seconds=100)]), - (AnnotatedType(pa.date32(), "date"), - [datetime.date(2023, 1, 1)]), - (AnnotatedType(pa.float64(), "f"), [1.5, 2.5]), - (AnnotatedType(pa.bool_(), "b"), [True, False]), - (AnnotatedType(pa.binary(), "bin"), [b"x", b"y"]), - (AnnotatedType(pa.decimal128(10, 2), "dec"), - [Decimal("1.50"), Decimal("2.75")]), - (AnnotatedType(pa.large_string(), "lstr"), ["hello", "world"]), - (AnnotatedType(pa.large_binary(), "lbin"), [b"ab", b"cd"]), + (MyStructType(), [{"left": 1, "right": 2}]), ] for ext_type, values in custom_cases: with registered_extension_type(ext_type): scalars = [pa.scalar(v, type=ext_type) for v in values] result = pa.array(scalars, type=ext_type) - expected = pa.array(values, type=ext_type) - assert result.equals(expected) + assert result.equals(pa.array(values, type=ext_type)) # Null handling uuid_type = pa.uuid() @@ -1536,6 +1519,7 @@ def test_array_from_extension_scalars(): result = pa.array(scalars, type=uuid_type) assert result[0].is_valid and not result[1].is_valid + # ExtensionScalar.from_storage path scalars = [ pa.ExtensionScalar.from_storage(uuid_type, b"0123456789abcdef"), pa.ExtensionScalar.from_storage(uuid_type, None), @@ -1555,8 +1539,7 @@ def test_array_from_extension_scalars(): # Mixed extension scalars and raw Python objects u1, u2 = uuid4(), uuid4() result = pa.array([pa.scalar(u1, type=pa.uuid()), u2], type=pa.uuid()) - expected = pa.array([u1, u2], type=pa.uuid()) - assert result.equals(expected) + assert result.equals(pa.array([u1, u2], type=pa.uuid())) def test_tensor_type(): From e47182d892d2f854dd344d478f8addd087556654 Mon Sep 17 00:00:00 2001 From: tadeja Date: Wed, 18 Mar 2026 11:59:50 +0100 Subject: [PATCH 8/9] Remove Result from GetStorageScalar --- .../src/arrow/python/python_to_arrow.cc | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 9200202e5bde..e7ce54abcd8f 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -585,12 +585,11 @@ class PyConverter : public Converter { }; // Helper function to unwrap extension scalar to its storage scalar -Result GetStorageScalar(const Scalar& scalar) { - if (scalar.type->id() != Type::EXTENSION) { - return &scalar; +const Scalar& GetStorageScalar(const Scalar& scalar) { + if (scalar.type->id() == Type::EXTENSION) { + return *checked_cast(scalar).value; } - const auto& extension_scalar = checked_cast(scalar); - return extension_scalar.value.get(); + return scalar; } template @@ -672,8 +671,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -694,8 +693,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_ASSIGN_OR_RAISE( auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); @@ -721,8 +720,8 @@ class PyPrimitiveConverter:: } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -759,8 +758,8 @@ class PyPrimitiveConverter< } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*storage_scalar)); + ARROW_RETURN_NOT_OK( + this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar))); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->primitive_type_, this->options_, value, view_)); @@ -804,8 +803,7 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - return this->value_builder_->AppendScalar(*storage_scalar, 1); + return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); } else { ARROW_ASSIGN_OR_RAISE(auto converted, PyValue::Convert(this->value_type_, this->options_, value)); @@ -824,8 +822,7 @@ class PyDictionaryConverter> } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - return this->value_builder_->AppendScalar(*storage_scalar, 1); + return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1); } else { ARROW_RETURN_NOT_OK( PyValue::Convert(this->value_type_, this->options_, value, view_)); @@ -998,8 +995,7 @@ class PyStructConverter : public StructConverter } else if (arrow::py::is_scalar(value)) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, arrow::py::unwrap_scalar(value)); - ARROW_ASSIGN_OR_RAISE(const Scalar* storage_scalar, GetStorageScalar(*scalar)); - return this->struct_builder_->AppendScalar(*storage_scalar); + return this->struct_builder_->AppendScalar(GetStorageScalar(*scalar)); } switch (input_kind_) { case InputKind::DICT: From f0a8428ef04fdeca2b4cdf5b99fee3374cbd3aa6 Mon Sep 17 00:00:00 2001 From: tadeja Date: Wed, 18 Mar 2026 12:05:45 +0100 Subject: [PATCH 9/9] Move stdlib import from test --- python/pyarrow/tests/test_extension_type.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 775a098d33a3..465b556876b4 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -16,6 +16,7 @@ # under the License. import contextlib +import datetime import os import shutil import subprocess @@ -1487,8 +1488,6 @@ def bytes(self): def test_array_from_extension_scalars(): - import datetime - # One case per C++ converter: FixedSizeBinary, Binary/String builtin_cases = [ (pa.uuid(), [b"0123456789abcdef"]),