From ce8e59611fb2e1017916523ae34907b98a909a0b Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Thu, 18 Jun 2026 15:30:30 -0400 Subject: [PATCH 1/2] feat(vortex-geo): unify ST_Distance on the geo crate + add Polygon Signed-off-by: Nemo Yu --- Cargo.lock | 146 +++++++++++ Cargo.toml | 1 + vortex-geo/Cargo.toml | 5 +- vortex-geo/src/extension/coordinate.rs | 89 ------- vortex-geo/src/extension/mod.rs | 50 ++++ vortex-geo/src/extension/point.rs | 75 ++++-- vortex-geo/src/extension/polygon.rs | 324 +++++++++++++++++++++++++ vortex-geo/src/lib.rs | 4 + vortex-geo/src/scalar_fn/distance.rs | 84 +++---- vortex-geo/src/test_harness.rs | 12 + vortex-geo/src/tests/point.rs | 2 +- 11 files changed, 629 insertions(+), 163 deletions(-) create mode 100644 vortex-geo/src/extension/polygon.rs diff --git a/Cargo.lock b/Cargo.lock index 693082a77f8..13d8f068cec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3441,6 +3441,16 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "earcutr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79127ed59a85d7687c409e9978547cffb7dc79675355ed22da6b66fd5f6ead01" +dependencies = [ + "itertools 0.11.0", + "num-traits", +] + [[package]] name = "educe" version = "0.6.0" @@ -3703,6 +3713,12 @@ dependencies = [ "zlib-rs", ] +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + [[package]] name = "fnv" version = "1.0.7" @@ -3920,6 +3936,24 @@ dependencies = [ "version_check", ] +[[package]] +name = "geo" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fc1a1678e54befc9b4bcab6cd43b8e7f834ae8ea121118b0fd8c42747675b4a" +dependencies = [ + "earcutr", + "float_next_after", + "geo-types", + "geographiclib-rs", + "i_overlay", + "log", + "num-traits", + "robust", + "rstar", + "spade", +] + [[package]] name = "geo-traits" version = "0.3.0" @@ -3937,6 +3971,8 @@ checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5" dependencies = [ "approx", "num-traits", + "rayon", + "rstar", "serde", ] @@ -3979,6 +4015,15 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "geographiclib-rs" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5a7f08910fd98737a6eda7568e7c5e645093e073328eeef49758cfe8b0489c7" +dependencies = [ + "libm", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -4101,6 +4146,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -4151,6 +4205,16 @@ dependencies = [ "foldhash 0.2.0", ] +[[package]] +name = "heapless" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" +dependencies = [ + "hash32", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.5.0" @@ -4312,6 +4376,49 @@ dependencies = [ "serde", ] +[[package]] +name = "i_float" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "010025c2c532c8d82e42d0b8bb5184afa449fa6f06c709ea9adcb16c49ae405b" +dependencies = [ + "libm", +] + +[[package]] +name = "i_key_sort" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9190f86706ca38ac8add223b2aed8b1330002b5cdbbce28fb58b10914d38fc27" + +[[package]] +name = "i_overlay" +version = "4.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413183068e6e0289e18d7d0a1f661b81546e6918d5453a44570b9ab30cbed1b3" +dependencies = [ + "i_float", + "i_key_sort", + "i_shape", + "i_tree", + "rayon", +] + +[[package]] +name = "i_shape" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ea154b742f7d43dae2897fcd5ead86bc7b5eefcedd305a7ebf9f69d44d61082" +dependencies = [ + "i_float", +] + +[[package]] +name = "i_tree" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35e6d558e6d4c7b82bc51d9c771e7a927862a161a7d87bf2b0541450e0e20915" + [[package]] name = "iana-time-zone" version = "0.1.65" @@ -4609,6 +4716,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -7557,6 +7673,23 @@ dependencies = [ "byteorder", ] +[[package]] +name = "robust" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e27ee8bb91ca0adcf0ecb116293afa12d393f9c2b9b9cd54d33e8078fe19839" + +[[package]] +name = "rstar" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.26.1" @@ -8215,6 +8348,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "spade" +version = "2.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9699399fd9349b00b184f5635b074f9ec93afffef30c853f8c875b32c0f8c7fa" +dependencies = [ + "hashbrown 0.16.1", + "num-traits", + "robust", + "smallvec", +] + [[package]] name = "sqllogictest" version = "0.29.1" @@ -9881,6 +10026,7 @@ version = "0.1.0" dependencies = [ "arrow-array", "arrow-schema", + "geo", "geo-traits", "geo-types", "geoarrow", diff --git a/Cargo.toml b/Cargo.toml index 8cf6e741a8b..4d04d36403e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -158,6 +158,7 @@ flatbuffers = "25.2.10" fsst-rs = "0.5.11" futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" +geo = "0.31.0" geo-traits = "0.3.0" geo-types = "0.7.19" geoarrow = "0.8.0" diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index 3b09cb77cf8..e2f7e4dc10f 100644 --- a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -16,6 +16,9 @@ version.workspace = true [dependencies] arrow-array = { workspace = true } arrow-schema = { workspace = true } +geo = { workspace = true } +geo-traits = { workspace = true } +geo-types = { workspace = true } geoarrow = { workspace = true } prost = { workspace = true } vortex-array = { workspace = true } @@ -24,8 +27,6 @@ vortex-session = { workspace = true } wkb = { workspace = true } [dev-dependencies] -geo-traits = { workspace = true } -geo-types = { workspace = true } rstest = { workspace = true } [lints] diff --git a/vortex-geo/src/extension/coordinate.rs b/vortex-geo/src/extension/coordinate.rs index 767a294e67e..b6e324cd457 100644 --- a/vortex-geo/src/extension/coordinate.rs +++ b/vortex-geo/src/extension/coordinate.rs @@ -15,13 +15,6 @@ use std::fmt::Display; use std::fmt::Formatter; use geoarrow::datatypes::Dimension as GeoArrowDimension; -use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; -use vortex_array::arrays::ExtensionArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::extension::ExtensionArrayExt; -use vortex_array::arrays::struct_::StructArrayExt; use vortex_array::dtype::DType; use vortex_array::dtype::FieldNames; use vortex_array::dtype::Nullability; @@ -196,74 +189,16 @@ pub(crate) fn coordinate_from_struct(scalar: &Scalar) -> VortexResult VortexResult { - match scalar.as_extension_opt() { - Some(ext_scalar) => coordinate_from_struct(&ext_scalar.to_storage_scalar()), - None => coordinate_from_struct(scalar), - } -} - -/// Validated, executed `x`/`y` columns of a point array. The bulk counterpart to [`Coordinate`]; -/// `z`/`m` are not executed. -pub(crate) struct ParsedCoordinates { - /// The flat `f64` `x` column. - pub(crate) xs: PrimitiveArray, - /// The flat `f64` `y` column. - pub(crate) ys: PrimitiveArray, -} - -/// Validate a point column's coordinate storage (layout and non-nullability) and execute its -/// `x`/`y` columns. -pub(crate) fn parse_storage( - points: &ArrayRef, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let storage = points - .clone() - .execute::(ctx)? - .storage_array() - .clone() - .execute::(ctx)?; - coordinate_dimension(storage.dtype())?; - vortex_ensure!( - !storage.dtype().is_nullable(), - "coordinate storage must be non-nullable to read unmasked ordinates, was {}", - storage.dtype() - ); - let xs = storage - .unmasked_field_by_name("x")? - .clone() - .execute::(ctx)?; - let ys = storage - .unmasked_field_by_name("y")? - .clone() - .execute::(ctx)?; - Ok(ParsedCoordinates { xs, ys }) -} - #[cfg(test)] mod tests { use rstest::rstest; - use vortex_array::IntoArray; - use vortex_array::VortexSessionExecute; - use vortex_array::arrays::ExtensionArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::arrays::StructArray; - use vortex_array::dtype::FieldNames; use vortex_array::dtype::Nullability; - use vortex_array::dtype::extension::ExtDType; - use vortex_array::validity::Validity; use vortex_error::VortexResult; use super::Coordinate; use super::Dimension; use super::coordinate_dimension; use super::coordinate_storage_dtype; - use super::parse_storage; - use crate::extension::GeoMetadata; - use crate::extension::Point; /// Each dimension round-trips through its field names and canonical storage dtype. #[rstest] @@ -296,28 +231,4 @@ mod tests { }; assert_eq!(coordinate.to_string(), expected); } - - /// [`parse_storage`] reads the coordinate fields unmasked, so a nullable point column must - /// be rejected at parse time rather than decoding null rows as garbage ordinates. - #[test] - fn parse_rejects_nullable_points() -> VortexResult<()> { - let session = vortex_array::array_session(); - let mut ctx = session.create_execution_ctx(); - - let storage = StructArray::try_new( - FieldNames::from(["x", "y"]), - vec![ - PrimitiveArray::from_iter(vec![1.0f64]).into_array(), - PrimitiveArray::from_iter(vec![2.0f64]).into_array(), - ], - 1, - Validity::AllValid, - )? - .into_array(); - let dtype = ExtDType::::try_new(GeoMetadata { crs: None }, storage.dtype().clone())?; - let points = ExtensionArray::new(dtype.erased(), storage).into_array(); - - assert!(parse_storage(&points, &mut ctx).is_err()); - Ok(()) - } } diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index cc3a1f6e532..684c83bade0 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -3,16 +3,66 @@ pub(crate) mod coordinate; mod point; +mod polygon; mod wkb; use std::fmt::Display; use std::sync::Arc; +use geo_types::Geometry; use geoarrow::datatypes::Crs; use geoarrow::datatypes::Metadata; pub use point::*; +pub use polygon::*; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; pub use wkb::*; +/// Decode a native geometry column to `geo_types`. A non-geometry operand is an error. +pub(crate) fn geometries( + array: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + let Some(ext) = array.dtype().as_extension_opt() else { + vortex_bail!( + "geo: operand is not a geometry extension type, was {}", + array.dtype() + ); + }; + let storage = array + .clone() + .execute::(ctx)? + .storage_array() + .clone(); + if ext.is::() { + point_geometries(&storage, ctx) + } else if ext.is::() { + polygon_geometries(&storage, ctx) + } else { + vortex_bail!("geo: unsupported geometry extension {}", array.dtype()) + } +} + +/// Decode a constant operand scalar to one geo geometry, a constant of any +/// supported geometry type is decoded exactly like a column. +pub(crate) fn single_geometry( + scalar: &Scalar, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + let array = ConstantArray::new(scalar.clone(), 1).into_array(); + geometries(&array, ctx)? + .pop() + .ok_or_else(|| vortex_err!("geo: constant operand decoded to no geometry")) +} + /// Extension metadata that is common to all the geospatial extension types. /// /// Currently, this is just the coordinate reference system (CRS). diff --git a/vortex-geo/src/extension/point.rs b/vortex-geo/src/extension/point.rs index 237249ad605..19e33c212f5 100644 --- a/vortex-geo/src/extension/point.rs +++ b/vortex-geo/src/extension/point.rs @@ -10,6 +10,9 @@ use arrow_array::ArrayRef as ArrowArrayRef; use arrow_schema::DataType; use arrow_schema::Field; use arrow_schema::extension::ExtensionType; +use geo_traits::to_geo::ToGeoGeometry; +use geo_types::Geometry; +use geoarrow::array::GeoArrowArrayAccessor; use geoarrow::array::IntoArrow; use geoarrow::array::PointArray; use geoarrow::datatypes::CoordType; @@ -28,6 +31,7 @@ use vortex_array::arrow::ArrowSession; use vortex_array::arrow::ArrowSessionExt; use vortex_array::arrow::FromArrowArray; use vortex_array::dtype::DType; +use vortex_array::dtype::arrow::FromArrowType; use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; @@ -92,6 +96,30 @@ fn point_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PointType { PointType::new(dimension.into(), geoarrow_metadata(geo_metadata)) } +/// Decode `Point` storage to `geo_types` points, for the geo scalar functions. +pub(crate) fn point_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + let point_type = point_type( + &GeoMetadata::default(), + coordinate_dimension(storage.dtype())?, + ); + let session = ctx.session().clone(); + let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; + let points = PointArray::try_from((arrow.as_ref(), point_type)) + .map_err(|e| vortex_err!("failed to construct PointArray: {e}"))?; + points + .iter() + .map(|geometry| -> VortexResult> { + Ok(geometry + .ok_or_else(|| vortex_err!("geo: null geometry is not supported"))? + .map_err(|e| vortex_err!("geo: geometry access failed: {e}"))? + .to_geometry()) + }) + .collect() +} + impl ArrowExportVTable for Point { fn arrow_ext_id(&self) -> Id { *ARROW_POINT @@ -166,25 +194,40 @@ impl ArrowImportVTable for Point { *ARROW_POINT } + /// Import a `geoarrow.point` field as the [`Point`] dtype. Keyed off the standard GeoArrow name, + /// so any producer (DataFusion, DuckDB, geoarrow-rs, …) resolves here. Accepts the full + /// `PointType` extension, or — for a metadata-less geometry literal — the name alone, inferring + /// the dimension from the coordinate field names. fn from_arrow_field(&self, field: &Field) -> VortexResult> { - let Ok(point_meta) = field.try_extension_type::() else { - return Ok(None); + let (dimension, metadata) = if let Ok(point_meta) = field.try_extension_type::() + { + vortex_ensure!( + point_meta.coord_type() == CoordType::Separated, + "geoarrow.point with interleaved coordinates is not supported; \ + re-encode with separated (struct) coordinates" + ); + ( + point_meta.dimension().into(), + geo_metadata_from_arrow(point_meta.metadata()), + ) + } else { + // Infer the dimension from the field names, not the canonical storage check: a literal's + // coordinate fields may be nullable, which that check rejects. + if field.extension_type_name() != Some(PointType::NAME) { + return Ok(None); + } + let DType::Struct(fields, _) = DType::from_arrow(field) else { + return Ok(None); + }; + let Ok(dimension) = Dimension::from_field_names(fields.names()) else { + return Ok(None); + }; + (dimension, GeoMetadata::default()) }; - vortex_ensure!( - point_meta.coord_type() == CoordType::Separated, - "geoarrow.point with interleaved coordinates is not supported; \ - re-encode with separated (struct) coordinates" - ); - let storage_dtype = - coordinate_storage_dtype(point_meta.dimension().into(), field.is_nullable().into()); + let storage_dtype = coordinate_storage_dtype(dimension, field.is_nullable().into()); Ok(Some(DType::Extension( - ExtDType::try_with_vtable( - Point, - geo_metadata_from_arrow(point_meta.metadata()), - storage_dtype, - )? - .erased(), + ExtDType::try_with_vtable(Point, metadata, storage_dtype)?.erased(), ))) } @@ -228,8 +271,8 @@ mod tests { use crate::extension::GeoMetadata; use crate::extension::coordinate::Coordinate; use crate::extension::coordinate::Dimension; - use crate::extension::coordinate::coordinate_from_scalar; use crate::extension::coordinate::coordinate_storage_dtype; + use crate::test_harness::coordinate_from_scalar; use crate::test_harness::point_column; fn geo_meta() -> GeoMetadata { diff --git a/vortex-geo/src/extension/polygon.rs b/vortex-geo/src/extension/polygon.rs new file mode 100644 index 00000000000..e441d8a9eed --- /dev/null +++ b/vortex-geo/src/extension/polygon.rs @@ -0,0 +1,324 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The [`Polygon`] geometry extension type (`vortex.geo.polygon`): rings of the [`Point`] +//! coordinate struct, stored as `List>>` and tagged with +//! [`GeoMetadata`] (CRS). The first ring is the exterior boundary; the rest are holes. + +use std::sync::Arc; + +use arrow_array::ArrayRef as ArrowArrayRef; +use arrow_schema::DataType; +use arrow_schema::Field; +use arrow_schema::extension::ExtensionType; +use geo_traits::to_geo::ToGeoGeometry; +use geo_types::Geometry; +use geoarrow::array::GeoArrowArrayAccessor; +use geoarrow::array::IntoArrow; +use geoarrow::array::PolygonArray; +use geoarrow::datatypes::CoordType; +use geoarrow::datatypes::PolygonType; +use prost::Message; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrow::ArrowExport; +use vortex_array::arrow::ArrowExportVTable; +use vortex_array::arrow::ArrowImport; +use vortex_array::arrow::ArrowImportVTable; +use vortex_array::arrow::ArrowSession; +use vortex_array::arrow::ArrowSessionExt; +use vortex_array::arrow::FromArrowArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::arrow::FromArrowType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::dtype::extension::ExtId; +use vortex_array::dtype::extension::ExtVTable; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_session::registry::CachedId; +use vortex_session::registry::Id; + +use super::GeoMetadata; +use super::coordinate::Dimension; +use super::coordinate::coordinate_dimension; +use super::coordinate::coordinate_storage_dtype; +use super::geo_metadata_from_arrow; +use super::geoarrow_metadata; + +/// A polygon: `geoarrow.polygon`, stored as `List>>` (rings of vertices). +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct Polygon; + +impl ExtVTable for Polygon { + type Metadata = GeoMetadata; + // No cheap owned value like Point's `Coordinate`; expose the raw storage scalar. + type NativeValue<'a> = &'a ScalarValue; + + fn id(&self) -> ExtId { + ExtId::new_static("vortex.geo.polygon") + } + + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + Ok(metadata.encode_to_vec()) + } + + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + Ok(GeoMetadata::decode(metadata)?) + } + + fn validate_dtype(ext_dtype: &ExtDType) -> VortexResult<()> { + polygon_dimension(ext_dtype.storage_dtype()).map(|_| ()) + } + + fn unpack_native<'a>( + _ext_dtype: &'a ExtDType, + storage_value: &'a ScalarValue, + ) -> VortexResult<&'a ScalarValue> { + Ok(storage_value) + } +} + +/// Canonical polygon storage: an outer list of rings, each a list of the coordinate `Struct`. +pub(crate) fn polygon_storage_dtype(dim: Dimension, nullability: Nullability) -> DType { + let coords = coordinate_storage_dtype(dim, Nullability::NonNullable); + let ring = DType::List(Arc::new(coords), Nullability::NonNullable); + DType::List(Arc::new(ring), nullability) +} + +/// Validate `dtype` is `List>` and return its [`Dimension`]. +pub(crate) fn polygon_dimension(dtype: &DType) -> VortexResult { + let DType::List(ring, _) = dtype else { + vortex_bail!("polygon storage must be a List of rings, was {dtype}"); + }; + let DType::List(coords, _) = ring.as_ref() else { + vortex_bail!("polygon ring storage must be a List of coordinates, was {ring}"); + }; + coordinate_dimension(coords) +} + +static ARROW_POLYGON: CachedId = CachedId::new(PolygonType::NAME); + +/// The `geoarrow.polygon` extension type for `dimension`, with separated (struct) coordinates +/// matching `Polygon` storage. +fn polygon_type(geo_metadata: &GeoMetadata, dimension: Dimension) -> PolygonType { + PolygonType::new(dimension.into(), geoarrow_metadata(geo_metadata)) +} + +/// Decode `Polygon` storage (`List>`) to `geo_types` polygons, for the geo scalar +/// functions. CRS does not affect planar geometry ops, so default metadata is used. +pub(crate) fn polygon_geometries( + storage: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult>> { + let polygon_type = polygon_type(&GeoMetadata::default(), polygon_dimension(storage.dtype())?); + let session = ctx.session().clone(); + let arrow = session.arrow().execute_arrow(storage.clone(), None, ctx)?; + let polygons = PolygonArray::try_from((arrow.as_ref(), polygon_type)) + .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}"))?; + polygons + .iter() + .map(|geometry| -> VortexResult> { + Ok(geometry + .ok_or_else(|| vortex_err!("geo: null geometry is not supported"))? + .map_err(|e| vortex_err!("geo: geometry access failed: {e}"))? + .to_geometry()) + }) + .collect() +} + +impl ArrowExportVTable for Polygon { + fn arrow_ext_id(&self) -> Id { + *ARROW_POLYGON + } + + fn vortex_id(&self) -> Id { + self.id() + } + + fn to_arrow_field( + &self, + name: &str, + dtype: &DType, + session: &ArrowSession, + ) -> VortexResult> { + let ext_type = dtype.as_extension(); + let geo_metadata = ext_type.metadata::(); + let dimension = polygon_dimension(ext_type.storage_dtype())?; + + let mut field = session.to_arrow_field(name, ext_type.storage_dtype())?; + field.try_with_extension_type(polygon_type(geo_metadata, dimension))?; + + Ok(Some(field)) + } + + fn execute_arrow( + &self, + array: ArrayRef, + target: &Field, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let is_polygon = array + .dtype() + .as_extension_opt() + .map(|ext| ext.is::()) + .unwrap_or(false); + if !is_polygon { + return Ok(ArrowExport::Unsupported(array)); + } + + let Ok(polygon_meta) = target.try_extension_type::() else { + return Ok(ArrowExport::Unsupported(array)); + }; + if polygon_meta.coord_type() != CoordType::Separated { + return Ok(ArrowExport::Unsupported(array)); + } + + let executed = array.execute::(ctx)?; + let storage = executed.storage_array().clone(); + + let storage_field = Field::new( + String::new(), + target.data_type().clone(), + target.is_nullable(), + ); + let session = ctx.session().clone(); + let arrow_storage = session + .arrow() + .execute_arrow(storage, Some(&storage_field), ctx)?; + + // Round-trip through GeoArrow's polygon array; `into_arrow` is concrete, so wrap in `Arc`. + let polygons = PolygonArray::try_from((arrow_storage.as_ref(), polygon_meta)) + .map_err(|e| vortex_err!("failed to construct PolygonArray: {e}"))?; + + Ok(ArrowExport::Exported(Arc::new(polygons.into_arrow()))) + } +} + +impl ArrowImportVTable for Polygon { + fn arrow_ext_id(&self) -> Id { + *ARROW_POLYGON + } + + /// Import a `geoarrow.polygon` field as the [`Polygon`] dtype. Keyed off the standard GeoArrow + /// name, so any producer (DataFusion, DuckDB, geoarrow-rs, …) resolves here. Accepts the full + /// `PolygonType` extension, or — for a metadata-less geometry literal — the name alone, inferring + /// the dimension from the coordinate field names. + fn from_arrow_field(&self, field: &Field) -> VortexResult> { + let (dimension, metadata) = + if let Ok(polygon_meta) = field.try_extension_type::() { + vortex_ensure!( + polygon_meta.coord_type() == CoordType::Separated, + "geoarrow.polygon with interleaved coordinates is not supported; \ + re-encode with separated (struct) coordinates" + ); + ( + polygon_meta.dimension().into(), + geo_metadata_from_arrow(polygon_meta.metadata()), + ) + } else { + // Infer the dimension from the field names, not the canonical storage check: a literal's + // coordinate fields may be nullable, which that check rejects. Peel the two `List` layers + // (polygon → rings → coordinates) to reach the struct. + if field.extension_type_name() != Some(PolygonType::NAME) { + return Ok(None); + } + let DType::List(ring, _) = DType::from_arrow(field) else { + return Ok(None); + }; + let DType::List(coords, _) = ring.as_ref() else { + return Ok(None); + }; + let DType::Struct(fields, _) = coords.as_ref() else { + return Ok(None); + }; + let Ok(dimension) = Dimension::from_field_names(fields.names()) else { + return Ok(None); + }; + (dimension, GeoMetadata::default()) + }; + + let storage_dtype = polygon_storage_dtype(dimension, field.is_nullable().into()); + Ok(Some(DType::Extension( + ExtDType::try_with_vtable(Polygon, metadata, storage_dtype)?.erased(), + ))) + } + + fn from_arrow_array( + &self, + array: ArrowArrayRef, + field: &Field, + dtype: &DType, + ) -> VortexResult { + let Some(ext_dtype) = dtype.as_extension_opt() else { + return Ok(ArrowImport::Unsupported(array)); + }; + if !ext_dtype.is::() + || field.try_extension_type::().is_err() + || !matches!(array.data_type(), DataType::List(_)) + { + return Ok(ArrowImport::Unsupported(array)); + } + + let storage = ArrayRef::from_arrow(array.as_ref(), field.is_nullable())?; + Ok(ArrowImport::Imported( + ExtensionArray::try_new(ext_dtype.clone(), storage)?.into_array(), + )) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_array::dtype::extension::ExtDType; + use vortex_error::VortexResult; + + use super::Polygon; + use super::polygon_storage_dtype; + use crate::extension::GeoMetadata; + use crate::extension::coordinate::Dimension; + use crate::extension::coordinate::coordinate_storage_dtype; + + fn geo_meta() -> GeoMetadata { + GeoMetadata { + crs: Some("EPSG:4326".to_string()), + } + } + + /// `Polygon` accepts the canonical `List>` storage of every dimension. + #[rstest] + #[case::xy(Dimension::Xy)] + #[case::xyz(Dimension::Xyz)] + #[case::xym(Dimension::Xym)] + #[case::xyzm(Dimension::Xyzm)] + fn polygon_validates_every_dimension(#[case] dim: Dimension) -> VortexResult<()> { + let storage = polygon_storage_dtype(dim, Nullability::NonNullable); + ExtDType::::try_new(geo_meta(), storage)?; + Ok(()) + } + + /// Non-polygon storage is rejected at dtype construction: a bare struct (point) and a single + /// list (linestring) both fail. + #[test] + fn polygon_rejects_invalid_storage() -> VortexResult<()> { + let primitive = DType::Primitive(PType::F64, Nullability::NonNullable); + assert!(ExtDType::::try_new(geo_meta(), primitive).is_err()); + + // A single list of coordinates is a LineString, not a Polygon. + let coords = coordinate_storage_dtype(Dimension::Xy, Nullability::NonNullable); + let line = DType::List(Arc::new(coords), Nullability::NonNullable); + assert!(ExtDType::::try_new(geo_meta(), line).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index d47b0976bc6..951d93b7b4f 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -9,6 +9,7 @@ use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; use crate::extension::Point; +use crate::extension::Polygon; use crate::extension::WellKnownBinary; use crate::scalar_fn::distance::GeoDistance; @@ -28,6 +29,9 @@ pub fn initialize(session: &VortexSession) { session.dtypes().register(Point); session.arrow().register_exporter(Arc::new(Point)); session.arrow().register_importer(Arc::new(Point)); + session.dtypes().register(Polygon); + session.arrow().register_exporter(Arc::new(Polygon)); + session.arrow().register_importer(Arc::new(Polygon)); // Register the geometry scalar functions. session.scalar_fns().register(GeoDistance); diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs index 7f222cb763a..feb7ea833aa 100644 --- a/vortex-geo/src/scalar_fn/distance.rs +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -1,8 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Straight-line (Euclidean) distance between points; "planar" distance in GIS terms. +//! `ST_Distance`: planar (Euclidean) distance between two native geometries via the `geo` crate. +use geo::Distance; +use geo::Euclidean; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; @@ -22,29 +24,20 @@ use vortex_array::scalar_fn::ScalarFnId; use vortex_array::scalar_fn::ScalarFnVTable; use vortex_array::scalar_fn::TypedScalarFnInstance; use vortex_error::VortexResult; +use vortex_error::vortex_ensure; use vortex_session::VortexSession; -use crate::extension::coordinate::coordinate_from_scalar; -use crate::extension::coordinate::parse_storage; +use crate::extension::geometries; +use crate::extension::single_geometry; -/// Straight-line (L2) distance between `(ax, ay)` and `(bx, by)`. -fn euclidean_distance(ax: f64, ay: f64, bx: f64, by: f64) -> f64 { - let dx = ax - bx; - let dy = ay - by; - (dx * dx + dy * dy).sqrt() -} - -/// Straight-line (Euclidean) distance between two point operands — "planar" distance in GIS terms -/// (e.g. PostGIS `ST_Distance`). No geodesic correction, and `z`/`m` are ignored. -/// -/// The operands are two point columns of equal length; either (or both) may be constant, in which -/// case the constant query point is decoded once and broadcast. +/// Planar (Euclidean) `ST_Distance` (no geodesic correction) between two native geometry operands. +/// Each is a column or a constant literal; `geo` computes the distance between each pair. #[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] pub struct GeoDistance; impl GeoDistance { - /// A lazy `ScalarFnArray` computing the per-row distance between the point columns `a` and - /// `b`; either may be constant. The output length is taken from `a`. + /// A lazy `ScalarFnArray` computing the per-row distance between operands `a` and `b`; either may + /// be constant. The output length is taken from `a`. pub fn try_new_array(a: ArrayRef, b: ArrayRef) -> VortexResult { ScalarFnArray::try_new( TypedScalarFnInstance::new(GeoDistance, EmptyOptions).erased(), @@ -94,9 +87,9 @@ impl ScalarFnVTable for GeoDistance { let b = args.get(1)?; match (a.as_opt::(), b.as_opt::()) { (Some(qa), Some(qb)) => { - let qa = coordinate_from_scalar(qa.scalar())?; - let qb = coordinate_from_scalar(qb.scalar())?; - let distance = euclidean_distance(qa.x, qa.y, qb.x, qb.y); + let ga = single_geometry(qa.scalar(), ctx)?; + let gb = single_geometry(qb.scalar(), ctx)?; + let distance = Euclidean.distance(&ga, &gb); Ok(ConstantArray::new( Scalar::primitive(distance, Nullability::NonNullable), a.len(), @@ -106,42 +99,31 @@ impl ScalarFnVTable for GeoDistance { (Some(query), None) => distances_to_constant(&b, query.scalar(), ctx), (None, Some(query)) => distances_to_constant(&a, query.scalar(), ctx), (None, None) => { - let a_coords = parse_storage(&a, ctx)?; - let b_coords = parse_storage(&b, ctx)?; - let distances = a_coords - .xs - .as_slice::() - .iter() - .zip(a_coords.ys.as_slice::()) - .zip( - b_coords - .xs - .as_slice::() - .iter() - .zip(b_coords.ys.as_slice::()), - ) - .map(|((&ax, &ay), (&bx, &by))| euclidean_distance(ax, ay, bx, by)); + let ag = geometries(&a, ctx)?; + let bg = geometries(&b, ctx)?; + vortex_ensure!( + ag.len() == bg.len(), + "geo distance: operand length mismatch {} vs {}", + ag.len(), + bg.len() + ); + let distances = ag.iter().zip(&bg).map(|(x, y)| Euclidean.distance(x, y)); Ok(PrimitiveArray::from_iter(distances).into_array()) } } } } -/// Distance from each row of `points` to a constant `query` point, decoded once and broadcast. +/// Distance from each row of `operand` to a constant `query` geometry, decoded once and broadcast. /// Distance is symmetric, so this serves a constant on either side. fn distances_to_constant( - points: &ArrayRef, + operand: &ArrayRef, query: &Scalar, ctx: &mut ExecutionCtx, ) -> VortexResult { - let query = coordinate_from_scalar(query)?; - let coords = parse_storage(points, ctx)?; - let distances = coords - .xs - .as_slice::() - .iter() - .zip(coords.ys.as_slice::()) - .map(|(&x, &y)| euclidean_distance(x, y, query.x, query.y)); + let query = single_geometry(query, ctx)?; + let geoms = geometries(operand, ctx)?; + let distances = geoms.iter().map(|g| Euclidean.distance(g, &query)); Ok(PrimitiveArray::from_iter(distances).into_array()) } @@ -156,7 +138,6 @@ mod tests { use vortex_error::VortexResult; use super::GeoDistance; - use super::euclidean_distance; use crate::test_harness::point_column; /// A constant `Point` column of length `len`, every row at `(x, y)`. @@ -179,15 +160,8 @@ mod tests { .to_vec()) } - /// The kernel computes straight-line distance (the 3–4–5 triangle). - #[test] - fn euclidean_distance_is_straight_line() { - assert_eq!(euclidean_distance(0.0, 0.0, 3.0, 4.0), 5.0); - assert_eq!(euclidean_distance(1.5, -1.5, 1.5, -1.5), 0.0); - } - - /// `GeoDistance` returns the per-row distance between two point columns (here the second is a - /// constant query point). + /// `GeoDistance` returns the per-row distance between a point column and a constant query point + /// (3–4–5 triangles), computed via the geo crate. #[test] fn distance_over_points() -> VortexResult<()> { let session = vortex_array::array_session(); diff --git a/vortex-geo/src/test_harness.rs b/vortex-geo/src/test_harness.rs index 9f066fb4d14..2e9e7f43c27 100644 --- a/vortex-geo/src/test_harness.rs +++ b/vortex-geo/src/test_harness.rs @@ -9,10 +9,13 @@ use vortex_array::arrays::ExtensionArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; use vortex_array::dtype::extension::ExtDType; +use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use crate::extension::GeoMetadata; use crate::extension::Point; +use crate::extension::coordinate::Coordinate; +use crate::extension::coordinate::coordinate_from_struct; /// A `Point` column (CRS `EPSG:4326`) over the given x/y coordinates. pub(crate) fn point_column(xs: Vec, ys: Vec) -> VortexResult { @@ -27,3 +30,12 @@ pub(crate) fn point_column(xs: Vec, ys: Vec) -> VortexResult let dtype = ExtDType::::try_new(metadata, storage.dtype().clone())?; Ok(ExtensionArray::new(dtype.erased(), storage).into_array()) } + +/// Decode a [`Coordinate`] from an extension-typed point scalar (unwrapped to its coordinate +/// storage) or a bare coordinate `Struct` scalar — used to read back a single point in assertions. +pub(crate) fn coordinate_from_scalar(scalar: &Scalar) -> VortexResult { + match scalar.as_extension_opt() { + Some(ext_scalar) => coordinate_from_struct(&ext_scalar.to_storage_scalar()), + None => coordinate_from_struct(scalar), + } +} diff --git a/vortex-geo/src/tests/point.rs b/vortex-geo/src/tests/point.rs index 074ef97f0d3..ff74b01ba01 100644 --- a/vortex-geo/src/tests/point.rs +++ b/vortex-geo/src/tests/point.rs @@ -29,7 +29,7 @@ use vortex_error::vortex_err; use super::SESSION; use crate::extension::Point; use crate::extension::coordinate::Coordinate; -use crate::extension::coordinate::coordinate_from_scalar; +use crate::test_harness::coordinate_from_scalar; use crate::test_harness::point_column; /// A `geoarrow.point` Arrow field with separated (struct) XY coordinates. From 242ee4c78fa9325cae600e8363a0aa5da0920fec Mon Sep 17 00:00:00 2001 From: Nemo Yu Date: Thu, 18 Jun 2026 15:46:06 -0400 Subject: [PATCH 2/2] fix: linters and tests Signed-off-by: Nemo Yu --- vortex-geo/src/extension/polygon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-geo/src/extension/polygon.rs b/vortex-geo/src/extension/polygon.rs index e441d8a9eed..fc06ce59bd3 100644 --- a/vortex-geo/src/extension/polygon.rs +++ b/vortex-geo/src/extension/polygon.rs @@ -1,8 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! The [`Polygon`] geometry extension type (`vortex.geo.polygon`): rings of the [`Point`] -//! coordinate struct, stored as `List>>` and tagged with +//! The [`Polygon`] geometry extension type (`vortex.geo.polygon`): rings of the +//! [`Point`](super::Point) coordinate struct, stored as `List>>` and tagged with //! [`GeoMetadata`] (CRS). The first ring is the exterior boundary; the rest are holes. use std::sync::Arc;