diff --git a/Cargo.lock b/Cargo.lock index b6f67682c4a..4bc788f48d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10269,6 +10269,7 @@ dependencies = [ "geo-types", "geoarrow", "prost 0.14.4", + "rstest", "vortex-array", "vortex-error", "vortex-session", diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml index 4e76e2eefcb..3b09cb77cf8 100644 --- a/vortex-geo/Cargo.toml +++ b/vortex-geo/Cargo.toml @@ -26,6 +26,7 @@ wkb = { workspace = true } [dev-dependencies] geo-traits = { workspace = true } geo-types = { workspace = true } +rstest = { workspace = true } [lints] workspace = true diff --git a/vortex-geo/src/extension/geometry/coordinate.rs b/vortex-geo/src/extension/geometry/coordinate.rs new file mode 100644 index 00000000000..6fc555a6bab --- /dev/null +++ b/vortex-geo/src/extension/geometry/coordinate.rs @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Coordinate building blocks for the [`Geometry`](crate::extension::Geometry) extension type: +//! the `Struct` storage, its [`Dimension`], and the decoded +//! [`Coordinate`] value. +//! +//! The coordinate fields, where `{}` marks a field that may be absent (every field present is +//! non-nullable), are: +//! - `x` — longitude or easting +//! - `y` — latitude or northing +//! - `z` — elevation +//! - `m` — measure: an arbitrary per-point value such as distance along a route or a timestamp + +use std::fmt::Display; +use std::fmt::Formatter; + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::extension::ExtensionArrayExt; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; + +use crate::extension::Geometry; +use crate::extension::GeometryKind; + +/// Coordinate dimensions, matching GeoArrow. Field order is fixed: `x`, `y`, then `z` before `m`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum Dimension { + /// 2D: `x`, `y`. + Xy, + /// 3D with elevation: `x`, `y`, `z`. + Xyz, + /// 3D with a measure: `x`, `y`, `m`. + Xym, + /// 4D: `x`, `y`, `z`, `m`. + Xyzm, +} + +impl Dimension { + /// Recover the dimension from a coordinate's field names, in GeoArrow order. + pub(crate) fn from_field_names(names: &FieldNames) -> VortexResult { + let mut strs = [""; 4]; + vortex_ensure!( + names.len() <= strs.len(), + "not a valid GeoArrow coordinate dimension: {names:?}" + ); + for (slot, name) in strs.iter_mut().zip(names.iter()) { + *slot = name.as_ref(); + } + Ok(match &strs[..names.len()] { + ["x", "y"] => Dimension::Xy, + ["x", "y", "z"] => Dimension::Xyz, + ["x", "y", "m"] => Dimension::Xym, + ["x", "y", "z", "m"] => Dimension::Xyzm, + _ => vortex_bail!("not a valid GeoArrow coordinate dimension: {names:?}"), + }) + } +} + +/// A decoded coordinate. `z`/`m` are `Some` iff the storage dimension includes them. +/// +/// This is the native value produced when unpacking a point-kind +/// [`Geometry`](crate::extension::Geometry) scalar; the rest of the coordinate machinery is +/// crate-internal. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Coordinate { + /// The x (longitude/easting) ordinate. + pub x: f64, + /// The y (latitude/northing) ordinate. + pub y: f64, + /// The optional `z` (elevation) ordinate. + pub z: Option, + /// The optional `m` (measure) ordinate. + pub m: Option, +} + +impl Coordinate { + /// A 2D coordinate (`z`/`m` unset). + pub fn xy(x: f64, y: f64) -> Self { + Coordinate { + x, + y, + z: None, + m: None, + } + } +} + +impl Display for Coordinate { + fn fmt(&self, fmt: &mut Formatter<'_>) -> std::fmt::Result { + match (self.z, self.m) { + (None, None) => write!(fmt, "POINT({} {})", self.x, self.y), + (Some(z), None) => write!(fmt, "POINT Z ({} {} {})", self.x, self.y, z), + (None, Some(m)) => write!(fmt, "POINT M ({} {} {})", self.x, self.y, m), + (Some(z), Some(m)) => write!(fmt, "POINT ZM ({} {} {} {})", self.x, self.y, z, m), + } + } +} + +/// Validate that `dtype` is a coordinate struct of non-nullable `f64` fields, returning its +/// [`Dimension`]. Any of the four GeoArrow dimensions validates. +pub(crate) fn coordinate_dimension(dtype: &DType) -> VortexResult { + let DType::Struct(fields, _) = dtype else { + vortex_bail!("coordinate storage must be a Struct, was {dtype}"); + }; + for (name, field) in fields.names().iter().zip(fields.fields()) { + vortex_ensure!( + matches!( + field, + DType::Primitive(PType::F64, Nullability::NonNullable) + ), + "coordinate field {name} must be non-nullable f64, was {field}" + ); + } + Dimension::from_field_names(fields.names()) +} + +/// Decode a [`Coordinate`] from a coordinate `Struct` scalar (`z`/`m` read iff +/// present, so the same decoder serves every dimension). +pub(crate) fn coordinate_from_struct(scalar: &Scalar) -> VortexResult { + let fields = scalar.as_struct(); + let required = |name: &str| -> VortexResult { + f64::try_from( + &fields + .field(name) + .ok_or_else(|| vortex_err!("coordinate missing {name}"))?, + ) + }; + let optional = |name: &str| -> VortexResult> { + fields + .field(name) + .map(|value| f64::try_from(&value)) + .transpose() + }; + Ok(Coordinate { + x: required("x")?, + y: required("y")?, + z: optional("z")?, + m: optional("m")?, + }) +} + +/// Decode a [`Coordinate`] from an extension-typed point scalar (unwrapped to its coordinate +/// storage) or a bare coordinate `Struct` scalar. The per-row decode used by the distance fns. +pub(crate) fn coordinate_from_scalar(scalar: &Scalar) -> VortexResult { + match scalar.as_extension_opt() { + Some(ext_scalar) => coordinate_from_struct(&ext_scalar.to_storage_scalar()), + None => coordinate_from_struct(scalar), + } +} + +/// Validated, executed `x`/`y` columns of a point array. The bulk counterpart to [`Coordinate`]; +/// `z`/`m` are not executed. +pub(crate) struct ParsedCoordinates { + /// The flat `f64` `x` column. + pub(crate) xs: PrimitiveArray, + /// The flat `f64` `y` column. + pub(crate) ys: PrimitiveArray, +} + +/// Validate a point column's geometry kind and coordinate storage (layout and non-nullability), +/// then execute its `x`/`y` columns. +pub(crate) fn parse_storage( + points: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + if let Some(ext) = points.dtype().as_extension_opt() + && ext.is::() + { + let kind = ext.metadata::().kind()?; + vortex_ensure!( + kind == GeometryKind::Point, + "expected a point column, was {kind}" + ); + } + let storage = points + .clone() + .execute::(ctx)? + .storage_array() + .clone() + .execute::(ctx)?; + coordinate_dimension(storage.dtype())?; + vortex_ensure!( + !storage.dtype().is_nullable(), + "coordinate storage must be non-nullable to read unmasked ordinates, was {}", + storage.dtype() + ); + let xs = storage + .unmasked_field_by_name("x")? + .clone() + .execute::(ctx)?; + let ys = storage + .unmasked_field_by_name("y")? + .clone() + .execute::(ctx)?; + Ok(ParsedCoordinates { xs, ys }) +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::FieldNames; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::Coordinate; + use super::parse_storage; + use crate::extension::Geometry; + use crate::extension::GeometryKind; + + /// Display emits WKT, including `z`/`m` when present. + #[test] + fn display_is_wkt() { + let coordinate = |z, m| Coordinate { + x: 1.0, + y: 2.0, + z, + m, + }; + assert_eq!(coordinate(None, None).to_string(), "POINT(1 2)"); + assert_eq!(coordinate(Some(3.0), None).to_string(), "POINT Z (1 2 3)"); + assert_eq!(coordinate(None, Some(4.0)).to_string(), "POINT M (1 2 4)"); + assert_eq!( + coordinate(Some(3.0), Some(4.0)).to_string(), + "POINT ZM (1 2 3 4)" + ); + } + + /// [`parse_storage`] reads the coordinate fields unmasked, so a nullable point column must + /// be rejected at parse time rather than decoding null rows as garbage ordinates. + #[test] + fn parse_rejects_nullable_points() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let storage = StructArray::try_new( + FieldNames::from(["x", "y"]), + vec![ + PrimitiveArray::from_iter(vec![1.0f64]).into_array(), + PrimitiveArray::from_iter(vec![2.0f64]).into_array(), + ], + 1, + Validity::AllValid, + )? + .into_array(); + let dtype = Geometry::dtype(GeometryKind::Point, None, storage.dtype().clone())?; + let points = ExtensionArray::new(dtype.erased(), storage).into_array(); + + assert!(parse_storage(&points, &mut ctx).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/extension/geometry/mod.rs b/vortex-geo/src/extension/geometry/mod.rs new file mode 100644 index 00000000000..2a1604c4e07 --- /dev/null +++ b/vortex-geo/src/extension/geometry/mod.rs @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The [`Geometry`] extension type (`vortex.geo.geometry`): one logical type for all +//! GeoArrow-native geometry kinds. + +pub(crate) mod coordinate; + +use std::fmt::Display; +use std::fmt::Formatter; + +use prost::Message; +use vortex_array::dtype::DType; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::dtype::extension::ExtId; +use vortex_array::dtype::extension::ExtVTable; +use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; + +use self::coordinate::Coordinate; +use self::coordinate::Dimension; +use self::coordinate::coordinate_dimension; +use self::coordinate::coordinate_from_struct; +use super::GeoMetadata; +use super::GeometryKind; + +/// The `vortex.geo.geometry` extension type: native geometry of a single [`GeometryKind`]. +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct Geometry; + +impl Geometry { + /// The extension dtype for a column of `kind` geometries over `storage`. + pub fn dtype( + kind: GeometryKind, + crs: Option, + storage: DType, + ) -> VortexResult> { + ExtDType::try_new( + GeoMetadata { + crs, + geometry_type: kind as i32, + }, + storage, + ) + } + + /// The [`GeometryKind`] of a geometry-typed `dtype`; errors on non-geometry dtypes. + pub fn kind_of(dtype: &DType) -> VortexResult { + let Some(ext) = dtype.as_extension_opt() else { + vortex_bail!("expected a geometry column, was {dtype}"); + }; + vortex_ensure!( + ext.is::(), + "expected a geometry column, was {dtype}" + ); + ext.metadata::().kind() + } +} + +/// Validate that `storage` is `kind`'s GeoArrow layout. +/// +/// Only point columns are supported end to end; other kinds are rejected here until their +/// scalar unpacking and kernels exist, so that a valid dtype never produces arrays whose +/// scalars fail to unpack. +fn validate_storage(kind: GeometryKind, storage: &DType) -> VortexResult { + match kind { + GeometryKind::Unspecified => { + vortex_bail!("geometry kind must be specified; mixed columns are not yet supported") + } + GeometryKind::Point => coordinate_dimension(storage), + kind => vortex_bail!("{kind} geometry columns are not yet supported"), + } +} + +impl ExtVTable for Geometry { + type Metadata = GeoMetadata; + type NativeValue<'a> = GeometryValue; + + fn id(&self) -> ExtId { + ExtId::new_static("vortex.geo.geometry") + } + + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + Ok(metadata.encode_to_vec()) + } + + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + Ok(GeoMetadata::decode(metadata)?) + } + + fn validate_dtype(ext_dtype: &ExtDType) -> VortexResult<()> { + validate_storage(ext_dtype.metadata().kind()?, ext_dtype.storage_dtype()).map(|_| ()) + } + + fn unpack_native<'a>( + ext_dtype: &'a ExtDType, + storage_value: &'a ScalarValue, + ) -> VortexResult { + match ext_dtype.metadata().kind()? { + GeometryKind::Point => { + let storage = Scalar::try_new( + ext_dtype.storage_dtype().clone(), + Some(storage_value.clone()), + )?; + coordinate_from_struct(&storage).map(GeometryValue::Point) + } + kind => vortex_bail!("unpacking {kind} scalars is not yet implemented"), + } + } +} + +/// A decoded native geometry scalar value. +#[derive(Debug, Clone, PartialEq)] +#[non_exhaustive] +pub enum GeometryValue { + /// A single coordinate. + Point(Coordinate), + /// A line of coordinates. + LineString(Vec), + /// An outer ring plus zero or more interior rings (holes). + Polygon(Vec>), + /// A collection of points. + MultiPoint(Vec), + /// A collection of linestrings. + MultiLineString(Vec>), + /// A collection of polygons. + MultiPolygon(Vec>>), +} + +/// The WKT dimension tag (`" Z"`, `" M"`, `" ZM"`, or empty), read from `first`. +fn dimension_tag(first: Option<&Coordinate>) -> &'static str { + match first.map(|coordinate| (coordinate.z, coordinate.m)) { + None | Some((None, None)) => "", + Some((Some(_), None)) => " Z", + Some((None, Some(_))) => " M", + Some((Some(_), Some(_))) => " ZM", + } +} + +/// Write the bare ordinates of one coordinate: `x y {z} {m}`. +fn fmt_ordinates(f: &mut Formatter<'_>, coordinate: &Coordinate) -> std::fmt::Result { + write!(f, "{} {}", coordinate.x, coordinate.y)?; + if let Some(z) = coordinate.z { + write!(f, " {z}")?; + } + if let Some(m) = coordinate.m { + write!(f, " {m}")?; + } + Ok(()) +} + +/// Write `(…, …)`, formatting each element with `item`. +fn fmt_seq( + f: &mut Formatter<'_>, + items: &[T], + item: impl Fn(&mut Formatter<'_>, &T) -> std::fmt::Result, +) -> std::fmt::Result { + write!(f, "(")?; + for (idx, value) in items.iter().enumerate() { + if idx > 0 { + write!(f, ", ")?; + } + item(f, value)?; + } + write!(f, ")") +} + +/// Write a WKT body: ` EMPTY` if empty, otherwise ` {tag} (…)`. +fn fmt_body( + f: &mut Formatter<'_>, + items: &[T], + first: Option<&Coordinate>, + item: impl Fn(&mut Formatter<'_>, &T) -> std::fmt::Result, +) -> std::fmt::Result { + if items.is_empty() { + return write!(f, " EMPTY"); + } + write!(f, "{} ", dimension_tag(first))?; + fmt_seq(f, items, item) +} + +impl Display for GeometryValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let coords = + |f: &mut Formatter<'_>, line: &Vec| fmt_seq(f, line, fmt_ordinates); + let rings = |f: &mut Formatter<'_>, rings: &Vec>| fmt_seq(f, rings, coords); + match self { + GeometryValue::Point(coordinate) => Display::fmt(coordinate, f), + GeometryValue::LineString(line) => { + write!(f, "LINESTRING")?; + fmt_body(f, line, line.first(), fmt_ordinates) + } + GeometryValue::MultiPoint(points) => { + write!(f, "MULTIPOINT")?; + fmt_body(f, points, points.first(), fmt_ordinates) + } + GeometryValue::Polygon(polygon) => { + write!(f, "POLYGON")?; + let first = polygon.first().and_then(|ring| ring.first()); + fmt_body(f, polygon, first, coords) + } + GeometryValue::MultiLineString(lines) => { + write!(f, "MULTILINESTRING")?; + let first = lines.first().and_then(|line| line.first()); + fmt_body(f, lines, first, coords) + } + GeometryValue::MultiPolygon(polygons) => { + write!(f, "MULTIPOLYGON")?; + let first = polygons + .first() + .and_then(|polygon| polygon.first()) + .and_then(|ring| ring.first()); + fmt_body(f, polygons, first, rings) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::FieldNames; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_array::dtype::StructFields; + use vortex_array::session::ArraySession; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::Geometry; + use super::GeometryValue; + use super::coordinate::Coordinate; + use super::coordinate::Dimension; + use super::coordinate::coordinate_dimension; + use super::coordinate::coordinate_from_scalar; + use crate::extension::GeometryKind; + + /// A coordinate storage dtype with the given field names, non-nullable `f64` per field. + fn coordinate_dtype(names: &[&'static str]) -> DType { + let fields = std::iter::repeat_n( + DType::Primitive(PType::F64, Nullability::NonNullable), + names.len(), + ) + .collect::>(); + DType::Struct( + StructFields::new(FieldNames::from(names), fields), + Nullability::NonNullable, + ) + } + + /// `storage` wrapped in `depth` non-nullable `List` layers. + fn nested_list(storage: DType, depth: usize) -> DType { + let mut dtype = storage; + for _ in 0..depth { + dtype = DType::List(Arc::new(dtype), Nullability::NonNullable); + } + dtype + } + + /// All four GeoArrow dimensions validate as point storage and round-trip via field names. + #[test] + fn point_validates_every_dimension() -> VortexResult<()> { + let cases = [ + (Dimension::Xy, ["x", "y"].as_slice()), + (Dimension::Xyz, ["x", "y", "z"].as_slice()), + (Dimension::Xym, ["x", "y", "m"].as_slice()), + (Dimension::Xyzm, ["x", "y", "z", "m"].as_slice()), + ]; + for (dim, names) in cases { + let storage = coordinate_dtype(names); + assert_eq!(coordinate_dimension(&storage)?, dim); + Geometry::dtype(GeometryKind::Point, Some("EPSG:4326".to_string()), storage)?; + } + Ok(()) + } + + /// Non-point kinds are rejected at dtype construction — even with their correct GeoArrow + /// layout — until their scalar unpacking and kernels exist. + #[rstest] + #[case(GeometryKind::LineString, 1)] + #[case(GeometryKind::MultiPoint, 1)] + #[case(GeometryKind::Polygon, 2)] + #[case(GeometryKind::MultiLineString, 2)] + #[case(GeometryKind::MultiPolygon, 3)] + fn non_point_kinds_are_rejected(#[case] kind: GeometryKind, #[case] depth: usize) { + let storage = nested_list(coordinate_dtype(&["x", "y"]), depth); + assert!(Geometry::dtype(kind, None, storage).is_err()); + } + + /// Construction rejects non-struct storage, non-coordinate fields, and the `Unspecified` + /// kind. + #[test] + fn rejects_invalid_storage() -> VortexResult<()> { + let primitive = DType::Primitive(PType::F64, Nullability::NonNullable); + assert!(Geometry::dtype(GeometryKind::Point, None, primitive).is_err()); + + let wrong_fields = StructArray::from_fields(&[ + ("a", PrimitiveArray::from_iter(vec![0.0f64]).into_array()), + ("b", PrimitiveArray::from_iter(vec![0.0f64]).into_array()), + ])? + .into_array(); + assert!(Geometry::dtype(GeometryKind::Point, None, wrong_fields.dtype().clone()).is_err()); + + assert!( + Geometry::dtype( + GeometryKind::Unspecified, + None, + coordinate_dtype(&["x", "y"]) + ) + .is_err() + ); + Ok(()) + } + + /// A point column round-trips through scalar execution back to its coordinates. + #[test] + fn point_unpacks_coordinates() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let storage = StructArray::from_fields(&[ + ( + "x", + PrimitiveArray::from_iter(vec![1.0f64, -111.7610]).into_array(), + ), + ( + "y", + PrimitiveArray::from_iter(vec![2.0f64, 34.8697]).into_array(), + ), + ])? + .into_array(); + let dtype = Geometry::dtype( + GeometryKind::Point, + Some("EPSG:4326".to_string()), + storage.dtype().clone(), + )?; + let points = ExtensionArray::new(dtype.erased(), storage).into_array(); + + assert_eq!( + coordinate_from_scalar(&points.execute_scalar(0, &mut ctx)?)?, + Coordinate::xy(1.0, 2.0) + ); + assert_eq!( + coordinate_from_scalar(&points.execute_scalar(1, &mut ctx)?)?, + Coordinate::xy(-111.7610, 34.8697) + ); + Ok(()) + } + + /// `GeometryValue` displays as WKT for every kind, including dimension tags and `EMPTY`. + #[test] + fn display_is_wkt() { + let xy = Coordinate::xy; + assert_eq!(GeometryValue::Point(xy(1.0, 2.0)).to_string(), "POINT(1 2)"); + assert_eq!( + GeometryValue::LineString(vec![xy(0.0, 0.0), xy(1.0, 1.0)]).to_string(), + "LINESTRING (0 0, 1 1)" + ); + assert_eq!( + GeometryValue::LineString(vec![]).to_string(), + "LINESTRING EMPTY" + ); + assert_eq!( + GeometryValue::MultiPoint(vec![xy(0.0, 0.0), xy(1.0, 1.0)]).to_string(), + "MULTIPOINT (0 0, 1 1)" + ); + assert_eq!( + GeometryValue::Polygon(vec![vec![xy(0.0, 0.0), xy(1.0, 0.0), xy(0.0, 0.0)]]) + .to_string(), + "POLYGON ((0 0, 1 0, 0 0))" + ); + assert_eq!( + GeometryValue::MultiLineString(vec![vec![xy(0.0, 0.0), xy(1.0, 1.0)]]).to_string(), + "MULTILINESTRING ((0 0, 1 1))" + ); + assert_eq!( + GeometryValue::MultiPolygon(vec![vec![vec![xy(0.0, 0.0), xy(1.0, 0.0), xy(0.0, 0.0)]]]) + .to_string(), + "MULTIPOLYGON (((0 0, 1 0, 0 0)))" + ); + let zm = Coordinate { + x: 1.0, + y: 2.0, + z: Some(3.0), + m: Some(4.0), + }; + assert_eq!( + GeometryValue::LineString(vec![zm, zm]).to_string(), + "LINESTRING ZM (1 2 3 4, 1 2 3 4)" + ); + let z = Coordinate { m: None, ..zm }; + assert_eq!( + GeometryValue::LineString(vec![z]).to_string(), + "LINESTRING Z (1 2 3)" + ); + let m = Coordinate { z: None, ..zm }; + assert_eq!( + GeometryValue::MultiPoint(vec![m]).to_string(), + "MULTIPOINT M (1 2 4)" + ); + } +} diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index f08b76ae83d..b5edacb6b3d 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -1,42 +1,103 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +pub(crate) mod geometry; mod wkb; use std::fmt::Display; +pub use geometry::*; +use vortex_error::VortexResult; +use vortex_error::vortex_err; pub use wkb::*; /// Extension metadata that is common to all the geospatial extension types. /// -/// Currently, this is just the coordinate reference system (CRS). -/// We may wish to add a second field for edges interpretation in the future similar to -/// the GeoArrow standard. +/// This carries the coordinate reference system (CRS) and, for native [`Geometry`] columns, the +/// [`GeometryKind`] held by the column. We may wish to add a field for edges interpretation in +/// the future similar to the GeoArrow standard. #[derive(Clone, PartialEq, Eq, Hash, prost::Message)] pub struct GeoMetadata { + /// The coordinate reference system, or `None` for unreferenced geometry. #[prost(optional, string, tag = "1")] pub crs: Option, + + /// The [`GeometryKind`] held by a native [`Geometry`] column. + #[prost(enumeration = "GeometryKind", tag = "2")] + pub geometry_type: i32, +} + +impl GeoMetadata { + /// The decoded [`GeometryKind`] of this metadata. + pub fn kind(&self) -> VortexResult { + GeometryKind::try_from(self.geometry_type) + .map_err(|_| vortex_err!("unknown geometry kind {}", self.geometry_type)) + } } impl Display for GeoMetadata { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Geometry(")?; + match GeometryKind::try_from(self.geometry_type) { + Ok(GeometryKind::Unspecified) => {} + Ok(kind) => write!(f, "{kind}, ")?, + Err(_) => write!(f, "kind={}, ", self.geometry_type)?, + } match self.crs.as_ref() { - Some(crs) => write!(f, "Geometry(crs={crs})"), - None => write!(f, "Geometry(unreferenced)"), + Some(crs) => write!(f, "crs={crs})"), + None => write!(f, "unreferenced)"), } } } +/// The kind of geometry held in a native [`Geometry`] column, matching the GeoArrow native +/// geometry types. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, prost::Enumeration)] +#[repr(i32)] +pub enum GeometryKind { + /// The wire default when no kind was set (e.g. WKB); rejected by [`Geometry`]. + Unspecified = 0, + /// A single location. + Point = 1, + /// A sequence of locations connected into a line. + LineString = 2, + /// One outer ring with zero or more interior rings (holes). + Polygon = 3, + /// A collection of points. + MultiPoint = 4, + /// A collection of linestrings. + MultiLineString = 5, + /// A collection of polygons. + MultiPolygon = 6, +} + +impl Display for GeometryKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let name = match self { + GeometryKind::Unspecified => "unspecified", + GeometryKind::Point => "point", + GeometryKind::LineString => "linestring", + GeometryKind::Polygon => "polygon", + GeometryKind::MultiPoint => "multipoint", + GeometryKind::MultiLineString => "multilinestring", + GeometryKind::MultiPolygon => "multipolygon", + }; + write!(f, "{name}") + } +} + #[cfg(test)] mod tests { use prost::Message; use crate::extension::GeoMetadata; + use crate::extension::GeometryKind; #[test] fn test_metadata() { let meta = GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }; assert_eq!(meta.to_string(), "Geometry(crs=EPSG:4326)"); @@ -45,4 +106,18 @@ mod tests { let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); assert_eq!(decoded, meta); } + + #[test] + fn test_metadata_with_kind() { + let meta = GeoMetadata { + crs: Some("EPSG:4326".to_string()), + geometry_type: GeometryKind::Point as i32, + }; + + assert_eq!(meta.to_string(), "Geometry(point, crs=EPSG:4326)"); + let bytes = meta.encode_to_vec(); + let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded, meta); + assert_eq!(decoded.kind().unwrap(), GeometryKind::Point); + } } diff --git a/vortex-geo/src/extension/wkb.rs b/vortex-geo/src/extension/wkb.rs index 172da41b93b..e49c8587f86 100644 --- a/vortex-geo/src/extension/wkb.rs +++ b/vortex-geo/src/extension/wkb.rs @@ -313,5 +313,8 @@ fn geo_metadata(wkb_type: &WkbType) -> GeoMetadata { .map(str::to_string) .unwrap_or_else(|| value.to_string()) }); - GeoMetadata { crs } + GeoMetadata { + crs, + ..Default::default() + } } diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index 513caf85d92..1f4e401c573 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -5,17 +5,26 @@ use std::sync::Arc; use vortex_array::arrow::ArrowSessionExt; use vortex_array::dtype::session::DTypeSessionExt; +use vortex_array::scalar_fn::session::ScalarFnSessionExt; use vortex_session::VortexSession; +use crate::extension::Geometry; use crate::extension::WellKnownBinary; +use crate::scalar_fn::distance::GeoDistance; pub mod extension; +pub mod scalar_fn; + /// Set up a session with support for geospatial extension types, encodings and layouts. pub fn initialize(session: &VortexSession) { - // register geospatial extension types + // Register the geospatial extension types. session.dtypes().register(WellKnownBinary); session.arrow().register_exporter(Arc::new(WellKnownBinary)); session.arrow().register_importer(Arc::new(WellKnownBinary)); + session.dtypes().register(Geometry); + + // Register the geometry scalar functions. + session.scalar_fns().register(GeoDistance); } #[cfg(test)] @@ -95,6 +104,7 @@ mod tests { let dtype = ExtDType::::try_new( GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }, DType::Binary(Nullability::NonNullable), )?; @@ -135,6 +145,7 @@ mod tests { let dtype = ExtDType::::try_new( GeoMetadata { crs: Some("EPSG:4326".to_string()), + ..Default::default() }, DType::Binary(Nullability::NonNullable), )?; diff --git a/vortex-geo/src/scalar_fn/distance.rs b/vortex-geo/src/scalar_fn/distance.rs new file mode 100644 index 00000000000..cbec9f48081 --- /dev/null +++ b/vortex-geo/src/scalar_fn/distance.rs @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Straight-line (Euclidean) distance between geometries; "planar" distance in GIS terms. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::Constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::ScalarFnArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::Arity; +use vortex_array::scalar_fn::ChildName; +use vortex_array::scalar_fn::EmptyOptions; +use vortex_array::scalar_fn::ExecutionArgs; +use vortex_array::scalar_fn::ScalarFnId; +use vortex_array::scalar_fn::ScalarFnVTable; +use vortex_array::scalar_fn::TypedScalarFnInstance; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_session::VortexSession; + +use crate::extension::Geometry; +use crate::extension::GeometryKind; +use crate::extension::geometry::coordinate::coordinate_from_scalar; +use crate::extension::geometry::coordinate::parse_storage; + +/// Straight-line (L2) distance between `(ax, ay)` and `(bx, by)`. +fn euclidean_distance(ax: f64, ay: f64, bx: f64, by: f64) -> f64 { + let dx = ax - bx; + let dy = ay - by; + (dx * dx + dy * dy).sqrt() +} + +/// Planar (Euclidean) distance between two geometry columns, like PostGIS `ST_Distance`; +/// `z`/`m` are ignored. +/// +/// Operands are type-checked at construction: point kind (the only kernel so far), +/// non-nullable, and sharing a CRS. Execution dispatches on the operands' [`GeometryKind`]s. +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct GeoDistance; + +impl GeoDistance { + /// A lazy `ScalarFnArray` of per-row distances between `a` and `b`, with `a`'s length. + pub fn try_new_array(a: ArrayRef, b: ArrayRef) -> VortexResult { + let len = a.len(); + ScalarFnArray::try_new( + TypedScalarFnInstance::new(GeoDistance, EmptyOptions).erased(), + vec![a, b], + len, + ) + } +} + +impl ScalarFnVTable for GeoDistance { + type Options = EmptyOptions; + + fn id(&self) -> ScalarFnId { + ScalarFnId::new("vortex.geo.distance") + } + + fn serialize(&self, _: &Self::Options) -> VortexResult>> { + Ok(Some(vec![])) + } + + fn deserialize(&self, _: &[u8], _: &VortexSession) -> VortexResult { + Ok(EmptyOptions) + } + + fn arity(&self, _: &Self::Options) -> Arity { + Arity::Exact(2) + } + + fn child_name(&self, _: &Self::Options, child_idx: usize) -> ChildName { + match child_idx { + 0 => ChildName::from("a"), + 1 => ChildName::from("b"), + _ => unreachable!("distance has exactly two children"), + } + } + + fn return_dtype(&self, _: &Self::Options, arg_dtypes: &[DType]) -> VortexResult { + for dtype in arg_dtypes { + let kind = Geometry::kind_of(dtype)?; + vortex_ensure!( + kind == GeometryKind::Point, + "distance over {kind} geometries is not yet implemented" + ); + vortex_ensure!( + !dtype.is_nullable(), + "distance over nullable geometry is not yet supported, was {dtype}" + ); + } + if let [a, b] = arg_dtypes { + let a_crs = &a.as_extension().metadata::().crs; + let b_crs = &b.as_extension().metadata::().crs; + vortex_ensure!( + a_crs == b_crs, + "distance operands must share a CRS, was {} and {}", + a_crs.as_deref().unwrap_or("unreferenced"), + b_crs.as_deref().unwrap_or("unreferenced") + ); + } + Ok(DType::Primitive(PType::F64, Nullability::NonNullable)) + } + + fn execute( + &self, + _: &Self::Options, + args: &dyn ExecutionArgs, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let a = args.get(0)?; + let b = args.get(1)?; + match (Geometry::kind_of(a.dtype())?, Geometry::kind_of(b.dtype())?) { + (GeometryKind::Point, GeometryKind::Point) => point_to_point_distance(&a, &b, ctx), + (a_kind, b_kind) => { + vortex_bail!("distance({a_kind}, {b_kind}) is not yet implemented") + } + } + } +} + +/// Per-row distance between two point columns; either operand (or both) may be constant. +fn point_to_point_distance( + a: &ArrayRef, + b: &ArrayRef, + ctx: &mut ExecutionCtx, +) -> VortexResult { + match (a.as_opt::(), b.as_opt::()) { + (Some(qa), Some(qb)) => { + let qa = coordinate_from_scalar(qa.scalar())?; + let qb = coordinate_from_scalar(qb.scalar())?; + let distance = euclidean_distance(qa.x, qa.y, qb.x, qb.y); + Ok(ConstantArray::new( + Scalar::primitive(distance, Nullability::NonNullable), + a.len(), + ) + .into_array()) + } + (Some(query), None) => point_to_constant_distance(b, query.scalar(), ctx), + (None, Some(query)) => point_to_constant_distance(a, query.scalar(), ctx), + (None, None) => { + let a_coords = parse_storage(a, ctx)?; + let b_coords = parse_storage(b, ctx)?; + let distances = a_coords + .xs + .as_slice::() + .iter() + .zip(a_coords.ys.as_slice::()) + .zip( + b_coords + .xs + .as_slice::() + .iter() + .zip(b_coords.ys.as_slice::()), + ) + .map(|((&ax, &ay), (&bx, &by))| euclidean_distance(ax, ay, bx, by)); + Ok(PrimitiveArray::from_iter(distances).into_array()) + } + } +} + +/// Per-row distance from `points` to the constant `query` point, decoded once. Distance is +/// symmetric, so this serves a constant on either side. +fn point_to_constant_distance( + points: &ArrayRef, + query: &Scalar, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let query = coordinate_from_scalar(query)?; + let coords = parse_storage(points, ctx)?; + let distances = coords + .xs + .as_slice::() + .iter() + .zip(coords.ys.as_slice::()) + .map(|(&x, &y)| euclidean_distance(x, y, query.x, query.y)); + Ok(PrimitiveArray::from_iter(distances).into_array()) +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::Canonical; + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::StructArray; + use vortex_array::dtype::FieldNames; + use vortex_array::session::ArraySession; + use vortex_array::validity::Validity; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use super::GeoDistance; + use super::euclidean_distance; + use crate::extension::Geometry; + use crate::extension::GeometryKind; + + /// A point `Geometry` column with the given CRS over the given x/y coordinates. + fn point_column_with_crs( + xs: Vec, + ys: Vec, + crs: Option, + ) -> VortexResult { + let storage = StructArray::from_fields(&[ + ("x", PrimitiveArray::from_iter(xs).into_array()), + ("y", PrimitiveArray::from_iter(ys).into_array()), + ])? + .into_array(); + let dtype = Geometry::dtype(GeometryKind::Point, crs, storage.dtype().clone())?; + Ok(ExtensionArray::new(dtype.erased(), storage).into_array()) + } + + /// A point `Geometry` column (CRS `EPSG:4326`) over the given x/y coordinates. + fn point_column(xs: Vec, ys: Vec) -> VortexResult { + point_column_with_crs(xs, ys, Some("EPSG:4326".to_string())) + } + + /// A constant point column of length `len`, every row at `(x, y)`. + fn point_constant( + x: f64, + y: f64, + len: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let single = point_column(vec![x], vec![y])?.execute_scalar(0, ctx)?; + Ok(ConstantArray::new(single, len).into_array()) + } + + /// Execute a `GeoDistance` array and read back its per-row `f64` distances. + fn distances(distance: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult> { + Ok(distance + .execute::(ctx)? + .into_primitive() + .as_slice::() + .to_vec()) + } + + /// The kernel computes straight-line distance (the 3–4–5 triangle). + #[test] + fn euclidean_distance_is_straight_line() { + assert_eq!(euclidean_distance(0.0, 0.0, 3.0, 4.0), 5.0); + assert_eq!(euclidean_distance(1.5, -1.5, 1.5, -1.5), 0.0); + } + + /// Per-row distance between a point column and a constant query point. + #[test] + fn distance_over_points() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_column(vec![0.0, 3.0, 0.0, 3.0], vec![0.0, 0.0, 4.0, 4.0])?; + let b = point_constant(0.0, 0.0, 4, &mut ctx)?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![0.0, 3.0, 4.0, 5.0]); + Ok(()) + } + + /// Column-to-column distance pairs corresponding rows of the two columns. + #[test] + fn distance_between_columns() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_column(vec![0.0, 1.0], vec![0.0, 1.0])?; + let b = point_column(vec![3.0, 1.0], vec![4.0, 1.0])?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![5.0, 0.0]); + Ok(()) + } + + /// The constant query point may be either operand; distance is symmetric. + #[test] + fn distance_with_constant_first_operand() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_constant(0.0, 0.0, 4, &mut ctx)?; + let b = point_column(vec![0.0, 3.0, 0.0, 3.0], vec![0.0, 0.0, 4.0, 4.0])?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![0.0, 3.0, 4.0, 5.0]); + Ok(()) + } + + /// Two constant operands: every row has the same distance. + #[test] + fn distance_between_two_constants() -> VortexResult<()> { + let session = VortexSession::empty().with::(); + let mut ctx = session.create_execution_ctx(); + + let a = point_constant(0.0, 0.0, 3, &mut ctx)?; + let b = point_constant(3.0, 4.0, 3, &mut ctx)?; + let distance = GeoDistance::try_new_array(a, b)?.into_array(); + + assert_eq!(distances(distance, &mut ctx)?, vec![5.0, 5.0, 5.0]); + Ok(()) + } + + /// Distance is signed `(geometry, geometry)`: a non-geometry operand is rejected at + /// construction. + #[test] + fn distance_rejects_non_geometry_operands() -> VortexResult<()> { + let a = point_column(vec![0.0], vec![0.0])?; + let b = PrimitiveArray::from_iter(vec![1.0f64]).into_array(); + assert!(GeoDistance::try_new_array(a, b).is_err()); + Ok(()) + } + + /// Operands in different (or missing vs. set) CRS are rejected at construction: their raw + /// ordinates are not comparable. + #[test] + fn distance_rejects_mismatched_crs() -> VortexResult<()> { + let a = point_column(vec![0.0], vec![0.0])?; + let b = point_column_with_crs(vec![1.0], vec![1.0], Some("EPSG:3857".to_string()))?; + assert!(GeoDistance::try_new_array(a, b).is_err()); + + let a = point_column(vec![0.0], vec![0.0])?; + let unreferenced = point_column_with_crs(vec![1.0], vec![1.0], None)?; + assert!(GeoDistance::try_new_array(a, unreferenced).is_err()); + Ok(()) + } + + /// Nullable point columns are rejected at construction until validity propagation exists. + #[test] + fn distance_rejects_nullable_points() -> VortexResult<()> { + let storage = StructArray::try_new( + FieldNames::from(["x", "y"]), + vec![ + PrimitiveArray::from_iter(vec![1.0f64]).into_array(), + PrimitiveArray::from_iter(vec![2.0f64]).into_array(), + ], + 1, + Validity::AllValid, + )? + .into_array(); + let dtype = Geometry::dtype( + GeometryKind::Point, + Some("EPSG:4326".to_string()), + storage.dtype().clone(), + )?; + let nullable = ExtensionArray::new(dtype.erased(), storage).into_array(); + + let b = point_column(vec![2.0], vec![2.0])?; + assert!(GeoDistance::try_new_array(nullable, b).is_err()); + Ok(()) + } +} diff --git a/vortex-geo/src/scalar_fn/mod.rs b/vortex-geo/src/scalar_fn/mod.rs new file mode 100644 index 00000000000..ef9f220218a --- /dev/null +++ b/vortex-geo/src/scalar_fn/mod.rs @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Geometry scalar functions over the [`Geometry`](crate::extension::Geometry) type. + +pub mod distance;