Complete Arrow table support with documentation and exports

Co-authored-by: mpetazzoni <879507+mpetazzoni@users.noreply.github.com>
wherobots · Copilot · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025
commit a1f6a86d01988453a1ac36686b4a0b0c0b5fc137
@@ -59,6 +59,42 @@ It also implements the `close()` method, as suggested by the PEP-0249
 specification, to support situations where the cursor is wrapped in a
 `contextmanager.closing()`.
 
+### Arrow table output
+
+For improved performance and native GeoArrow support, you can configure
+the connection to return PyArrow tables instead of pandas DataFrames:
+
+```python
+from wherobots.db import connect
+from wherobots.db.constants import OutputFormat, ResultsFormat, GeometryRepresentation
+from wherobots.db.region import Region
+from wherobots.db.runtime import Runtime
+
+with connect(
+        api_key='...',
+        runtime=Runtime.TINY,
+        region=Region.AWS_US_WEST_2,
+        results_format=ResultsFormat.ARROW,
+        output_format=OutputFormat.ARROW,
+        geometry_representation=GeometryRepresentation.WKB) as conn:
+    curr = conn.cursor()
+    curr.execute("SELECT * FROM buildings LIMIT 1000")
+    results = curr.fetchall()
+
+    # results is now a pyarrow.Table instead of pandas.DataFrame
+    print(f"Result type: {type(results)}")
+    print(f"Schema: {results.schema}")
+    print(f"Row count: {len(results)}")
+
+    # Convert to pandas only when needed:
+    # df = results.to_pandas()
+```
+
+This is particularly beneficial when working with:
+* Large datasets (reduced memory usage and faster operations)
+* GeoArrow geometries (native spatial data structures)
+* Arrow-native downstream processing pipelines
+
 ### Runtime and region selection
 
 You can choose the Wherobots runtime you want to use using the `runtime`
@@ -87,6 +123,10 @@ users may find useful:
 * `results_format`: one of the `ResultsFormat` enum values;
     Arrow encoding is the default and most efficient format for
     receiving query results.
+* `output_format`: one of the `OutputFormat` enum values; controls
+    whether query results are returned as PyArrow tables (`ARROW`) or 
+    pandas DataFrames (`PANDAS`, default). Use `ARROW` for better
+    performance with large datasets and native GeoArrow support.
 * `data_compression`: one of the `DataCompression` enum values; Brotli
     compression is the default and the most efficient compression
     algorithm for receiving query results.

@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+"""
+Example script demonstrating Arrow table output functionality.
+
+This script shows how to use the new output_format parameter to return 
+PyArrow tables instead of pandas DataFrames, which is particularly useful
+for working with GeoArrow geometries.
+"""
+
+from wherobots.db import connect
+from wherobots.db.constants import ResultsFormat, OutputFormat, GeometryRepresentation
+from wherobots.db.runtime import Runtime
+from wherobots.db.region import Region
+
+def example_arrow_usage():
+    """
+    Example of how to use the new Arrow output functionality.
+
+    Note: This is a code example only - it would need valid credentials
+    to actually run against a Wherobots DB instance.
+    """
+
+    # Example 1: Return Arrow tables instead of pandas DataFrames
+    with connect(
+        host="api.cloud.wherobots.com",
+        api_key="your_api_key",
+        runtime=Runtime.TINY,
+        results_format=ResultsFormat.ARROW,  # Efficient wire format
+        output_format=OutputFormat.ARROW,    # Return Arrow tables
+        geometry_representation=GeometryRepresentation.WKB,
+        region=Region.AWS_US_WEST_2
+    ) as conn:
+        cursor = conn.cursor()
+        cursor.execute("SELECT * FROM buildings LIMIT 1000")
+        results = cursor.fetchall()
+
+        # results is now a pyarrow.Table instead of pandas.DataFrame
+        print(f"Result type: {type(results)}")
+        print(f"Schema: {results.schema}")
+        print(f"Row count: {len(results)}")
+
+        # Work with Arrow table directly (great for GeoArrow!)
+        # Convert to pandas only when needed:
+        # df = results.to_pandas()
+
+    # Example 2: Default behavior (backwards compatible)
+    with connect(
+        host="api.cloud.wherobots.com",
+        api_key="your_api_key",
+        runtime=Runtime.TINY,
+        results_format=ResultsFormat.ARROW,
+        # output_format defaults to OutputFormat.PANDAS
+        geometry_representation=GeometryRepresentation.WKB,
+        region=Region.AWS_US_WEST_2
+    ) as conn:
+        cursor = conn.cursor()
+        cursor.execute("SELECT * FROM buildings LIMIT 1000")
+        results = cursor.fetchall()
+
+        # results is a pandas.DataFrame (existing behavior)
+        print(f"Result type: {type(results)}")
+
+if __name__ == "__main__":
+    print("Arrow Table Output Example")
+    print("=" * 30)
+    print("This example shows how to use the new output_format parameter.")
+    print("Uncomment and provide valid credentials to run against Wherobots DB.")
+    print()
+    print("Key benefits of Arrow output:")
+    print("- More efficient for large datasets")
+    print("- Native support for GeoArrow geometries")
+    print("- Better interoperability with Arrow ecosystem")
+    print("- Zero-copy operations when possible")
diff --git a/test_arrow_output.py b/test_arrow_output.py
@@ -10,6 +10,12 @@
     ProgrammingError,
     NotSupportedError,
 )
+from .constants import (
+    OutputFormat,
+    ResultsFormat,
+    DataCompression,
+    GeometryRepresentation,
+)
 from .region import Region
 from .runtime import Runtime
 
@@ -25,6 +31,10 @@
     "OperationalError",
     "ProgrammingError",
     "NotSupportedError",
+    "OutputFormat",
+    "ResultsFormat",
+    "DataCompression",
+    "GeometryRepresentation",
     "Region",
     "Runtime",
 ]
@@ -74,7 +74,7 @@ def __get_results(self) -> Optional[List[Tuple[Any, ...]]]:
                 self.__description = [
                     (
                         col_name,  # name
-                        _ARROW_TYPE_MAP.get(str(result.schema.field(col_name).type), "STRING"),  # type_code
+                        _ARROW_TYPE_MAP.get(str(result.schema.field(col_name).type).split('<')[0], "STRING"),  # type_code
                         None,  # display_size
                         result.column(col_name).nbytes,  # internal_size
                         None,  # precision