Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Complete Arrow table support with documentation and exports
Co-authored-by: mpetazzoni <879507+mpetazzoni@users.noreply.github.com>
  • Loading branch information
Copilot and mpetazzoni committed Aug 26, 2025
commit a1f6a86d01988453a1ac36686b4a0b0c0b5fc137
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,42 @@ It also implements the `close()` method, as suggested by the PEP-0249
specification, to support situations where the cursor is wrapped in a
`contextlib.closing()`.

### Arrow table output

For improved performance and native GeoArrow support, you can configure
the connection to return PyArrow tables instead of pandas DataFrames:

```python
from wherobots.db import connect
from wherobots.db.constants import OutputFormat, ResultsFormat, GeometryRepresentation
from wherobots.db.region import Region
from wherobots.db.runtime import Runtime

# Request Arrow as the results format and as the returned object type.
with connect(
    api_key='...',
    region=Region.AWS_US_WEST_2,
    runtime=Runtime.TINY,
    geometry_representation=GeometryRepresentation.WKB,
    results_format=ResultsFormat.ARROW,
    output_format=OutputFormat.ARROW,
) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM buildings LIMIT 1000")
    results = cursor.fetchall()

    # With OutputFormat.ARROW, fetchall() returns a pyarrow.Table
    # rather than a pandas.DataFrame.
    print(f"Result type: {type(results)}")
    print(f"Schema: {results.schema}")
    print(f"Row count: {len(results)}")

    # Defer the pandas conversion until a DataFrame is actually required:
    # df = results.to_pandas()
```

This is particularly beneficial when working with:
* Large datasets (reduced memory usage and faster operations)
* GeoArrow geometries (native spatial data structures)
* Arrow-native downstream processing pipelines

### Runtime and region selection

You can choose the Wherobots runtime you want to use using the `runtime`
Expand Down Expand Up @@ -87,6 +123,10 @@ users may find useful:
* `results_format`: one of the `ResultsFormat` enum values;
Arrow encoding is the default and most efficient format for
receiving query results.
* `output_format`: one of the `OutputFormat` enum values; controls
whether query results are returned as PyArrow tables (`ARROW`) or
pandas DataFrames (`PANDAS`, default). Use `ARROW` for better
performance with large datasets and native GeoArrow support.
* `data_compression`: one of the `DataCompression` enum values; Brotli
compression is the default and the most efficient compression
algorithm for receiving query results.
Expand Down
73 changes: 73 additions & 0 deletions example_arrow_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Example script demonstrating Arrow table output functionality.

This script shows how to use the new output_format parameter to return
PyArrow tables instead of pandas DataFrames, which is particularly useful
for working with GeoArrow geometries.
"""

from wherobots.db import connect
from wherobots.db.constants import ResultsFormat, OutputFormat, GeometryRepresentation
from wherobots.db.runtime import Runtime
from wherobots.db.region import Region

def example_arrow_usage():
    """
    Run the same query twice to contrast the two output formats.

    The first connection sets ``output_format=OutputFormat.ARROW`` so that
    ``fetchall()`` returns a ``pyarrow.Table``; the second omits the
    parameter and gets the default ``pandas.DataFrame``.

    Note: the API key below is a placeholder. Valid credentials are
    required to actually run this against a Wherobots DB instance.
    """
    # Connection settings shared by both examples; only output_format differs.
    shared_options = {
        "host": "api.cloud.wherobots.com",
        "api_key": "your_api_key",
        "runtime": Runtime.TINY,
        "results_format": ResultsFormat.ARROW,  # Efficient wire format
        "geometry_representation": GeometryRepresentation.WKB,
        "region": Region.AWS_US_WEST_2,
    }
    query = "SELECT * FROM buildings LIMIT 1000"

    # Example 1: ask for Arrow tables instead of pandas DataFrames.
    with connect(output_format=OutputFormat.ARROW, **shared_options) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        results = cursor.fetchall()

        # results is a pyarrow.Table here, not a pandas.DataFrame.
        print(f"Result type: {type(results)}")
        print(f"Schema: {results.schema}")
        print(f"Row count: {len(results)}")

        # The Arrow table can be used as-is (handy for GeoArrow);
        # convert to pandas only when a DataFrame is really needed:
        # df = results.to_pandas()

    # Example 2: default behavior (backwards compatible).
    # output_format is left unset, so it defaults to OutputFormat.PANDAS.
    with connect(**shared_options) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        results = cursor.fetchall()

        # results is a pandas.DataFrame (existing behavior).
        print(f"Result type: {type(results)}")

if __name__ == "__main__":
    # Running the script only prints this overview. Nothing here calls
    # example_arrow_usage(); add a call to it once real credentials have
    # been filled in.
    overview = (
        "Arrow Table Output Example",
        "=" * 30,
        "This example shows how to use the new output_format parameter.",
        "Uncomment and provide valid credentials to run against Wherobots DB.",
        "",
        "Key benefits of Arrow output:",
        "- More efficient for large datasets",
        "- Native support for GeoArrow geometries",
        "- Better interoperability with Arrow ecosystem",
        "- Zero-copy operations when possible",
    )
    for line in overview:
        print(line)
163 changes: 0 additions & 163 deletions test_arrow_output.py

This file was deleted.

10 changes: 10 additions & 0 deletions wherobots/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
ProgrammingError,
NotSupportedError,
)
from .constants import (
OutputFormat,
ResultsFormat,
DataCompression,
GeometryRepresentation,
)
from .region import Region
from .runtime import Runtime

Expand All @@ -25,6 +31,10 @@
"OperationalError",
"ProgrammingError",
"NotSupportedError",
"OutputFormat",
"ResultsFormat",
"DataCompression",
"GeometryRepresentation",
"Region",
"Runtime",
]
2 changes: 1 addition & 1 deletion wherobots/db/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __get_results(self) -> Optional[List[Tuple[Any, ...]]]:
self.__description = [
(
col_name, # name
_ARROW_TYPE_MAP.get(str(result.schema.field(col_name).type), "STRING"), # type_code
_ARROW_TYPE_MAP.get(str(result.schema.field(col_name).type).split('<')[0], "STRING"), # type_code
None, # display_size
result.column(col_name).nbytes, # internal_size
None, # precision
Expand Down