From 7a7ab28edbe2e28b3d436b3c47da692f5d3a2a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Thu, 7 Mar 2024 16:38:14 -0300 Subject: [PATCH 1/4] Decouple imports from pyiceberg.catalog --- pyiceberg/catalog/glue.py | 4 +-- pyiceberg/catalog/hive.py | 4 +-- pyiceberg/catalog/sql.py | 4 +-- tests/catalog/test_base.py | 30 +++++++++--------- tests/catalog/test_rest.py | 3 +- tests/catalog/test_sql.py | 12 +++++--- tests/integration/test_writes.py | 53 +++++++++++++++++--------------- 7 files changed, 56 insertions(+), 54 deletions(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index bd902cd1ff..adec150d84 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -46,8 +46,6 @@ PREVIOUS_METADATA_LOCATION, TABLE_TYPE, Catalog, - Identifier, - Properties, PropertiesUpdateSummary, ) from pyiceberg.exceptions import ( @@ -67,7 +65,7 @@ from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, update_table_metadata from pyiceberg.table.metadata import TableMetadata, new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder -from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.types import ( BinaryType, BooleanType, diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index c24355f6fb..18bbcfe084 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -59,8 +59,6 @@ METADATA_LOCATION, TABLE_TYPE, Catalog, - Identifier, - Properties, PropertiesUpdateSummary, ) from pyiceberg.exceptions import ( @@ -79,7 +77,7 @@ from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, TableProperties, update_table_metadata from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder -from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.types import ( BinaryType, BooleanType, diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index b6b2feeeb0..d44d4996b6 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -44,8 +44,6 @@ from pyiceberg.catalog import ( METADATA_LOCATION, Catalog, - Identifier, - Properties, PropertiesUpdateSummary, ) from pyiceberg.exceptions import ( @@ -64,7 +62,7 @@ from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, update_table_metadata from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder -from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties if TYPE_CHECKING: import pyarrow as pa diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index 1defb0996f..e432734905 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -31,8 +31,6 @@ from pyiceberg.catalog import ( Catalog, - Identifier, - Properties, PropertiesUpdateSummary, ) from pyiceberg.exceptions import ( @@ -58,7 +56,7 @@ from pyiceberg.table.metadata import TableMetadataV1 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.transforms import IdentityTransform -from pyiceberg.typedef import EMPTY_DICT +from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.types import IntegerType, LongType, NestedField @@ -94,18 +92,20 @@ def create_table( self.__namespaces[namespace] = {} new_location = location or f's3://warehouse/{"/".join(identifier)}/data' - metadata = TableMetadataV1(**{ - "format-version": 1, - "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", - "location": new_location, - "last-updated-ms": 1602638573874, - "last-column-id": schema.highest_field_id, - "schema": schema.model_dump(), - "partition-spec": partition_spec.model_dump()["fields"], - "properties": properties, - "current-snapshot-id": -1, - "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], - }) + metadata = TableMetadataV1( + **{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": new_location, + "last-updated-ms": 1602638573874, + "last-column-id": schema.highest_field_id, + "schema": schema.model_dump(), + "partition-spec": partition_spec.model_dump()["fields"], + "properties": properties, + "current-snapshot-id": -1, + "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], + } + ) table = Table( identifier=identifier, metadata=metadata, diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 51bc286267..65498cdb0e 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -23,7 +23,8 @@ from requests_mock import Mocker import pyiceberg -from pyiceberg.catalog import PropertiesUpdateSummary, Table, load_catalog +from pyiceberg.table import Table +from pyiceberg.catalog import PropertiesUpdateSummary, load_catalog from pyiceberg.catalog.rest import AUTH_URL, RestCatalog from pyiceberg.exceptions import ( AuthorizationExpiredError, diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 0b869d6826..af0d3151c6 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -25,7 +25,6 @@ from pytest_lazyfixture import lazy_fixture from sqlalchemy.exc import ArgumentError, IntegrityError -from pyiceberg.catalog import Identifier from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import ( CommitFailedException, @@ -49,6 +48,7 @@ SortOrder, ) from pyiceberg.transforms import IdentityTransform +from pyiceberg.typedef import Identifier from pyiceberg.types import IntegerType @@ -903,10 +903,12 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: 'foo': ['a', None, 'z'], 'bar': [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with tbl.transaction() as txn: diff --git a/tests/integration/test_writes.py b/tests/integration/test_writes.py index 3b6d476b74..c0c29f7be4 100644 --- a/tests/integration/test_writes.py +++ b/tests/integration/test_writes.py @@ -32,11 +32,12 @@ from pyspark.sql import SparkSession from pytest_mock.plugin import MockerFixture -from pyiceberg.catalog import Catalog, Properties, Table +from pyiceberg.catalog import Catalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.schema import Schema -from pyiceberg.table import _dataframe_to_data_files +from pyiceberg.table import Table, _dataframe_to_data_files +from pyiceberg.typedef import Properties from pyiceberg.types import ( BinaryType, BooleanType, @@ -96,24 +97,26 @@ @pytest.fixture(scope="session") def pa_schema() -> pa.Schema: - return pa.schema([ - ("bool", pa.bool_()), - ("string", pa.string()), - ("string_long", pa.string()), - ("int", pa.int32()), - ("long", pa.int64()), - ("float", pa.float32()), - ("double", pa.float64()), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ("date", pa.date32()), - # Not supported by Spark - # ("time", pa.time64("us")), - # Not natively supported by Arrow - # ("uuid", pa.fixed(16)), - ("binary", pa.large_binary()), - ("fixed", pa.binary(16)), - ]) + return pa.schema( + [ + ("bool", pa.bool_()), + ("string", pa.string()), + ("string_long", pa.string()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ("date", pa.date32()), + # Not supported by Spark + # ("time", pa.time64("us")), + # Not natively supported by Arrow + # ("uuid", pa.fixed(16)), + ("binary", pa.large_binary()), + ("fixed", pa.binary(16)), + ] + ) @pytest.fixture(scope="session") @@ -616,10 +619,12 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None 'foo': ['a', None, 'z'], 'bar': [19, None, 25], }, - schema=pa.schema([ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ]), + schema=pa.schema( + [ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ] + ), ) with tbl.transaction() as txn: From 45119427979bc9da3a79b705d8db85a1594bfd86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Thu, 7 Mar 2024 17:30:55 -0300 Subject: [PATCH 2/4] Decouple imports from pyiceberg.table --- pyiceberg/catalog/noop.py | 3 +- pyiceberg/catalog/rest.py | 2 +- tests/catalog/test_rest.py | 2 +- tests/table/test_init.py | 146 +++++++++++++++++++---------------- tests/table/test_metadata.py | 3 +- tests/utils/test_manifest.py | 3 +- 6 files changed, 84 insertions(+), 75 deletions(-) diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index a8b7154621..e294390e61 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -28,10 +28,9 @@ from pyiceberg.table import ( CommitTableRequest, CommitTableResponse, - SortOrder, Table, ) -from pyiceberg.table.sorting import UNSORTED_SORT_ORDER +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties if TYPE_CHECKING: diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 79fc37a398..c401339e18 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -65,8 +65,8 @@ CommitTableResponse, Table, TableIdentifier, - TableMetadata, ) +from pyiceberg.table.metadata import TableMetadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder, assign_fresh_sort_order_ids from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel from pyiceberg.types import transform_dict_value_to_str diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 65498cdb0e..850e5f0180 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -23,7 +23,6 @@ from requests_mock import Mocker import pyiceberg -from pyiceberg.table import Table from pyiceberg.catalog import PropertiesUpdateSummary, load_catalog from pyiceberg.catalog.rest import AUTH_URL, RestCatalog from pyiceberg.exceptions import ( @@ -37,6 +36,7 @@ from pyiceberg.io import load_file_io from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema +from pyiceberg.table import Table from pyiceberg.table.metadata import TableMetadataV1 from pyiceberg.table.sorting import SortField, SortOrder from pyiceberg.transforms import IdentityTransform, TruncateTransform diff --git a/tests/table/test_init.py b/tests/table/test_init.py index b8097f5fcf..d5f0a65830 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -57,7 +57,6 @@ SetDefaultSortOrderUpdate, SetPropertiesUpdate, SetSnapshotRefUpdate, - SnapshotRef, StaticTable, Table, UpdateSchema, @@ -68,6 +67,7 @@ update_table_metadata, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataUtil, TableMetadataV2, _generate_snapshot_id +from pyiceberg.table.refs import SnapshotRef from pyiceberg.table.snapshots import ( Operation, Snapshot, @@ -930,52 +930,56 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: def test_correct_schema() -> None: - table_metadata = TableMetadataV2(**{ - "format-version": 2, - "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", - "location": "s3://bucket/test/location", - "last-sequence-number": 34, - "last-updated-ms": 1602638573590, - "last-column-id": 3, - "current-schema-id": 1, - "schemas": [ - {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, - { - "type": "struct", - "schema-id": 1, - "identifier-field-ids": [1, 2], - "fields": [ - {"id": 1, "name": "x", "required": True, "type": "long"}, - {"id": 2, "name": "y", "required": True, "type": "long"}, - {"id": 3, "name": "z", "required": True, "type": "long"}, - ], - }, - ], - "default-spec-id": 0, - "partition-specs": [{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}], - "last-partition-id": 1000, - "default-sort-order-id": 0, - "sort-orders": [], - "current-snapshot-id": 123, - "snapshots": [ - { - "snapshot-id": 234, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 10, - }, - { - "snapshot-id": 123, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 0, - }, - ], - }) + table_metadata = TableMetadataV2( + **{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + ], + "default-spec-id": 0, + "partition-specs": [ + {"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]} + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [], + "current-snapshot-id": 123, + "snapshots": [ + { + "snapshot-id": 234, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 10, + }, + { + "snapshot-id": 123, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 0, + }, + ], + } + ) t = Table( identifier=("default", "t1"), @@ -1014,11 +1018,13 @@ def test_correct_schema() -> None: def test_schema_mismatch_type(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.decimal128(18, 6), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.decimal128(18, 6), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = r"""Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1035,11 +1041,13 @@ def test_schema_mismatch_type(table_schema_simple: Schema) -> None: def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1056,10 +1064,12 @@ def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + ) + ) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1076,12 +1086,14 @@ def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("new_field", pa.date32(), nullable=True), - )) + other_schema = pa.schema( + ( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("new_field", pa.date32(), nullable=True), + ) + ) expected = r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)." diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index c05700ecbb..0cf17b11a2 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -29,7 +29,6 @@ from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromByteStream -from pyiceberg.table import SortOrder from pyiceberg.table.metadata import ( TableMetadataUtil, TableMetadataV1, @@ -37,7 +36,7 @@ new_table_metadata, ) from pyiceberg.table.refs import SnapshotRef, SnapshotRefType -from pyiceberg.table.sorting import NullOrder, SortDirection, SortField +from pyiceberg.table.sorting import NullOrder, SortDirection, SortField, SortOrder from pyiceberg.transforms import IdentityTransform from pyiceberg.typedef import UTF8 from pyiceberg.types import ( diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 6ef11a47ea..3e789cb854 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -37,8 +37,7 @@ ) from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.table import Snapshot -from pyiceberg.table.snapshots import Operation, Summary +from pyiceberg.table.snapshots import Operation, Snapshot, Summary from pyiceberg.transforms import IdentityTransform from pyiceberg.typedef import Record from pyiceberg.types import IntegerType, NestedField From 7b5c156f0e66d8b5d3b57ea051cc9dcc840eb9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Thu, 7 Mar 2024 17:49:54 -0300 Subject: [PATCH 3/4] Decouple imports from pyiceberg.expressions --- pyiceberg/expressions/visitors.py | 3 +-- pyiceberg/io/pyarrow.py | 2 +- pyiceberg/table/__init__.py | 13 +++++++------ tests/expressions/test_parser.py | 2 +- tests/io/test_pyarrow.py | 2 +- tests/test_schema.py | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index a185164cd1..79bc995198 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -54,7 +54,6 @@ BoundStartsWith, BoundTerm, BoundUnaryPredicate, - L, Not, Or, UnboundPredicate, @@ -63,7 +62,7 @@ from pyiceberg.manifest import DataFile, ManifestFile, PartitionFieldSummary from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.typedef import EMPTY_DICT, StructProtocol +from pyiceberg.typedef import EMPTY_DICT, L, StructProtocol from pyiceberg.types import ( DoubleType, FloatType, diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index be944ffb36..7192513a2d 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -75,8 +75,8 @@ AlwaysTrue, BooleanExpression, BoundTerm, - Literal, ) +from pyiceberg.expressions.literals import Literal from pyiceberg.expressions.visitors import ( BoundBooleanExpressionVisitor, bind, diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 1a4183c914..674dcb0322 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -46,6 +46,8 @@ from sortedcontainers import SortedList from typing_extensions import Annotated +import pyiceberg.expressions.parser as parser +import pyiceberg.expressions.visitors as visitors from pyiceberg.exceptions import CommitFailedException, ResolveError, ValidationError from pyiceberg.expressions import ( AlwaysTrue, @@ -53,10 +55,7 @@ BooleanExpression, EqualTo, Reference, - parser, - visitors, ) -from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, inclusive_projection from pyiceberg.io import FileIO, load_file_io from pyiceberg.manifest import ( POSITIONAL_DELETE_SCHEMA, @@ -1375,7 +1374,9 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent relevant_entries = positional_delete_entries[positional_delete_entries.bisect_right(data_entry) :] if len(relevant_entries) > 0: - evaluator = _InclusiveMetricsEvaluator(POSITIONAL_DELETE_SCHEMA, EqualTo("file_path", data_entry.data_file.file_path)) + evaluator = visitors._InclusiveMetricsEvaluator( + POSITIONAL_DELETE_SCHEMA, EqualTo("file_path", data_entry.data_file.file_path) + ) return { positional_delete_entry.data_file for positional_delete_entry in relevant_entries @@ -1399,7 +1400,7 @@ def __init__( super().__init__(table, row_filter, selected_fields, case_sensitive, snapshot_id, options, limit) def _build_partition_projection(self, spec_id: int) -> BooleanExpression: - project = inclusive_projection(self.table.schema(), self.table.specs()[spec_id]) + project = visitors.inclusive_projection(self.table.schema(), self.table.specs()[spec_id]) return project(self.row_filter) @cached_property @@ -1466,7 +1467,7 @@ def plan_files(self) -> Iterable[FileScanTask]: # this filter depends on the partition spec used to write the manifest file partition_evaluators: Dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator) - metrics_evaluator = _InclusiveMetricsEvaluator( + metrics_evaluator = visitors._InclusiveMetricsEvaluator( self.table.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true" ).eval diff --git a/tests/expressions/test_parser.py b/tests/expressions/test_parser.py index 3ce2f2226c..0bccc9b80f 100644 --- a/tests/expressions/test_parser.py +++ b/tests/expressions/test_parser.py @@ -17,6 +17,7 @@ import pytest from pyparsing import ParseException +import pyiceberg.expressions.parser as parser from pyiceberg.expressions import ( AlwaysFalse, AlwaysTrue, @@ -37,7 +38,6 @@ NotStartsWith, Or, StartsWith, - parser, ) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index a3dd56db7f..2acffdfdf9 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -53,8 +53,8 @@ GreaterThan, Not, Or, - literal, ) +from pyiceberg.expressions.literals import literal from pyiceberg.io import InputStream, OutputStream, load_file_io from pyiceberg.io.pyarrow import ( ICEBERG_SCHEMA, diff --git a/tests/test_schema.py b/tests/test_schema.py index 6394b72ba6..7e10dd5b0d 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -23,8 +23,8 @@ from pyiceberg import schema from pyiceberg.exceptions import ResolveError, ValidationError -from pyiceberg.expressions import Accessor from pyiceberg.schema import ( + Accessor, Schema, build_position_accessors, promote, From 47b2320f7a112ec13d89ac1e603f8282bd633fcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andre=20Anast=C3=A1cio?= Date: Thu, 7 Mar 2024 17:53:04 -0300 Subject: [PATCH 4/4] Add linter fixes --- tests/catalog/test_base.py | 26 +++--- tests/catalog/test_sql.py | 10 +-- tests/integration/test_writes.py | 48 +++++------ tests/table/test_init.py | 144 ++++++++++++++----------------- 4 files changed, 104 insertions(+), 124 deletions(-) diff --git a/tests/catalog/test_base.py b/tests/catalog/test_base.py index e432734905..1f0060780e 100644 --- a/tests/catalog/test_base.py +++ b/tests/catalog/test_base.py @@ -92,20 +92,18 @@ def create_table( self.__namespaces[namespace] = {} new_location = location or f's3://warehouse/{"/".join(identifier)}/data' - metadata = TableMetadataV1( - **{ - "format-version": 1, - "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", - "location": new_location, - "last-updated-ms": 1602638573874, - "last-column-id": schema.highest_field_id, - "schema": schema.model_dump(), - "partition-spec": partition_spec.model_dump()["fields"], - "properties": properties, - "current-snapshot-id": -1, - "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], - } - ) + metadata = TableMetadataV1(**{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": new_location, + "last-updated-ms": 1602638573874, + "last-column-id": schema.highest_field_id, + "schema": schema.model_dump(), + "partition-spec": partition_spec.model_dump()["fields"], + "properties": properties, + "current-snapshot-id": -1, + "snapshots": [{"snapshot-id": 1925, "timestamp-ms": 1602638573822}], + }) table = Table( identifier=identifier, metadata=metadata, diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index af0d3151c6..3a77f8678a 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -903,12 +903,10 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: 'foo': ['a', None, 'z'], 'bar': [19, None, 25], }, - schema=pa.schema( - [ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ] - ), + schema=pa.schema([ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ]), ) with tbl.transaction() as txn: diff --git a/tests/integration/test_writes.py b/tests/integration/test_writes.py index c0c29f7be4..82c4ace711 100644 --- a/tests/integration/test_writes.py +++ b/tests/integration/test_writes.py @@ -97,26 +97,24 @@ @pytest.fixture(scope="session") def pa_schema() -> pa.Schema: - return pa.schema( - [ - ("bool", pa.bool_()), - ("string", pa.string()), - ("string_long", pa.string()), - ("int", pa.int32()), - ("long", pa.int64()), - ("float", pa.float32()), - ("double", pa.float64()), - ("timestamp", pa.timestamp(unit="us")), - ("timestamptz", pa.timestamp(unit="us", tz="UTC")), - ("date", pa.date32()), - # Not supported by Spark - # ("time", pa.time64("us")), - # Not natively supported by Arrow - # ("uuid", pa.fixed(16)), - ("binary", pa.large_binary()), - ("fixed", pa.binary(16)), - ] - ) + return pa.schema([ + ("bool", pa.bool_()), + ("string", pa.string()), + ("string_long", pa.string()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ("date", pa.date32()), + # Not supported by Spark + # ("time", pa.time64("us")), + # Not natively supported by Arrow + # ("uuid", pa.fixed(16)), + ("binary", pa.large_binary()), + ("fixed", pa.binary(16)), + ]) @pytest.fixture(scope="session") @@ -619,12 +617,10 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None 'foo': ['a', None, 'z'], 'bar': [19, None, 25], }, - schema=pa.schema( - [ - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - ] - ), + schema=pa.schema([ + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ]), ) with tbl.transaction() as txn: diff --git a/tests/table/test_init.py b/tests/table/test_init.py index d5f0a65830..f734211510 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -930,56 +930,52 @@ def test_assert_default_sort_order_id(table_v2: Table) -> None: def test_correct_schema() -> None: - table_metadata = TableMetadataV2( - **{ - "format-version": 2, - "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", - "location": "s3://bucket/test/location", - "last-sequence-number": 34, - "last-updated-ms": 1602638573590, - "last-column-id": 3, - "current-schema-id": 1, - "schemas": [ - {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, - { - "type": "struct", - "schema-id": 1, - "identifier-field-ids": [1, 2], - "fields": [ - {"id": 1, "name": "x", "required": True, "type": "long"}, - {"id": 2, "name": "y", "required": True, "type": "long"}, - {"id": 3, "name": "z", "required": True, "type": "long"}, - ], - }, - ], - "default-spec-id": 0, - "partition-specs": [ - {"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]} - ], - "last-partition-id": 1000, - "default-sort-order-id": 0, - "sort-orders": [], - "current-snapshot-id": 123, - "snapshots": [ - { - "snapshot-id": 234, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 10, - }, - { - "snapshot-id": 123, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": {"operation": "append"}, - "manifest-list": "s3://a/b/1.avro", - "schema-id": 0, - }, - ], - } - ) + table_metadata = TableMetadataV2(**{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + ], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [], + "current-snapshot-id": 123, + "snapshots": [ + { + "snapshot-id": 234, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 10, + }, + { + "snapshot-id": 123, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": {"operation": "append"}, + "manifest-list": "s3://a/b/1.avro", + "schema-id": 0, + }, + ], + }) t = Table( identifier=("default", "t1"), @@ -1018,13 +1014,11 @@ def test_correct_schema() -> None: def test_schema_mismatch_type(table_schema_simple: Schema) -> None: - other_schema = pa.schema( - ( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.decimal128(18, 6), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - ) - ) + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.decimal128(18, 6), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + )) expected = r"""Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1041,13 +1035,11 @@ def test_schema_mismatch_type(table_schema_simple: Schema) -> None: def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: - other_schema = pa.schema( - ( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - ) - ) + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + )) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1064,12 +1056,10 @@ def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema( - ( - pa.field("foo", pa.string(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - ) - ) + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + )) expected = """Mismatch in fields: ┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ @@ -1086,14 +1076,12 @@ def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema( - ( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("new_field", pa.date32(), nullable=True), - ) - ) + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("new_field", pa.date32(), nullable=True), + )) expected = r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)."