Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2dfa498
Bug: Take signed bit into account (#677)
Fokko May 1, 2024
0270bdc
Build: Bump moto from 5.0.5 to 5.0.6 (#679)
dependabot[bot] May 1, 2024
ba296f6
Build: Bump Poetry to 1.8.2 (#676)
Fokko May 1, 2024
391a1b6
Build: Bump pyarrow from 15.0.2 to 16.0.0 (#681)
dependabot[bot] May 1, 2024
f54d614
Build: Bump pydantic from 2.7.0 to 2.7.1 (#680)
dependabot[bot] May 1, 2024
7c51c8f
Build: Bump getdaft from 0.2.21 to 0.2.23 (#689)
dependabot[bot] May 2, 2024
0536cff
Changed class variable names in pyarrow.py (#686)
SebastianoMeneghin May 2, 2024
d17a70d
Build: Bump mkdocstrings-python from 1.8.0 to 1.10.0 (#690)
dependabot[bot] May 2, 2024
da313d0
Build: Bump coverage from 7.4.4 to 7.5.0 (#688)
dependabot[bot] May 2, 2024
7bd5d9e
table_exists unit/integration test for NoSuchTableError (#678)
MehulBatra May 4, 2024
74caa17
Test: Add test to partition on field with a dot (#610)
Fokko May 6, 2024
d872245
Remove trailing slash from table location when creating a table (#702)
felixscherz May 6, 2024
a1f4ba8
Build: Bump mkdocs-section-index from 0.3.8 to 0.3.9 (#696)
dependabot[bot] May 7, 2024
e2f547d
Build: Bump cython from 3.0.8 to 3.0.10 (#697)
dependabot[bot] May 7, 2024
29beaf8
Build: Bump tqdm from 4.66.2 to 4.66.3 (#699)
dependabot[bot] May 7, 2024
70a45f6
Build: Bump werkzeug from 3.0.1 to 3.0.3 (#706)
dependabot[bot] May 7, 2024
0eb0c1c
Build: Bump jinja2 from 3.1.3 to 3.1.4 in /mkdocs (#707)
dependabot[bot] May 7, 2024
990ce80
Make `add_files` to support `snapshot_properties` argument (#695)
enkidulan May 7, 2024
0508667
Add support for categorical type (#693)
sungwy May 7, 2024
1f39b59
Build: Bump tenacity from 8.2.3 to 8.3.0 (#714)
dependabot[bot] May 8, 2024
50a65e5
Build: Bump mkdocstrings from 0.25.0 to 0.25.1 (#715)
dependabot[bot] May 8, 2024
3461305
Build: Bump coverage from 7.5.0 to 7.5.1 (#713)
dependabot[bot] May 8, 2024
399a9be
Build: Bump sqlalchemy from 2.0.29 to 2.0.30 (#712)
dependabot[bot] May 8, 2024
6f72e30
Build: Bump flask-cors from 4.0.0 to 4.0.1 (#718)
dependabot[bot] May 8, 2024
4de207d
Build: Bump mkdocs-material from 9.5.20 to 9.5.21 (#719)
dependabot[bot] May 9, 2024
d02d7a1
Build: Bump getdaft from 0.2.23 to 0.2.24 (#721)
dependabot[bot] May 9, 2024
aa361d1
Test, write subset of schema (#704)
kevinjqliu May 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

install-poetry:
pip install poetry==1.7.1
pip install poetry==1.8.2

install-dependencies:
poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray -E sql-postgres -E gcsfs -E sql-sqlite -E daft
Expand Down
10 changes: 5 additions & 5 deletions mkdocs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@

mkdocs==1.6.0
griffe==0.44.0
jinja2==3.1.3
mkdocstrings==0.25.0
mkdocstrings-python==1.8.0
jinja2==3.1.4
mkdocstrings==0.25.1
mkdocstrings-python==1.10.0
mkdocs-literate-nav==0.6.1
mkdocs-autorefs==1.0.1
mkdocs-gen-files==0.5.0
mkdocs-material==9.5.20
mkdocs-material==9.5.21
mkdocs-material-extensions==1.3.1
mkdocs-section-index==0.3.8
mkdocs-section-index==0.3.9
639 changes: 319 additions & 320 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyiceberg/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,7 @@ def _get_updated_props_and_update_summary(
def _resolve_table_location(self, location: Optional[str], database_name: str, table_name: str) -> str:
if not location:
return self._get_default_warehouse_location(database_name, table_name)
return location
return location.rstrip("/")

def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
database_properties = self.load_namespace_properties(database_name)
Expand Down
2 changes: 2 additions & 0 deletions pyiceberg/catalog/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,8 @@ def _create_table(
fresh_sort_order = assign_fresh_sort_order_ids(sort_order, iceberg_schema, fresh_schema)

namespace_and_table = self._split_identifier_for_path(identifier)
if location:
location = location.rstrip("/")
request = CreateTableRequest(
name=namespace_and_table["table"],
location=location,
Expand Down
12 changes: 11 additions & 1 deletion pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ class PyArrowFile(InputFile, OutputFile):
>>> # output_file.create().write(b'foobytes')
"""

_fs: FileSystem
_filesystem: FileSystem
_path: str
_buffer_size: int

Expand Down Expand Up @@ -731,6 +731,16 @@ def _(obj: pa.MapType, visitor: PyArrowSchemaVisitor[T]) -> T:
return visitor.map(obj, key_result, value_result)


@visit_pyarrow.register(pa.DictionaryType)
def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T:
    """Visit a dictionary-encoded Arrow type by delegating to its value type.

    Parquet has no dictionary type: dictionary-encoding is handled as an
    encoding detail, not as a separate type. We follow this approach in
    determining the Iceberg type, as PyIceberg only supports Parquet for now,
    so the dictionary wrapper is dropped and the underlying value type is
    visited instead. A warning is logged because the round-trip is lossy:
    data read back will carry the value type, not the dictionary encoding.
    """
    logger.warning(f"Iceberg does not have a dictionary type. {type(obj)} will be inferred as {obj.value_type} on read.")
    return visit_pyarrow(obj.value_type, visitor)


@visit_pyarrow.register(pa.DataType)
def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T:
if pa.types.is_nested(obj):
Expand Down
8 changes: 4 additions & 4 deletions pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ def overwrite(
for data_file in data_files:
update_snapshot.append_data_file(data_file)

def add_files(self, file_paths: List[str]) -> None:
def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
"""
Shorthand API for adding files as data files to the table transaction.

Expand All @@ -455,7 +455,7 @@ def add_files(self, file_paths: List[str]) -> None:
"""
if self._table.name_mapping() is None:
self.set_properties(**{TableProperties.DEFAULT_NAME_MAPPING: self._table.schema().name_mapping.model_dump_json()})
with self.update_snapshot().fast_append() as update_snapshot:
with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot:
data_files = _parquet_files_to_data_files(
table_metadata=self._table.metadata, file_paths=file_paths, io=self._table.io
)
Expand Down Expand Up @@ -1341,7 +1341,7 @@ def overwrite(
with self.transaction() as tx:
tx.overwrite(df=df, overwrite_filter=overwrite_filter, snapshot_properties=snapshot_properties)

def add_files(self, file_paths: List[str]) -> None:
def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
"""
Shorthand API for adding files as data files to the table.

Expand All @@ -1352,7 +1352,7 @@ def add_files(self, file_paths: List[str]) -> None:
FileNotFoundError: If the file does not exist.
"""
with self.transaction() as tx:
tx.add_files(file_paths=file_paths)
tx.add_files(file_paths=file_paths, snapshot_properties=snapshot_properties)

def update_spec(self, case_sensitive: bool = True) -> UpdateSpec:
return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive)
Expand Down
4 changes: 2 additions & 2 deletions pyiceberg/utils/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def bytes_required(value: Union[int, Decimal]) -> int:
int: the minimum number of bytes needed to serialize the value.
"""
if isinstance(value, int):
return (value.bit_length() + 7) // 8
return (value.bit_length() + 8) // 8
elif isinstance(value, Decimal):
return (decimal_to_unscaled(value).bit_length() + 7) // 8
return (decimal_to_unscaled(value).bit_length() + 8) // 8

raise ValueError(f"Unsupported value: {value}")

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ fsspec = ">=2023.1.0,<2025.1.0"
pyparsing = ">=3.1.0,<4.0.0"
zstandard = ">=0.13.0,<1.0.0"
tenacity = ">=8.2.3,<9.0.0"
pyarrow = { version = ">=9.0.0,<16.0.0", optional = true }
pyarrow = { version = ">=9.0.0,<17.0.0", optional = true }
pandas = { version = ">=1.0.0,<3.0.0", optional = true }
duckdb = { version = ">=0.5.0,<1.0.0", optional = true }
ray = { version = ">=2.0.0,<2.10.0", optional = true }
Expand All @@ -85,7 +85,7 @@ moto = { version = "^5.0.2", extras = ["server"] }
typing-extensions = "4.11.0"
pytest-mock = "3.14.0"
pyspark = "3.5.1"
cython = "3.0.8"
cython = "3.0.10"
deptry = ">=0.14,<0.17"
docutils = "!=0.21"

Expand Down
6 changes: 6 additions & 0 deletions tests/catalog/integration_test_dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,3 +262,9 @@ def test_update_namespace_properties(test_catalog: Catalog, database_name: str)
else:
assert k in update_report.removed
assert "updated test description" == test_catalog.load_namespace_properties(database_name)["comment"]


def test_table_exists(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_name: str) -> None:
    """table_exists reports True for a table that was just created."""
    identifier = (database_name, table_name)
    test_catalog.create_namespace(database_name)
    test_catalog.create_table(identifier, table_schema_nested)
    assert test_catalog.table_exists(identifier) is True
6 changes: 6 additions & 0 deletions tests/catalog/integration_test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,3 +558,9 @@ def test_create_table_transaction(
]
},
]


def test_table_exists(test_catalog: Catalog, table_schema_nested: Schema, table_name: str, database_name: str) -> None:
    """table_exists reports True for a table that was just created."""
    identifier = (database_name, table_name)
    test_catalog.create_namespace(database_name)
    test_catalog.create_table(identifier, table_schema_nested)
    assert test_catalog.table_exists(identifier) is True
14 changes: 14 additions & 0 deletions tests/catalog/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def create_table(

if not location:
location = f'{self._warehouse_location}/{"/".join(identifier)}'
location = location.rstrip("/")

metadata_location = self._get_metadata_location(location=location)
metadata = new_table_metadata(
Expand Down Expand Up @@ -353,6 +354,19 @@ def test_create_table_location_override(catalog: InMemoryCatalog) -> None:
assert table.location() == new_location


def test_create_table_removes_trailing_slash_from_location(catalog: InMemoryCatalog) -> None:
    """A location passed with a trailing slash is normalized on create."""
    new_location = f"{catalog._warehouse_location}/new_location"
    table = catalog.create_table(
        identifier=TEST_TABLE_IDENTIFIER,
        schema=TEST_TABLE_SCHEMA,
        location=f"{new_location}/",
        partition_spec=TEST_TABLE_PARTITION_SPEC,
        properties=TEST_TABLE_PROPERTIES,
    )
    # The stored location must not carry the trailing slash.
    assert table.location() == new_location
    assert catalog.load_table(TEST_TABLE_IDENTIFIER) == table


@pytest.mark.parametrize(
"schema,expected",
[
Expand Down
29 changes: 29 additions & 0 deletions tests/catalog/test_dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,21 @@ def test_create_table_with_given_location(
assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)


@mock_aws
def test_create_table_removes_trailing_slash_in_location(
    _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
) -> None:
    """DynamoDb catalog strips a trailing slash from an explicit location."""
    catalog_name = "test_ddb_catalog"
    identifier = (database_name, table_name)
    location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}"
    catalog = DynamoDbCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url})
    catalog.create_namespace(namespace=database_name)
    table = catalog.create_table(identifier=identifier, schema=table_schema_nested, location=f"{location}/")
    assert table.identifier == (catalog_name,) + identifier
    # Trailing slash must have been removed from the stored location.
    assert table.location() == location
    assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)


@mock_aws
def test_create_table_with_no_location(
_bucket_initialize: None, table_schema_nested: Schema, database_name: str, table_name: str
Expand Down Expand Up @@ -562,3 +577,17 @@ def test_passing_provided_profile() -> None:
assert test_catalog.dynamodb is mock_client
mock_session.assert_called_with(**session_props)
assert test_catalog.dynamodb is mock_session().client()


@mock_aws
def test_table_exists(
    _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
) -> None:
    """table_exists is True for a created table and False otherwise."""
    identifier = (database_name, table_name)
    catalog = DynamoDbCatalog("test_ddb_catalog", **{"warehouse": f"s3://{BUCKET_NAME}", "s3.endpoint": moto_endpoint_url})
    catalog.create_namespace(namespace=database_name)
    catalog.create_table(identifier, table_schema_nested)
    # Existing table is found.
    assert catalog.table_exists(identifier) is True
    # Unknown identifier is reported as absent, not raised.
    assert catalog.table_exists(('non', 'exist')) is False
31 changes: 31 additions & 0 deletions tests/catalog/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,22 @@ def test_create_table_with_given_location(
assert test_catalog._parse_metadata_version(table.metadata_location) == 0


@mock_aws
def test_create_table_removes_trailing_slash_in_location(
    _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_name: str
) -> None:
    """Glue catalog strips a trailing slash from an explicit location."""
    catalog_name = "glue"
    identifier = (database_name, table_name)
    location = f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}"
    glue_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url})
    glue_catalog.create_namespace(namespace=database_name)
    table = glue_catalog.create_table(identifier=identifier, schema=table_schema_nested, location=f"{location}/")
    assert table.identifier == (catalog_name,) + identifier
    # Trailing slash must have been removed from the stored location.
    assert table.location() == location
    assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
    assert glue_catalog._parse_metadata_version(table.metadata_location) == 0


@mock_aws
def test_create_table_with_pyarrow_schema(
_bucket_initialize: None,
Expand Down Expand Up @@ -817,3 +833,18 @@ def test_create_table_transaction(
assert table.spec().fields_by_source_id(2)[0].name == "bar"
assert table.spec().fields_by_source_id(2)[0].field_id == 1001
assert table.spec().fields_by_source_id(2)[0].transform == IdentityTransform()


@mock_aws
def test_table_exists(
    _bucket_initialize: None, moto_endpoint_url: str, table_schema_simple: Schema, database_name: str, table_name: str
) -> None:
    """table_exists is True for a created table and False otherwise."""
    catalog_name = "glue"
    identifier = (database_name, table_name)
    catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"})
    catalog.create_namespace(namespace=database_name)
    catalog.create_table(identifier=identifier, schema=table_schema_simple)
    # Existing table is found.
    assert catalog.table_exists(identifier) is True
    # Unknown identifier is reported as absent, not raised.
    assert catalog.table_exists(('non', 'exist')) is False
Loading