Merged
Changes from 1 commit
25 commits
adfbd3c  Skeletal implementation (Dec 20, 2024)
ea2b456  First attempt at hashing locations (Dec 20, 2024)
ce5f0d5  Relocate to table submodule; code and comment improvements (Dec 20, 2024)
d3e0c0f  Add unit tests (Dec 20, 2024)
00917e9  Remove entropy check (Dec 20, 2024)
c4e6be9  Merge branch 'main' into location-providers (smaheshwar-pltr, Dec 20, 2024)
bc2eab8  Nit: Prefer `self.table_properties` (Dec 20, 2024)
9999cbb  Remove special character testing (Dec 21, 2024)
23ef8f5  Add integration tests for writes (Dec 23, 2024)
e47e18f  Move all `LocationProviders`-related code into locations.py (Jan 9, 2025)
45391de  Nit: tiny for loop refactor (Jan 9, 2025)
065bcbf  Fix typo (Jan 9, 2025)
e5214d4  Object storage as default location provider (Jan 9, 2025)
568af55  Update tests/integration/test_writes/test_partitioned_writes.py (smaheshwar-pltr, Jan 9, 2025)
e77af29  Test entropy in test_object_storage_injects_entropy (Jan 9, 2025)
651aaea  Refactor integration tests to use properties and omit when default once (Jan 9, 2025)
5bfa24b  Use a different table property for custom location provision (Jan 9, 2025)
8cd46fa  write.location-provider.py-impl -> write.py-location-provider.impl (Jan 9, 2025)
3dbb8d0  Merge branch 'main' into location-providers (Jan 10, 2025)
e992c24  Make lint (Jan 10, 2025)
f1e4a31  Move location provider loading into `write_file` for back-compat (Jan 10, 2025)
46dd7ab  Make object storage no longer the default (Jan 10, 2025)
490d08c  Merge branch 'main' into location-providers (Jan 10, 2025)
3555932  Add test case for partitioned paths disabled but with no partition sp… (Jan 10, 2025)
55d6c4f  Moved constants within ObjectStoreLocationProvider (Jan 10, 2025)
Relocate to table submodule; code and comment improvements
Sreesh Maheshwar committed Dec 20, 2024
commit ce5f0d54d22a0267488235e4430eb89854892052
60 changes: 0 additions & 60 deletions pyiceberg/io/__init__.py
@@ -44,10 +44,7 @@
)
from urllib.parse import urlparse

from pyiceberg.partitioning import PartitionKey
from pyiceberg.table import TableProperties
from pyiceberg.typedef import EMPTY_DICT, Properties
from pyiceberg.utils.properties import property_as_bool

logger = logging.getLogger(__name__)

@@ -296,29 +293,6 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
"""


class LocationProvider(ABC):
"""A base class for location providers, that provide data file locations to write tasks."""

table_location: str
table_properties: Properties

def __init__(self, table_location: str, table_properties: Properties):
self.table_location = table_location
self.table_properties = table_properties

@abstractmethod
def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
"""Return a fully-qualified data file location for the given filename.

Args:
data_file_name (str): The name of the data file.
partition_key (Optional[PartitionKey]): The data file's partition key. If None, the data file is not partitioned.

Returns:
str: A fully-qualified location URI for the data file.
"""


LOCATION = "location"
WAREHOUSE = "warehouse"

@@ -370,40 +344,6 @@ def _infer_file_io_from_scheme(path: str, properties: Properties) -> Optional[FileIO]:
return None


def _import_location_provider(location_provider_impl: str, table_location: str, table_properties: Properties) -> Optional[LocationProvider]:
try:
path_parts = location_provider_impl.split(".")
if len(path_parts) < 2:
raise ValueError(f"{TableProperties.WRITE_LOCATION_PROVIDER_IMPL} should be full path (module.CustomLocationProvider), got: {location_provider_impl}")
module_name, class_name = ".".join(path_parts[:-1]), path_parts[-1]
module = importlib.import_module(module_name)
class_ = getattr(module, class_name)
return class_(table_location, table_properties)
except ModuleNotFoundError:
logger.warning("Could not initialize LocationProvider: %s", location_provider_impl)
return None


def load_location_provider(table_location: str, table_properties: Properties) -> LocationProvider:
table_location = table_location.rstrip("/")

if location_provider_impl := table_properties.get(TableProperties.WRITE_LOCATION_PROVIDER_IMPL):
if location_provider := _import_location_provider(location_provider_impl, table_location, table_properties):
logger.info("Loaded LocationProvider: %s", location_provider_impl)
return location_provider
else:
raise ValueError(f"Could not initialize LocationProvider: {location_provider_impl}")

if property_as_bool(table_properties, TableProperties.OBJECT_STORE_ENABLED, TableProperties.OBJECT_STORE_ENABLED_DEFAULT):
from pyiceberg.io.locations import ObjectStoreLocationProvider

return ObjectStoreLocationProvider(table_location, table_properties)
else:
from pyiceberg.io.locations import DefaultLocationProvider

return DefaultLocationProvider(table_location, table_properties)


def load_file_io(properties: Properties = EMPTY_DICT, location: Optional[str] = None) -> FileIO:
# First look for the py-io-impl property to directly load the class
if io_impl := properties.get(PY_IO_IMPL):
17 changes: 10 additions & 7 deletions pyiceberg/io/pyarrow.py
@@ -114,7 +114,7 @@
InputStream,
OutputFile,
OutputStream,
_parse_location, LocationProvider, load_location_provider,
_parse_location,
)
from pyiceberg.manifest import (
DataFile,
@@ -136,6 +136,10 @@
visit,
visit_with_partner,
)
from pyiceberg.table import (
LocationProvider,
load_location_provider,
)
from pyiceberg.table.metadata import TableMetadata
from pyiceberg.table.name_mapping import NameMapping, apply_name_mapping
from pyiceberg.transforms import TruncateTransform
@@ -2415,7 +2419,7 @@ def data_file_statistics_from_parquet_metadata(
)


def write_file(io: FileIO, location_provider: LocationProvider, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
def write_file(
io: FileIO, location_provider: LocationProvider, table_metadata: TableMetadata, tasks: Iterator[WriteTask]
) -> Iterator[DataFile]:
from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, TableProperties
Contributor:
we might want location_provider: LocationProvider last for backwards compatibility

Contributor Author (@smaheshwar-pltr, Jan 9, 2025):
WDYT about leaving the signature as before and doing load_location_provider at the start of this function (above parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)) instead of in _dataframe_to_data_files?

Contributor:
that would mean we need to run load_location_provider per data file, which can potentially get expensive

Contributor Author:
I don't think so? At the start of the function means not in write_parquet - the location_provider loaded would just be used within that, similar to parquet_writer_kwargs.

Contributor:
ah makes sense, write_parquet is called once per _dataframe_to_data_files

we can do that to preserve backwards compatibility

Contributor Author:
Sounds good! (typo correction: write_file above 😄)
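
Concretely, a rough sketch of the approach agreed above: the signature stays as before and the provider is loaded once at the top of write_file (the body below is abridged and approximate, not the exact final diff):

```python
def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
    # Loaded once per write_file call, i.e. once per _dataframe_to_data_files,
    # not once per data file, so keeping the old signature costs nothing extra.
    location_provider = load_location_provider(
        table_location=table_metadata.location, table_properties=table_metadata.properties
    )
    parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
    ...  # write_parquet closes over location_provider, as in the diff above
```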


parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
@@ -2447,7 +2453,7 @@ def write_parquet(task: WriteTask) -> DataFile:
]
arrow_table = pa.Table.from_batches(batches)
file_path = location_provider.new_data_location(
data_file_name=task.generate_data_file_filename('parquet'),
data_file_name=task.generate_data_file_filename("parquet"),
partition_key=task.partition_key,
)
fo = io.new_output(file_path)
@@ -2625,10 +2631,7 @@ def _dataframe_to_data_files(
property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
)
location_provider = load_location_provider(
table_location=table_metadata.location,
table_properties=table_metadata.properties
)
location_provider = load_location_provider(table_location=table_metadata.location, table_properties=table_metadata.properties)
Contributor Author:
Don't love this. I wanted to do something like this and cache it on at least the Transaction (the only caller of this method), but the problem, I think, is that properties can change on the Transaction, potentially changing the location provider to be used. I suppose we could update that provider on a property change (or maybe on any metadata change), but I'm unsure whether this complexity is even worth it.

Contributor:
That's an interesting edge case. It seems like an anti-pattern to change a table property and write in the same transaction, although it's currently allowed.

Contributor Author (@smaheshwar-pltr, Jan 10, 2025):
3555932 (fyi the Java tests don't have one)

name_mapping = table_metadata.schema().name_mapping
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
task_schema = pyarrow_to_schema(df.schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
66 changes: 65 additions & 1 deletion pyiceberg/table/__init__.py
@@ -16,7 +16,9 @@
# under the License.
from __future__ import annotations

import importlib
import itertools
import logging
import uuid
import warnings
from abc import ABC, abstractmethod
@@ -138,7 +140,6 @@
from pyiceberg.utils.concurrent import ExecutorFactory
from pyiceberg.utils.config import Config
from pyiceberg.utils.deprecated import deprecated
from pyiceberg.utils.deprecated import deprecation_message as deprecation_message
from pyiceberg.utils.properties import property_as_bool

if TYPE_CHECKING:
@@ -150,6 +151,8 @@

from pyiceberg.catalog import Catalog

logger = logging.getLogger(__name__)

ALWAYS_TRUE = AlwaysTrue()
DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"

@@ -1633,6 +1636,67 @@ class AddFileTask:
partition_field_value: Record


class LocationProvider(ABC):
"""A base class for location providers, that provide data file locations for write tasks."""

table_location: str
table_properties: Properties

def __init__(self, table_location: str, table_properties: Properties):
self.table_location = table_location
self.table_properties = table_properties

@abstractmethod
def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
"""Return a fully-qualified data file location for the given filename.

Args:
data_file_name (str): The name of the data file.
partition_key (Optional[PartitionKey]): The data file's partition key. If None, the data is not partitioned.

Returns:
str: A fully-qualified location URI for the data file.
"""


def _import_location_provider(
location_provider_impl: str, table_location: str, table_properties: Properties
) -> Optional[LocationProvider]:
try:
path_parts = location_provider_impl.split(".")
if len(path_parts) < 2:
raise ValueError(
f"{TableProperties.WRITE_LOCATION_PROVIDER_IMPL} should be full path (module.CustomLocationProvider), got: {location_provider_impl}"
)
module_name, class_name = ".".join(path_parts[:-1]), path_parts[-1]
module = importlib.import_module(module_name)
class_ = getattr(module, class_name)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, wonder if we should reduce duplication between this and file IO loading.

return class_(table_location, table_properties)
except ModuleNotFoundError:
logger.warning("Could not initialize LocationProvider: %s", location_provider_impl)
return None


def load_location_provider(table_location: str, table_properties: Properties) -> LocationProvider:
table_location = table_location.rstrip("/")

if location_provider_impl := table_properties.get(TableProperties.WRITE_LOCATION_PROVIDER_IMPL):
if location_provider := _import_location_provider(location_provider_impl, table_location, table_properties):
logger.info("Loaded LocationProvider: %s", location_provider_impl)
return location_provider
else:
raise ValueError(f"Could not initialize LocationProvider: {location_provider_impl}")

if property_as_bool(table_properties, TableProperties.OBJECT_STORE_ENABLED, TableProperties.OBJECT_STORE_ENABLED_DEFAULT):
from pyiceberg.table.locations import ObjectStoreLocationProvider

return ObjectStoreLocationProvider(table_location, table_properties)
else:
from pyiceberg.table.locations import DefaultLocationProvider

return DefaultLocationProvider(table_location, table_properties)
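
To make the loading path above concrete, a minimal sketch of a custom provider (the module, class, and prefix scheme are hypothetical; only the LocationProvider interface and the loading behavior come from this diff):

```python
# Hypothetical module my_module.py: the random-prefix scheme and all names
# here are invented for illustration.
import uuid
from typing import Optional

from pyiceberg.partitioning import PartitionKey
from pyiceberg.table import LocationProvider, TableProperties, load_location_provider


class MyCustomLocationProvider(LocationProvider):
    def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
        # Spread files across random prefixes under the table's data directory.
        return f"{self.table_location}/data/{uuid.uuid4().hex[:8]}/{data_file_name}"


# load_location_provider imports and instantiates the class from the table property:
provider = load_location_provider(
    table_location="s3://bucket/db/table",
    table_properties={TableProperties.WRITE_LOCATION_PROVIDER_IMPL: "my_module.MyCustomLocationProvider"},
)
print(provider.new_data_location("datafile.parquet"))
```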


def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List[str], io: FileIO) -> Iterable[DataFile]:
"""Convert a list files into DataFiles.

30 changes: 19 additions & 11 deletions pyiceberg/io/locations.py → pyiceberg/table/locations.py
@@ -14,17 +14,18 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Optional

from pyiceberg.io import LocationProvider
import mmh3

from pyiceberg.partitioning import PartitionKey
from pyiceberg.table import TableProperties
from pyiceberg.table import LocationProvider, TableProperties
from pyiceberg.typedef import Properties
from pyiceberg.utils.properties import property_as_bool


class DefaultLocationProvider(LocationProvider):
Contributor Author (@smaheshwar-pltr, Dec 20, 2024):
The biggest difference vs the Java implementations is that I've not supported write.data.path here. I think it's natural for write.metadata.path to be supported alongside it, so this would be a larger and arguably location-provider-independent change. Can look into it as a follow-up.

Contributor:
thanks! would be great to have write.data.path and write.metadata.path

Contributor:
opened an issue on supporting write.data.path and write.metadata.path: #1492

Contributor:
Sorry guys, didn't notice this thread until now.


def __init__(self, table_location: str, table_properties: Properties):
super().__init__(table_location, table_properties)

@@ -39,12 +40,15 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:


class ObjectStoreLocationProvider(LocationProvider):

_include_partition_paths: bool

def __init__(self, table_location: str, table_properties: Properties):
super().__init__(table_location, table_properties)
self._include_partition_paths = property_as_bool(table_properties, TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS, TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT)
self._include_partition_paths = property_as_bool(
table_properties,
TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS,
TableProperties.WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT,
)

def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
Contributor Author (@smaheshwar-pltr, Dec 20, 2024):
Tried to make this as consistent as possible with its Java counterpart so file locations are consistent too. This means hashing on both the partition key and the data file name below, and using the same hash function.

Seemed reasonable to port over the object storage stuff in this PR, given that the original issue #861 mentions this.

Contributor:
Since Iceberg is mainly focused on object stores, I'm leaning towards making the ObjectStorageLocationProvider the default. Java is a great source of inspiration, but it also holds a lot of historical decisions that are not easy to change, so we should reconsider this in PyIceberg.

Contributor Author:
Thanks for this great suggestion and context! I agree:

  • I made this the default. The MANIFEST_MERGE_ENABLED_DEFAULT property already differs from Java and the docs, which reassures me. I did still add a short comment beside OBJECT_STORE_ENABLED_DEFAULT to indicate that it differs.
  • I renamed DefaultLocationProvider to SimpleLocationProvider because it's no longer the default.

Contributor Author:
^ cc @kevinjqliu, how does this sound to you? I realise the concerns you raised about things silently working differently between Java and PyIceberg seem to contradict the above a little (but I think it's fine).

Contributor Author (@smaheshwar-pltr, Jan 9, 2025):
Also, I've not yet changed WRITE_OBJECT_STORE_PARTITIONED_PATHS_DEFAULT to False (Java/docs have true) even though that's more aligned with object storage - from the docs:

> We have also added a new table property write.object-storage.partitioned-paths that if set to false (default=true), this will omit the partition values from the file path. Iceberg does not need these values in the file path and setting this value to false can further reduce the key size.

I'm very open to being swayed / discussing this. After reading through apache/iceberg#11112 it seems there was a strong case for still supporting partition values in paths, though I haven't been able to flesh it out fully. Perhaps it's backwards compatibility, for folks who inspect storage to see how their files are actually laid out; it does group them together nicely.

I'd be happy to change the default if there's reason for it. The readability of file paths will arguably decrease with these hashes anyway, so the above might be a non-issue.

Contributor:
While I'm in favor of making ObjectStorageLocationProvider the default for PyIceberg, I'd prefer to do so in a follow-up PR.
I like having this PR solely to implement the concept of LocationProvider and the ObjectStorageProvider.

Contributor Author:
> While I'm in favor of making ObjectStorageLocationProvider the default for PyIceberg, I'd prefer to do so in a follow-up PR.
> I like having this PR solely to implement the concept of LocationProvider and the ObjectStorageProvider.

Makes sense! We can have the discussion regarding defaults there. I'd like to keep the SimpleLocationProvider naming change from Default here though and discuss which provider should be the default in the next PR.

Contributor:
SGTM! 🚀
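
For reference, a hedged sketch of opting into this provider per table (the catalog configuration, schema, and table name below are assumed; the two property keys are the ones this diff reads via property_as_bool, assuming they keep their Java string values):

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField

catalog = load_catalog("default")  # assumed catalog configuration

# Hypothetical table; only the two write.object-storage.* property keys are from this PR.
table = catalog.create_table(
    "db.events",
    schema=Schema(NestedField(1, "id", LongType(), required=True)),
    properties={
        "write.object-storage.enabled": "true",             # select ObjectStoreLocationProvider
        "write.object-storage.partitioned-paths": "false",  # omit partition values from file paths
    },
)
```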

if self._include_partition_paths and partition_key:
@@ -53,22 +57,26 @@ def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
prefix = f"{self.table_location}/data"
hashed_path = self._compute_hash(data_file_name)

return f"{prefix}/{hashed_path}/{data_file_name}" if self._include_partition_paths else f"{prefix}/{hashed_path}-{data_file_name}"
return (
f"{prefix}/{hashed_path}/{data_file_name}"
if self._include_partition_paths
else f"{prefix}/{hashed_path}-{data_file_name}"
Contributor Author (@smaheshwar-pltr, Dec 20, 2024):

Interesting that disabling include_partition_paths affects paths of non-partitioned data files. I've matched Java behaviour here but it does feel odd.

Contributor:
This is an interesting case; do we have a test that shows this behavior explicitly? I think it'll be valuable to refer to it at a later time.

)

@staticmethod
def _compute_hash(data_file_name: str) -> str:
import mmh3

# Bitwise AND to combat sign-extension; bitwise OR to preserve leading zeroes that `bin` would otherwise strip.
hash_code = mmh3.hash(data_file_name) & ((1 << HASH_BINARY_STRING_BITS) - 1) | (1 << HASH_BINARY_STRING_BITS)
return ObjectStoreLocationProvider._dirs_from_hash(bin(hash_code)[-HASH_BINARY_STRING_BITS:])

@staticmethod
def _dirs_from_hash(file_hash: str) -> str:
"""Divides hash into directories for optimized orphan removal operation using ENTROPY_DIR_DEPTH and ENTROPY_DIR_LENGTH."""
hash_with_dirs = []
for i in range(0, ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH, ENTROPY_DIR_LENGTH):
hash_with_dirs.append(file_hash[i:i + ENTROPY_DIR_LENGTH])
hash_with_dirs.append(file_hash[i : i + ENTROPY_DIR_LENGTH])

if len(file_hash) > ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH:
hash_with_dirs.append(file_hash[ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH:])
hash_with_dirs.append(file_hash[ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH :])

return '/'.join(hash_with_dirs)
return "/".join(hash_with_dirs)