Skip to content

Commit 28358a8

Browse files
nlathia (Neal Lathia) and a co-author authored
Add support for HDFS storage (#241)
* Start implementing hdfs storage * Finish implementation * Add top-level entrypoint * Add unit tests * Finish unit tests * Add hdfs to change log * Add pydoop==2.0.0 to dev requirements * Fix local test bug with pyspark * Set to 3.8 * Add skips * Move import in tests * Add more skips --------- Co-authored-by: Neal Lathia <neallathia@gmail.com>
1 parent b5d5894 commit 28358a8

File tree

9 files changed

+308
-9
lines changed

9 files changed

+308
-9
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## Not released
44

5+
Added support for any HDFS storage that you can access via [pydoop](https://crs4.github.io/pydoop/tutorial/hdfs_api.html#hdfs-api-tutorial): `ModelStore.from_hdfs()` [#241](https://github.com/operatorai/modelstore/pull/241), thanks [@sayandigital](https://github.com/sayandigital).
6+
57
Updated the transformers manager: it no longer requires a `tokenizer` argument, so it can now be used to (for example) save/load SAM models [#238](https://github.com/operatorai/modelstore/pull/238) or DPT models [#239](https://github.com/operatorai/modelstore/pull/239). Thank you, [Cate in the MLOps Community](https://mlops-community.slack.com/archives/C0227QJCDS8/p1683293544101389)
68

79
Fixed [issues with saving & loading GPT-2 models](https://github.com/operatorai/modelstore/issues/233) in [#234](https://github.com/operatorai/modelstore/pull/234), thank you [@sayandigital](https://github.com/sayandigital).

bin/_brew_install

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,16 @@ brew install gcc
1010
export CC=gcc-11
1111
export CXX=g++-11
1212

13-
# To use xgboost
13+
# To use xgboost models on mac
1414
# https://xgboost.readthedocs.io/en/latest/build.html#building-on-osx
15-
1615
# Note: it looks like there's trouble with libomp 12
1716
# https://github.com/dmlc/xgboost/issues/7039
1817
brew install rajivshah3/libomp-tap/libomp@11.1.0
1918

20-
# To use pyspark
19+
# To use pyspark models on mac
2120
brew install java
2221

22+
# To use hdfs storage on mac
23+
brew install hadoop
24+
2325
echo "\n ✅ Done."

bin/_pyenv_config

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22
# export PYTHON_VERSION=3.7.15
3-
# export PYTHON_VERSION=3.8.12
4-
export PYTHON_VERSION=3.9.16
3+
export PYTHON_VERSION=3.8.12
4+
# export PYTHON_VERSION=3.9.16
55

66
export VIRTUALENV_NAME="$1-${PYTHON_VERSION//./-}"
77
export REPO_ROOT=$(cd $(dirname $0)/.. && pwd)

modelstore/model_store.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from modelstore.storage.aws import BOTO_EXISTS, AWSStorage
2626
from modelstore.storage.azure import AZURE_EXISTS, AzureBlobStorage
2727
from modelstore.storage.gcloud import GCLOUD_EXISTS, GoogleCloudStorage
28+
from modelstore.storage.hdfs import HDFS_EXISTS, HdfsStorage
2829
from modelstore.storage.local import FileSystemStorage
2930
from modelstore.storage.minio import MINIO_EXISTS, MinIOStorage
3031
from modelstore.storage.storage import CloudStorage
@@ -123,6 +124,16 @@ def from_minio(
123124
)
124125
)
125126

127+
@classmethod
def from_hdfs(
    cls, root_prefix: Optional[str] = None, create_directory: bool = False
) -> "ModelStore":
    """Return a ModelStore that persists models to the local HDFS cluster.

    Args:
        root_prefix: optional root path in HDFS under which models are stored.
        create_directory: when True, the storage creates the root directory
            if it does not already exist.

    Raises:
        ModuleNotFoundError: if pydoop is not installed.
    """
    if not HDFS_EXISTS:
        raise ModuleNotFoundError("pydoop is not installed!")
    hdfs_storage = HdfsStorage(root_prefix, create_directory)
    return ModelStore(storage=hdfs_storage)
136+
126137
@classmethod
127138
def from_file_system(
128139
cls, root_directory: Optional[str] = None, create_directory: bool = False

modelstore/models/pyspark.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,7 @@ def save_model(tmp_dir: str, model: "pyspark.ml.Model") -> List[str]:
105105
logger.debug("Saving pyspark model")
106106
target = os.path.join(tmp_dir, "pyspark")
107107
model.save(target)
108-
return [os.path.join(target, "metadata"), os.path.join(target, "stages")]
108+
return [
109+
os.path.join(target, "metadata"),
110+
os.path.join(target, "stages"),
111+
]

modelstore/storage/hdfs.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# Copyright 2023 Neal Lathia
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import json
15+
import os
16+
from typing import Optional
17+
18+
from modelstore.metadata import metadata
19+
from modelstore.storage.blob_storage import BlobStorage
20+
from modelstore.storage.util.versions import sorted_by_created
21+
from modelstore.utils.log import logger
22+
from modelstore.utils.exceptions import FilePullFailedException
23+
24+
try:
25+
import pydoop.hdfs as hdfs
26+
27+
HDFS_EXISTS = True
28+
except ImportError:
29+
HDFS_EXISTS = False
30+
31+
32+
class HdfsStorage(BlobStorage):

    """
    HDFS Storage

    Assumes that you have `pydoop` installed
    https://crs4.github.io/pydoop/tutorial/hdfs_api.html#hdfs-api-tutorial
    """

    # Storage backend name, used to identify this storage type in meta data
    NAME = "hdfs"
    # Environment variables this storage can be constructed from; the root
    # prefix is optional (it may also be passed to __init__ directly)
    BUILD_FROM_ENVIRONMENT = {
        "required": [],
        "optional": [
            "MODEL_STORE_HDFS_ROOT_PREFIX",
        ],
    }

    def __init__(self, root_prefix: Optional[str] = None, create_directory: bool = False):
        """Create an HDFS-backed storage.

        root_prefix: HDFS path under which everything is stored; if None,
            BlobStorage falls back to the MODEL_STORE_HDFS_ROOT_PREFIX
            environment variable.
        create_directory: when True, validate() creates the root prefix
            if it does not exist (instead of raising).
        """
        super().__init__(["pydoop"], root_prefix, "MODEL_STORE_HDFS_ROOT_PREFIX")
        self._create_directory = create_directory

    def validate(self) -> bool:
        """Checks that the root prefix exists in HDFS; optionally creates it.

        Raises FileNotFoundError when the root prefix is missing and
        create_directory was False.
        """
        try:
            hdfs.ls(self.root_prefix)
        except FileNotFoundError:
            if not self._create_directory:
                raise
            logger.debug("creating root directory %s", self.root_prefix)
            hdfs.mkdir(self.root_prefix)
        return True

    def _push(self, file_path: str, prefix: str) -> str:
        """Uploads a local file to the given HDFS prefix and returns the prefix."""
        logger.info("Uploading to: %s...", prefix)
        # This will raise an exception if the file already exists
        hdfs.put(file_path, prefix)
        return prefix

    def _pull(self, prefix: str, dir_path: str) -> str:
        """Downloads a file from HDFS into dir_path and returns its local path.

        Raises FilePullFailedException (chained) if the download fails.
        """
        try:
            logger.debug("Downloading from: %s...", prefix)
            # Keep the remote file's base name for the local copy
            file_name = os.path.split(prefix)[1]
            destination = os.path.join(dir_path, file_name)
            hdfs.get(prefix, destination)
            return destination
        except Exception as exc:
            logger.exception(exc)
            raise FilePullFailedException(exc) from exc

    def _remove(self, prefix: str) -> bool:
        """Removes a file from the destination path"""
        # Returns True only when something was actually deleted
        if hdfs.path.exists(prefix):
            logger.debug("Deleting: %s...", prefix)
            hdfs.rm(prefix)
            return True
        return False

    def _storage_location(self, prefix: str) -> metadata.Storage:
        """Returns a dict of the location the artifact was stored"""
        return metadata.Storage.from_path(
            storage_type="hdfs",
            root=self.root_prefix,
            path=prefix,
        )

    def _get_storage_location(self, meta_data: metadata.Storage) -> str:
        """Extracts the storage location from a meta data dictionary"""
        return meta_data.path

    def _read_json_objects(self, prefix: str) -> list:
        """Reads all JSON files directly under prefix (not in sub-prefixes),
        returning the parsed objects sorted by their creation time."""
        logger.debug("Listing files in: %s", prefix)
        results = []
        for obj in hdfs.ls(prefix):
            logger.debug("reading: %s", obj)
            if not hdfs.path.basename(obj).endswith(".json"):
                logger.debug("Skipping non-json file: %s", obj)
                continue
            # hdfs.ls may return fully-qualified paths (e.g. with a scheme/
            # host part); slice from where the prefix starts so the parent
            # comparison below works — assumes prefix occurs in obj
            parent = obj[obj.index(prefix):]
            if os.path.split(parent)[0] != prefix:
                # We don't want to read files in a sub-prefix
                logger.debug("Skipping file in sub-prefix: %s", obj)
                continue
            json_obj = self._read_json_object(obj)
            if json_obj is not None:
                results.append(json_obj)
        return sorted_by_created(results)

    def _read_json_object(self, prefix: str) -> dict:
        """Loads and parses one JSON file from HDFS; returns None when the
        file is empty or does not contain valid JSON."""
        logger.debug("Reading: %s", prefix)
        lines = hdfs.load(prefix)
        if len(lines) == 0:
            return None
        try:
            return json.loads(lines)
        except json.JSONDecodeError as exc:
            logger.exception(exc)
            return None

requirements-dev1.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ azure-storage-blob>=12.11.0
77
boto3>=1.21.41
88
google-cloud-storage>=2.3.0
99
minio>=7.1.12
10-
10+
pydoop<=2.0.0; sys_platform == 'darwin'
1111
pystan>=2.19.1.1 # required to be installed before prophet
1212

1313
# Machine Learning

tests/models/test_pyspark.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import os
15-
15+
import platform
1616
import pytest
1717
from pyspark import SparkContext
1818
from pyspark.sql import SQLContext
@@ -97,7 +97,13 @@ def test_save_model(spark_model, tmp_path):
9797
os.path.join(tmp_path, "pyspark", "stages"),
9898
]
9999
assert exp == res
100-
assert all(os.path.exists(x) for x in exp)
100+
exists_fn = os.path.exists
101+
if platform.system() == 'Darwin':
102+
# Running hadoop locally, so need to check
103+
# for the files in hdfs
104+
import pydoop.hdfs as hdfs
105+
exists_fn = hdfs.path.exists
106+
assert all(exists_fn(x) for x in exp)
101107

102108

103109
def test_load_model(tmp_path, spark_manager, spark_model, spark_df):

tests/storage/test_hdfs.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# Copyright 2020 Neal Lathia
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
15+
import platform
16+
import pytest
17+
18+
from modelstore.metadata import metadata
19+
from modelstore.storage.hdfs import HdfsStorage
20+
21+
# pylint: disable=unused-import
22+
from tests.storage.test_utils import (
23+
remote_file_path,
24+
remote_path,
25+
push_temp_file,
26+
push_temp_files,
27+
)
28+
29+
# pylint: disable=redefined-outer-name
30+
# pylint: disable=protected-access
31+
# pylint: disable=missing-function-docstring
32+
33+
34+
def is_not_mac() -> bool:
    """Return True when the tests are not running on macOS.

    Hadoop is only installed for local (mac) development in this project,
    so HDFS tests are skipped on every other platform (e.g. in CI).
    """
    current_os = platform.system()
    return current_os != 'Darwin'
36+
37+
38+
@pytest.fixture
def storage(tmp_path):
    # An HdfsStorage rooted at a fresh per-test temporary path; the root
    # directory is created on validate() because create_directory=True
    root = str(tmp_path)
    return HdfsStorage(root_prefix=root, create_directory=True)
41+
42+
43+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
def test_create_from_environment_variables(monkeypatch):
    # Does not fail when environment variables exist
    # (populate any required variables with dummy values first)
    for env_var in HdfsStorage.BUILD_FROM_ENVIRONMENT.get("required", []):
        monkeypatch.setenv(env_var, "a-value")
    try:
        _ = HdfsStorage()
    except KeyError:
        pytest.fail("Failed to initialise storage from env variables")
52+
53+
54+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
def test_validate(storage):
    # The fixture uses create_directory=True, so validation succeeds
    is_valid = storage.validate()
    assert is_valid
57+
58+
59+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
def test_push_and_pull(storage, tmp_path):
    # A file pushed to HDFS can be pulled back to the local file system
    # pylint: disable=import-outside-toplevel
    import pydoop.hdfs as hdfs
    prefix = push_temp_file(storage)
    uploaded = hdfs.ls(prefix)
    assert len(uploaded) == 1
    result = storage._pull(prefix, str(tmp_path))
    assert os.path.exists(result)
    # Clean up the file that was pushed to hdfs
    hdfs.rm(uploaded[0])
72+
73+
74+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
@pytest.mark.parametrize(
    "file_exists,should_call_delete",
    [
        (
            False,
            False,
        ),
        (
            True,
            True,
        ),
    ],
)
def test_remove(storage, file_exists, should_call_delete):
    # _remove() must return True only when the file existed and was deleted
    if file_exists:
        # Push a file to storage
        _ = push_temp_file(storage)
    prefix = remote_file_path()
    assert storage._remove(prefix) == should_call_delete
    # NOTE(review): os.path.exists checks the *local* file system, but the
    # file above was pushed to hdfs — presumably this should be
    # hdfs.path.exists(prefix); confirm whether this assertion can ever fail
    assert not os.path.exists(prefix)
95+
96+
97+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
def test_read_json_objects_ignores_non_json(storage):
    # Only .json files directly under the prefix should be read back
    # pylint: disable=import-outside-toplevel
    import pydoop.hdfs as hdfs
    # Start from an empty prefix, then create files with different suffixes
    target_prefix = remote_path()
    for existing in hdfs.ls(target_prefix):
        hdfs.rm(existing)
    push_temp_files(storage, target_prefix)

    # Read the json files at the prefix
    json_objects = storage._read_json_objects(target_prefix)
    assert len(json_objects) == 1
    # Clean up everything that was pushed
    for leftover in hdfs.ls(target_prefix):
        hdfs.rm(leftover)
110+
111+
112+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
def test_storage_location(storage):
    # Asserts that the location meta data is correctly formatted
    target_prefix = remote_path()
    expected_location = metadata.Storage.from_path(
        storage_type="hdfs",
        root=storage.root_prefix,
        path=target_prefix,
    )
    assert storage._storage_location(target_prefix) == expected_location
122+
123+
124+
@pytest.mark.skipif(is_not_mac(), reason="no hadoop in ci")
@pytest.mark.parametrize(
    "meta_data,should_raise,result",
    [
        (
            metadata.Storage(
                type="hdfs",
                root="",
                path="/path/to/file",
                bucket=None,
                container=None,
                prefix=None,
            ),
            False,
            "/path/to/file",
        ),
    ],
)
def test_get_location(storage, meta_data, should_raise, result):
    # Asserts that pulling the location out of meta data is correct
    if not should_raise:
        assert storage._get_storage_location(meta_data) == result
    else:
        with pytest.raises(ValueError):
            storage._get_storage_location(meta_data)

0 commit comments

Comments
 (0)