From ca049a1ae68cfdbfcf023f760d4119db75e05a31 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 16 Dec 2021 14:47:35 -0500 Subject: [PATCH 01/56] Function for finding NWB's & Zarrs --- dandi/consts.py | 9 +++++ dandi/files.py | 70 ++++++++++++++++++++++++++++++++++++ dandi/tests/test_files.py | 75 +++++++++++++++++++++++++++++++++++++++ dandi/validate.py | 20 ++++++++++- setup.cfg | 1 + 5 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 dandi/files.py create mode 100644 dandi/tests/test_files.py diff --git a/dandi/consts.py b/dandi/consts.py index c0237733e..de6ded475 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -142,3 +142,12 @@ class EmbargoStatus(Enum): #: HTTP response status codes that should always be retried (until we run out #: of retries) RETRY_STATUSES = (500, 502, 503, 504) + +#: File extensions recognized as assets by default +ASSET_FILE_EXTENSIONS = [".nwb"] + +#: Recognized extensions for Zarr directories +ZARR_DIR_EXTENSIONS = [".ngff", ".zarr"] + +#: Maximum allowed depth of a Zarr directory tree +MAX_ZARR_DEPTH = 5 diff --git a/dandi/files.py b/dandi/files.py new file mode 100644 index 000000000..1fc3094c4 --- /dev/null +++ b/dandi/files.py @@ -0,0 +1,70 @@ +from collections import deque +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator, Optional, Union + +from . import get_logger +from .consts import ASSET_FILE_EXTENSIONS, ZARR_DIR_EXTENSIONS, dandiset_metadata_file + +lgr = get_logger() + + +@dataclass +class DandiFile: + #: Path to node on disk + filepath: Path + + +class DandisetMetadataFile(DandiFile): + pass + + +@dataclass +class LocalAsset(DandiFile): + #: Forward-slash-separated path relative to root of Dandiset + path: str + + +class LocalFileAsset(LocalAsset): + pass + + +class LocalZarrAsset(LocalAsset): + pass + + +def find_dandi_files( + dirpath: Union[str, Path], + *, + dandiset_path: Optional[Union[str, Path]] = None, + allow_all: bool = False, + include_metadata: bool = False, +) -> Iterator[DandiFile]: + if dandiset_path is None: + dandiset_path = dirpath + else: + try: + Path(dandiset_path).relative_to(dirpath) + except ValueError: + raise ValueError("dirpath must be within dandiset_path") + dirs = deque([Path(dirpath)]) + while dirs: + for p in dirs.popleft().iterdir(): + if p.name.startswith("."): + continue + path = p.relative_to(dandiset_path).as_posix() + if p.is_dir(): + if p.is_symlink(): + lgr.warning( + "%s: Ignoring unsupported symbolic link to directory", p + ) + continue + if p.suffix in ZARR_DIR_EXTENSIONS: + yield LocalZarrAsset(filepath=p, path=path) + else: + dirs.append(p) + elif p == dandiset_path / dandiset_metadata_file: + if allow_all or include_metadata: + yield DandisetMetadataFile(filepath=p) + elif allow_all or p.suffix in ASSET_FILE_EXTENSIONS: + yield LocalFileAsset(filepath=p, path=path) diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py new file mode 100644 index 000000000..c00bf95ae --- /dev/null +++ b/dandi/tests/test_files.py @@ -0,0 +1,75 @@ +from operator import attrgetter +from pathlib import Path + +from ..consts import dandiset_metadata_file +from ..files import ( + DandisetMetadataFile, + LocalFileAsset, + LocalZarrAsset, + find_dandi_files, +) + + +def test_find_dandi_files(tmp_path: Path) -> None: + (tmp_path / dandiset_metadata_file).touch() + (tmp_path / "sample01.zarr").mkdir() + (tmp_path / "sample01.zarr" / "inner.nwb").touch() + (tmp_path / "sample01.zarr" / "foo").touch() + (tmp_path / "sample02.nwb").touch() + (tmp_path / "foo").touch() + (tmp_path / "bar.txt").touch() + (tmp_path / "subdir").mkdir() + (tmp_path / "subdir" / "sample03.nwb").touch() + (tmp_path / "subdir" / "sample04.zarr").mkdir() + (tmp_path / "subdir" / "sample04.zarr" / "inner2.nwb").touch() + (tmp_path / "subdir" / "sample04.zarr" / "baz").touch() + (tmp_path / "subdir" / "gnusto").touch() + (tmp_path / "subdir" / "cleesh.txt").touch() + + files = sorted(find_dandi_files(tmp_path), key=attrgetter("filepath")) + assert files == [ + LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + LocalFileAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + LocalZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] + + files = sorted( + find_dandi_files(tmp_path, allow_all=True), key=attrgetter("filepath") + ) + assert files == [ + LocalFileAsset(filepath=tmp_path / "bar.txt", path="bar.txt"), + DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), + LocalFileAsset(filepath=tmp_path / "foo", path="foo"), + LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + LocalFileAsset( + filepath=tmp_path / "subdir" / "cleesh.txt", path="subdir/cleesh.txt" + ), + LocalFileAsset(filepath=tmp_path / "subdir" / "gnusto", path="subdir/gnusto"), + LocalFileAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + LocalZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] + + files = sorted( + find_dandi_files(tmp_path, include_metadata=True), key=attrgetter("filepath") + ) + assert files == [ + DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), + LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + LocalFileAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + LocalZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] diff --git a/dandi/validate.py b/dandi/validate.py index 16d057442..992e2b81d 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -1,7 +1,11 @@ +import os import os.path as op +from pathlib import Path + +import zarr from . import get_logger -from .consts import dandiset_metadata_file +from .consts import MAX_ZARR_DEPTH, dandiset_metadata_file from .metadata import get_metadata from .pynwb_utils import validate as pynwb_validate from .pynwb_utils import validate_cache @@ -155,3 +159,17 @@ def _check_required_fields(d, required): if v in ("REQUIRED", "PLACEHOLDER"): errors += [f"Required field {f!r} has value {v!r}"] return errors + + +def zarr_validate(path: Path) -> None: + data = zarr.open(path) + if isinstance(data, zarr.Group) and not data: + raise ValueError("Empty Zarr groups not permitted") + try: + next(path.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH))) + except StopIteration: + pass + else: + raise ValueError( + f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep" + ) diff --git a/setup.cfg b/setup.cfg index f503562cd..20d16f184 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,6 +56,7 @@ install_requires = semantic-version tenacity tqdm + zarr ~= 2.10 zip_safe = False packages = find: include_package_data = True From 6d1bd2fe177c00b62842706f451c664bf9bf33e1 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 17 Dec 2021 13:11:19 -0500 Subject: [PATCH 02/56] Add local asset subclasses for different file extensions --- dandi/consts.py | 6 ---- dandi/exceptions.py | 4 +++ dandi/files.py | 65 ++++++++++++++++++++++++++++++++------- dandi/tests/test_files.py | 37 +++++++++++----------- 4 files changed, 77 insertions(+), 35 deletions(-) diff --git a/dandi/consts.py b/dandi/consts.py index de6ded475..984dcf880 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -143,11 +143,5 @@ class EmbargoStatus(Enum): #: of retries) RETRY_STATUSES = (500, 502, 503, 504) -#: File extensions recognized as assets by default -ASSET_FILE_EXTENSIONS = [".nwb"] - -#: Recognized extensions for Zarr directories -ZARR_DIR_EXTENSIONS = [".ngff", ".zarr"] - #: Maximum allowed depth of a Zarr directory tree MAX_ZARR_DEPTH = 5 diff --git a/dandi/exceptions.py b/dandi/exceptions.py index 42b23ceeb..0afbb3b73 100644 --- a/dandi/exceptions.py +++ b/dandi/exceptions.py @@ -64,3 +64,7 @@ def __str__(self): class SchemaVersionError(Exception): pass + + +class UnknownSuffixError(ValueError): + pass diff --git a/dandi/files.py b/dandi/files.py index 1fc3094c4..5a22b3412 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -4,7 +4,8 @@ from typing import Iterator, Optional, Union from . import get_logger -from .consts import ASSET_FILE_EXTENSIONS, ZARR_DIR_EXTENSIONS, dandiset_metadata_file +from .consts import dandiset_metadata_file +from .exceptions import UnknownSuffixError lgr = get_logger() @@ -29,10 +30,22 @@ class LocalFileAsset(LocalAsset): pass -class LocalZarrAsset(LocalAsset): +class NWBAsset(LocalFileAsset): + EXTENSIONS = [".nwb"] + + +class GenericAsset(LocalFileAsset): + EXTENSIONS = [] + + +class LocalDirectoryAsset(LocalAsset): pass +class ZarrAsset(LocalDirectoryAsset): + EXTENSIONS = [".ngff", ".zarr"] + + def find_dandi_files( dirpath: Union[str, Path], *, @@ -52,19 +65,49 @@ def find_dandi_files( for p in dirs.popleft().iterdir(): if p.name.startswith("."): continue - path = p.relative_to(dandiset_path).as_posix() if p.is_dir(): if p.is_symlink(): lgr.warning( "%s: Ignoring unsupported symbolic link to directory", p ) continue - if p.suffix in ZARR_DIR_EXTENSIONS: - yield LocalZarrAsset(filepath=p, path=path) - else: + try: + df = dandi_file(p, dandiset_path) + except UnknownSuffixError: dirs.append(p) - elif p == dandiset_path / dandiset_metadata_file: - if allow_all or include_metadata: - yield DandisetMetadataFile(filepath=p) - elif allow_all or p.suffix in ASSET_FILE_EXTENSIONS: - yield LocalFileAsset(filepath=p, path=path) + else: + yield df + else: + df = dandi_file(p, dandiset_path) + if isinstance(df, GenericAsset) and not allow_all: + pass + elif isinstance(df, DandisetMetadataFile) and not ( + allow_all or include_metadata + ): + pass + else: + yield df + + +def dandi_file( + filepath: Union[str, Path], dandiset_path: Optional[Union[str, Path]] = None +) -> DandiFile: + filepath = Path(filepath) + if dandiset_path is not None: + path = filepath.relative_to(dandiset_path).as_posix() + else: + path = filepath.name + if filepath.is_dir(): + for dirclass in LocalDirectoryAsset.__subclasses__(): + if filepath.suffix in dirclass.EXTENSIONS: + return dirclass(filepath=filepath, path=path) + raise UnknownSuffixError( + f"Directory has unrecognized suffix {filepath.suffix!r}" + ) + elif path == dandiset_metadata_file: + return DandisetMetadataFile(filepath=filepath) + else: + for fileclass in LocalFileAsset.__subclasses__(): + if filepath.suffix in fileclass.EXTENSIONS: + return fileclass(filepath=filepath, path=path) + return GenericAsset(filepath=filepath, path=path) diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index c00bf95ae..76c99c241 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -4,8 +4,9 @@ from ..consts import dandiset_metadata_file from ..files import ( DandisetMetadataFile, - LocalFileAsset, - LocalZarrAsset, + GenericAsset, + NWBAsset, + ZarrAsset, find_dandi_files, ) @@ -28,12 +29,12 @@ def test_find_dandi_files(tmp_path: Path) -> None: files = sorted(find_dandi_files(tmp_path), key=attrgetter("filepath")) assert files == [ - LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), - LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), - LocalFileAsset( + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + NWBAsset( filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" ), - LocalZarrAsset( + ZarrAsset( filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" ), ] @@ -42,19 +43,19 @@ def test_find_dandi_files(tmp_path: Path) -> None: find_dandi_files(tmp_path, allow_all=True), key=attrgetter("filepath") ) assert files == [ - LocalFileAsset(filepath=tmp_path / "bar.txt", path="bar.txt"), + GenericAsset(filepath=tmp_path / "bar.txt", path="bar.txt"), DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), - LocalFileAsset(filepath=tmp_path / "foo", path="foo"), - LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), - LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), - LocalFileAsset( + GenericAsset(filepath=tmp_path / "foo", path="foo"), + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + GenericAsset( filepath=tmp_path / "subdir" / "cleesh.txt", path="subdir/cleesh.txt" ), - LocalFileAsset(filepath=tmp_path / "subdir" / "gnusto", path="subdir/gnusto"), - LocalFileAsset( + GenericAsset(filepath=tmp_path / "subdir" / "gnusto", path="subdir/gnusto"), + NWBAsset( filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" ), - LocalZarrAsset( + ZarrAsset( filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" ), ] @@ -64,12 +65,12 @@ def test_find_dandi_files(tmp_path: Path) -> None: ) assert files == [ DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), - LocalZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), - LocalFileAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), - LocalFileAsset( + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + NWBAsset( filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" ), - LocalZarrAsset( + ZarrAsset( filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" ), ] From fd7bfb64ce499b3b42c39ed9461f018aca9f376c Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 3 Jan 2022 09:14:56 -0500 Subject: [PATCH 03/56] Add methods for metadata & validation --- dandi/files.py | 251 +++++++++++++++++++++++++++++++++++++- dandi/tests/test_files.py | 37 ++++++ dandi/validate.py | 20 +-- 3 files changed, 285 insertions(+), 23 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 5a22b3412..f4000325c 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,23 +1,106 @@ +from abc import ABC, abstractmethod from collections import deque from dataclasses import dataclass +import os from pathlib import Path -from typing import Iterator, Optional, Union +from typing import Iterator, List, Optional, Union + +from dandischema.models import BareAsset, CommonModel +from dandischema.models import Dandiset as DandisetMeta +from dandischema.models import get_schema_version +from pydantic import ValidationError +import zarr from . import get_logger -from .consts import dandiset_metadata_file +from .consts import MAX_ZARR_DEPTH, dandiset_metadata_file from .exceptions import UnknownSuffixError +from .metadata import get_default_metadata, get_metadata, nwb2asset +from .pynwb_utils import validate as pynwb_validate +from .utils import yaml_load +from .validate import _check_required_fields lgr = get_logger() +# TODO -- should come from schema. This is just a simplistic example for now +_required_dandiset_metadata_fields = ["identifier", "name", "description"] +_required_nwb_metadata_fields = ["subject_id"] + @dataclass -class DandiFile: +class DandiFile(ABC): #: Path to node on disk filepath: Path + @abstractmethod + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> CommonModel: + ... + + @abstractmethod + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + ... + class DandisetMetadataFile(DandiFile): - pass + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> DandisetMeta: + with open(self.filepath) as f: + meta = yaml_load(f, typ="safe") + return DandisetMeta.unvalidated(**meta) + + # TODO: @validate_cache.memoize_path + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + with open(self.filepath) as f: + meta = yaml_load(f, typ="safe") + if schema_version is None: + schema_version = meta.get("schemaVersion") + if schema_version is None: + return _check_required_fields(meta, _required_dandiset_metadata_fields) + else: + current_version = get_schema_version() + if schema_version != current_version: + raise ValueError( + f"Unsupported schema version: {schema_version}; expected {current_version}" + ) + try: + DandisetMeta(**meta) + except ValidationError as e: + if devel_debug: + raise + lgr.warning( + "Validation error for %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + return [str(e)] + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Unexpected validation error for %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + return [f"Failed to initialize Dandiset meta: {e}"] + return [] @dataclass @@ -25,6 +108,71 @@ class LocalAsset(DandiFile): #: Forward-slash-separated path relative to root of Dandiset path: str + @abstractmethod + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> BareAsset: + ... + + # TODO: @validate_cache.memoize_path + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + if schema_version is not None: + current_version = get_schema_version() + if schema_version != current_version: + raise ValueError( + f"Unsupported schema version: {schema_version}; expected {current_version}" + ) + try: + asset = self.get_metadata( + digest=32 * "d" + "-1", + digest_type="dandi_etag", + ) + BareAsset(**asset.dict()) + except ValidationError as e: + if devel_debug: + raise + lgr.warning( + "Validation error for %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + return [str(e)] + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Unexpected validation error for %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + return [f"Failed to read metadata: {e}"] + return [] + else: + # TODO: Only do this for NWB files + # make sure that we have some basic metadata fields we require + try: + meta = get_metadata(self.filepath) + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Failed to read metadata in %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + return [f"Failed to read metadata: {e}"] + return _check_required_fields(meta, _required_nwb_metadata_fields) + class LocalFileAsset(LocalAsset): pass @@ -33,10 +181,58 @@ class LocalFileAsset(LocalAsset): class NWBAsset(LocalFileAsset): EXTENSIONS = [".nwb"] + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> BareAsset: + try: + metadata = nwb2asset(self.filepath, digest=digest, digest_type=digest_type) + except Exception as e: + lgr.warning( + "Failed to extract NWB metadata from %s: %s: %s", + self.filepath, + type(e).__name__, + str(e), + ) + if allow_any_path: + metadata = get_default_metadata( + self.filepath, digest=digest, digest_type=digest_type + ) + else: + raise + metadata.path = self.path + return metadata + + # TODO: @validate_cache.memoize_path + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + return pynwb_validate( + self.filepath, devel_debug=devel_debug + ) + super().get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ) + class GenericAsset(LocalFileAsset): EXTENSIONS = [] + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> BareAsset: + metadata = get_default_metadata( + self.filepath, digest=digest, digest_type=digest_type + ) + metadata.path = self.path + return metadata + class LocalDirectoryAsset(LocalAsset): pass @@ -45,6 +241,53 @@ class LocalDirectoryAsset(LocalAsset): class ZarrAsset(LocalDirectoryAsset): EXTENSIONS = [".ngff", ".zarr"] + def get_metadata( + self, + digest: Optional[str] = None, + digest_type: Optional[str] = None, + allow_any_path: bool = True, + ) -> BareAsset: + raise NotImplementedError + + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + try: + data = zarr.open(self.filepath) + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Error opening %s: %s: %s", + self.filepath, + type(e).__name__, + e, + extra={"validating": True}, + ) + return [str(e)] + if isinstance(data, zarr.Group) and not data: + msg = "Zarr group is empty" + if devel_debug: + raise ValueError(msg) + lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True}) + return [msg] + try: + next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH))) + except StopIteration: + pass + else: + msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep" + if devel_debug: + raise ValueError(msg) + lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True}) + return [msg] + # TODO: Should this be appended to the above errors? + return super().get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ) + def find_dandi_files( dirpath: Union[str, Path], diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index 76c99c241..f9fe7a1ff 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -1,12 +1,15 @@ from operator import attrgetter from pathlib import Path +from dandischema.models import get_schema_version + from ..consts import dandiset_metadata_file from ..files import ( DandisetMetadataFile, GenericAsset, NWBAsset, ZarrAsset, + dandi_file, find_dandi_files, ) @@ -74,3 +77,37 @@ def test_find_dandi_files(tmp_path: Path) -> None: filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" ), ] + + +def test_validate_simple1(simple1_nwb): + # this file should be ok + errors = dandi_file(simple1_nwb).get_validation_errors( + schema_version=get_schema_version() + ) + assert not errors + + +def test_validate_simple2(simple2_nwb): + # this file should be ok + errors = dandi_file(simple2_nwb).get_validation_errors() + assert not errors + + +def test_validate_simple2_new(simple2_nwb): + # this file should be ok + errors = dandi_file(simple2_nwb).get_validation_errors( + schema_version=get_schema_version() + ) + assert not errors + + +def test_validate_bogus(tmp_path): + path = tmp_path / "wannabe.nwb" + path.write_text("not really nwb") + # intended to produce use-case for https://github.com/dandi/dandi-cli/issues/93 + # but it would be tricky, so it is more of a smoke test that + # we do not crash + errors = dandi_file(path).get_validation_errors() + # ATM we would get 2 errors -- since could not be open in two places, + # but that would be too rigid to test. Let's just see that we have expected errors + assert any(e.startswith("Failed to read metadata") for e in errors) diff --git a/dandi/validate.py b/dandi/validate.py index 992e2b81d..16d057442 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -1,11 +1,7 @@ -import os import os.path as op -from pathlib import Path - -import zarr from . import get_logger -from .consts import MAX_ZARR_DEPTH, dandiset_metadata_file +from .consts import dandiset_metadata_file from .metadata import get_metadata from .pynwb_utils import validate as pynwb_validate from .pynwb_utils import validate_cache @@ -159,17 +155,3 @@ def _check_required_fields(d, required): if v in ("REQUIRED", "PLACEHOLDER"): errors += [f"Required field {f!r} has value {v!r}"] return errors - - -def zarr_validate(path: Path) -> None: - data = zarr.open(path) - if isinstance(data, zarr.Group) and not data: - raise ValueError("Empty Zarr groups not permitted") - try: - next(path.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH))) - except StopIteration: - pass - else: - raise ValueError( - f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep" - ) From 96b6c5d72d3ba09cd2cc8da5374a534009fc8b21 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 10:52:57 -0500 Subject: [PATCH 04/56] Use a dedicated type for digest value+algorithm pairs --- dandi/cli/cmd_ls.py | 4 ++-- dandi/files.py | 34 +++++++++++----------------------- dandi/metadata.py | 25 ++++++++++++------------- dandi/misctypes.py | 22 ++++++++++++++++++++++ dandi/upload.py | 4 ++-- dandi/validate.py | 4 ++-- tools/update-assets-on-server | 5 +++-- 7 files changed, 54 insertions(+), 44 deletions(-) create mode 100644 dandi/misctypes.py diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py index a485077c7..bd14f30ec 100644 --- a/dandi/cli/cmd_ls.py +++ b/dandi/cli/cmd_ls.py @@ -6,6 +6,7 @@ from .base import devel_option, lgr, map_to_click_exceptions from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url +from ..misctypes import Digest from ..utils import is_url # TODO: all the recursion options etc @@ -354,8 +355,7 @@ def fn(): rec = nwb2asset( path, schema_version=schema, - digest=digest, - digest_type="dandi_etag", + digest=Digest.dandi_etag(digest), ).json_dict() else: rec = get_metadata(path) diff --git a/dandi/files.py b/dandi/files.py index f4000325c..c4e35153b 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -15,6 +15,7 @@ from .consts import MAX_ZARR_DEPTH, dandiset_metadata_file from .exceptions import UnknownSuffixError from .metadata import get_default_metadata, get_metadata, nwb2asset +from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate from .utils import yaml_load from .validate import _check_required_fields @@ -34,8 +35,7 @@ class DandiFile(ABC): @abstractmethod def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> CommonModel: ... @@ -52,8 +52,7 @@ def get_validation_errors( class DandisetMetadataFile(DandiFile): def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> DandisetMeta: with open(self.filepath) as f: @@ -111,8 +110,7 @@ class LocalAsset(DandiFile): @abstractmethod def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> BareAsset: ... @@ -130,10 +128,7 @@ def get_validation_errors( f"Unsupported schema version: {schema_version}; expected {current_version}" ) try: - asset = self.get_metadata( - digest=32 * "d" + "-1", - digest_type="dandi_etag", - ) + asset = self.get_metadata(digest=DUMMY_DIGEST) BareAsset(**asset.dict()) except ValidationError as e: if devel_debug: @@ -183,12 +178,11 @@ class NWBAsset(LocalFileAsset): def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> BareAsset: try: - metadata = nwb2asset(self.filepath, digest=digest, digest_type=digest_type) + metadata = nwb2asset(self.filepath, digest=digest) except Exception as e: lgr.warning( "Failed to extract NWB metadata from %s: %s: %s", @@ -197,9 +191,7 @@ def get_metadata( str(e), ) if allow_any_path: - metadata = get_default_metadata( - self.filepath, digest=digest, digest_type=digest_type - ) + metadata = get_default_metadata(self.filepath, digest=digest) else: raise metadata.path = self.path @@ -223,13 +215,10 @@ class GenericAsset(LocalFileAsset): def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> BareAsset: - metadata = get_default_metadata( - self.filepath, digest=digest, digest_type=digest_type - ) + metadata = get_default_metadata(self.filepath, digest=digest) metadata.path = self.path return metadata @@ -243,8 +232,7 @@ class ZarrAsset(LocalDirectoryAsset): def get_metadata( self, - digest: Optional[str] = None, - digest_type: Optional[str] = None, + digest: Optional[Digest] = None, allow_any_path: bool = True, ) -> BareAsset: raise NotImplementedError diff --git a/dandi/metadata.py b/dandi/metadata.py index dfae376b8..b7bc72bc6 100644 --- a/dandi/metadata.py +++ b/dandi/metadata.py @@ -3,7 +3,7 @@ import os import os.path as op import re -import typing as ty +from typing import Optional, Tuple from uuid import uuid4 from xml.dom.minidom import parseString @@ -13,6 +13,7 @@ from . import __version__, get_logger from .dandiset import Dandiset +from .misctypes import Digest from .pynwb_utils import ( _get_pynwb_metadata, get_neurodata_types, @@ -388,7 +389,7 @@ def extract_sex(metadata): stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(exp_base=1.25, multiplier=1.25), ) -def parse_purlobourl(url: str, lookup: ty.Optional[ty.Tuple[str, ...]] = None): +def parse_purlobourl(url: str, lookup: Optional[Tuple[str, ...]] = None): """Parse an Ontobee URL to return properties of a Class node :param url: Ontobee URL @@ -786,12 +787,12 @@ def process_ndtypes(asset, nd_types): def get_asset_metadata( - filepath, relpath, digest=None, digest_type=None, allow_any_path=True + filepath, relpath, digest: Optional[Digest] = None, allow_any_path=True ) -> models.BareAsset: metadata = None if op.splitext(filepath)[1] == ".nwb": try: - metadata = nwb2asset(filepath, digest=digest, digest_type=digest_type) + metadata = nwb2asset(filepath, digest=digest) except Exception as e: lgr.warning( "Failed to extract NWB metadata from %s: %s: %s", @@ -802,15 +803,13 @@ def get_asset_metadata( if not allow_any_path: raise if metadata is None: - metadata = get_default_metadata( - filepath, digest=digest, digest_type=digest_type - ) + metadata = get_default_metadata(filepath, digest=digest) metadata.path = str(relpath) return metadata def nwb2asset( - nwb_path, digest=None, digest_type=None, schema_version=None + nwb_path, digest: Optional[Digest] = None, schema_version=None ) -> models.BareAsset: if schema_version is not None: current_version = models.get_schema_version() @@ -821,8 +820,8 @@ def nwb2asset( start_time = datetime.now().astimezone() metadata = get_metadata(nwb_path) if digest is not None: - metadata["digest"] = digest - metadata["digest_type"] = digest_type + metadata["digest"] = digest.value + metadata["digest_type"] = digest.algorithm.name metadata["contentSize"] = op.getsize(nwb_path) metadata["encodingFormat"] = "application/x-nwb" metadata["dateModified"] = get_utcnow_datetime() @@ -841,12 +840,12 @@ def nwb2asset( return asset -def get_default_metadata(path, digest=None, digest_type=None) -> models.BareAsset: +def get_default_metadata(path, digest: Optional[Digest] = None) -> models.BareAsset: start_time = datetime.now().astimezone() if digest is not None: - digest_model = {models.DigestType[digest_type]: digest} + digest_model = digest.asdict() else: - digest_model = [] + digest_model = {} dateModified = get_utcnow_datetime() blobDateModified = ensure_datetime(os.stat(path).st_mtime) if blobDateModified > dateModified: diff --git a/dandi/misctypes.py b/dandi/misctypes.py new file mode 100644 index 000000000..86e1bbf7c --- /dev/null +++ b/dandi/misctypes.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict + +from dandischema.models import DigestType + + +@dataclass +class Digest: + algorithm: DigestType + value: str + + @classmethod + def dandi_etag(cls, value: str) -> Digest: + return cls(algorithm=DigestType.dandi_etag, value=value) + + def asdict(self) -> Dict[DigestType, str]: + return {self.algorithm: self.value} + + +DUMMY_DIGEST = Digest(algorithm=DigestType.dandi_etag, value=32 * "d" + "-1") diff --git a/dandi/upload.py b/dandi/upload.py index 6b6364eb3..93cbd8990 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -9,6 +9,7 @@ from . import lgr from .consts import DRAFT, dandiset_identifier_regex, dandiset_metadata_file from .exceptions import NotFoundError +from .misctypes import Digest from .utils import ensure_datetime, get_instance, pluralize @@ -246,8 +247,7 @@ def process_path(path, relpath): metadata = get_asset_metadata( path, relpath, - digest=file_etag, - digest_type="dandi_etag", + digest=Digest.dandi_etag(file_etag), allow_any_path=allow_any_path, ).json_dict() except Exception as e: diff --git a/dandi/validate.py b/dandi/validate.py index 16d057442..4bd76dcd5 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -3,6 +3,7 @@ from . import get_logger from .consts import dandiset_metadata_file from .metadata import get_metadata +from .misctypes import DUMMY_DIGEST from .pynwb_utils import validate as pynwb_validate from .pynwb_utils import validate_cache from .utils import find_dandi_files, find_files, yaml_load @@ -106,8 +107,7 @@ def validate_asset_file(filepath, schema_version=None, devel_debug=False): asset = get_asset_metadata( filepath, relpath="dummy", - digest=32 * "d" + "-1", - digest_type="dandi_etag", + digest=DUMMY_DIGEST, allow_any_path=True, ) BareAsset(**asset.dict()) diff --git a/tools/update-assets-on-server b/tools/update-assets-on-server index b032e9350..b1e544eac 100755 --- a/tools/update-assets-on-server +++ b/tools/update-assets-on-server @@ -18,6 +18,7 @@ import requests from dandi.dandiapi import DandiAPIClient from dandi.metadata import get_default_metadata, nwb2asset +from dandi.misctypes import Digest from dandi.support.digests import get_digest logging.basicConfig( @@ -37,10 +38,10 @@ def get_meta(path, digest=None): try: if digest is None: digest = get_digest(path, digest="dandi-etag") - localmeta = nwb2asset(path, digest=digest, digest_type="dandi_etag") + localmeta = nwb2asset(path, digest=Digest.dandi_etag(digest)) except Exception as e: ul.error(f"Error {e} getting {path}") - localmeta = get_default_metadata(path, digest=digest, digest_type="dandi_etag") + localmeta = get_default_metadata(path, digest=Digest.dandi_etag(digest)) return localmeta.json_dict() From aaa217f26417a61d713c41a0a61b88fe54dbe57e Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 12:17:11 -0500 Subject: [PATCH 05/56] Let the new find_dandi_files() take multiple paths --- dandi/files.py | 68 +++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index c4e35153b..38339a62f 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -278,46 +278,52 @@ def get_validation_errors( def find_dandi_files( - dirpath: Union[str, Path], - *, + *paths: Union[str, Path], dandiset_path: Optional[Union[str, Path]] = None, allow_all: bool = False, include_metadata: bool = False, ) -> Iterator[DandiFile]: if dandiset_path is None: - dandiset_path = dirpath - else: + if len(paths) == 1 and os.path.isdir(paths[0]): + dandiset_path = paths[0] + else: + raise ValueError( + "dandiset_path must be set when not traversing a single directory" + ) + path_queue = deque() + for p in paths: + p = Path(p) try: - Path(dandiset_path).relative_to(dirpath) + p.relative_to(dandiset_path) except ValueError: - raise ValueError("dirpath must be within dandiset_path") - dirs = deque([Path(dirpath)]) - while dirs: - for p in dirs.popleft().iterdir(): - if p.name.startswith("."): + raise ValueError( + "Path {str(p)!r} is not inside Dandiset path {str(dandiset_path)!r}" + ) + path_queue.append(p) + while path_queue: + p = path_queue.popleft() + if p.name.startswith("."): + continue + if p.is_dir(): + if p.is_symlink(): + lgr.warning("%s: Ignoring unsupported symbolic link to directory", p) continue - if p.is_dir(): - if p.is_symlink(): - lgr.warning( - "%s: Ignoring unsupported symbolic link to directory", p - ) - continue - try: - df = dandi_file(p, dandiset_path) - except UnknownSuffixError: - dirs.append(p) - else: - yield df - else: + try: df = dandi_file(p, dandiset_path) - if isinstance(df, GenericAsset) and not allow_all: - pass - elif isinstance(df, DandisetMetadataFile) and not ( - allow_all or include_metadata - ): - pass - else: - yield df + except UnknownSuffixError: + path_queue.extend(p.iterdir()) + else: + yield df + else: + df = dandi_file(p, dandiset_path) + if isinstance(df, GenericAsset) and not allow_all: + pass + elif isinstance(df, DandisetMetadataFile) and not ( + allow_all or include_metadata + ): + pass + else: + yield df def dandi_file( From 8c71cf6c18dc7b3829518d419ea17aec72830b2a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 12:42:25 -0500 Subject: [PATCH 06/56] Make upload.py use the new find_dandi_files() --- dandi/upload.py | 59 +++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/dandi/upload.py b/dandi/upload.py index 93cbd8990..49bb405ae 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -9,6 +9,7 @@ from . import lgr from .consts import DRAFT, dandiset_identifier_regex, dandiset_metadata_file from .exceptions import NotFoundError +from .files import DandisetMetadataFile, LocalAsset, find_dandi_files from .misctypes import Digest from .utils import ensure_datetime, get_instance, pluralize @@ -56,11 +57,9 @@ def upload( f"convention {dandiset_identifier_regex!r}." ) - from .metadata import get_asset_metadata from .pynwb_utils import ignore_benign_pynwb_warnings from .support.pyout import naturalsize - from .utils import find_dandi_files, find_files, path_is_subpath - from .validate import validate_file + from .utils import path_is_subpath ignore_benign_pynwb_warnings() # so validate doesn't whine @@ -72,21 +71,16 @@ def upload( original_paths = paths # Expand and validate all paths -- they should reside within dandiset - paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths) - paths = list(map(Path, paths)) - npaths = len(paths) - lgr.info(f"Found {npaths} files to consider") - for path in paths: - if not ( - allow_any_path - or path.name == dandiset_metadata_file - or path.name.endswith(".nwb") - ): - raise NotImplementedError( - f"ATM only .nwb and dandiset.yaml should be in the paths to upload. Got {path}" - ) - if not path_is_subpath(str(path.absolute()), dandiset.path): - raise ValueError(f"{path} is not under {dandiset.path}") + paths = [Path(p).absolute() for p in paths] + dandi_files = list( + find_dandi_files( + *paths, + dandiset_path=dandiset.path, + allow_all=allow_any_path, + include_metadata=True, + ) + ) + lgr.info(f"Found {len(dandi_files)} files to consider") # We will keep a shared set of "being processed" paths so # we could limit the number of them until @@ -102,13 +96,12 @@ def skip_file(msg): # TODO: we might want to always yield a full record so no field is not # provided to pyout to cause it to halt - def process_path(path, relpath): + def process_path(dfile, relpath): """ Parameters ---------- - path: Path - Non Pure (OS specific) Path + dfile: DandiFile relpath: For location on server. Will be cast to PurePosixPath @@ -118,7 +111,7 @@ def process_path(path, relpath): Records for pyout """ # Ensure consistent types - path = Path(path) + path = dfile.filepath relpath = PurePosixPath(relpath) try: try: @@ -136,9 +129,9 @@ def process_path(path, relpath): # Validate first, so we do not bother server at all if not kosher # # TODO: enable back validation of dandiset.yaml - if path.name != dandiset_metadata_file and validation != "skip": + if isinstance(dfile, LocalAsset) and validation != "skip": yield {"status": "pre-validating"} - validation_errors = validate_file(path) + validation_errors = dfile.get_validation_errors() yield {"errors": len(validation_errors)} # TODO: split for dandi, pynwb errors if validation_errors: @@ -157,7 +150,7 @@ def process_path(path, relpath): # Special handling for dandiset.yaml # Yarik hates it but that is life for now. TODO # - if path.name == dandiset_metadata_file: + if isinstance(dfile, DandisetMetadataFile): # TODO This is a temporary measure to avoid breaking web UI # dandiset metadata schema assumptions. All edits should happen # online. @@ -244,9 +237,7 @@ def process_path(path, relpath): # ad-hoc for dandiset.yaml for now yield {"status": "extracting metadata"} try: - metadata = get_asset_metadata( - path, - relpath, + metadata = dfile.get_metadata( digest=Digest.dandi_etag(file_etag), allow_any_path=allow_any_path, ).json_dict() @@ -313,24 +304,24 @@ def upload_agg(*ignored): out = pyouts.LogSafeTabular(style=pyout_style, columns=rec_fields, max_workers=jobs) with out: - for path in paths: + for dfile in dandi_files: while len(process_paths) >= 10: lgr.log(2, "Sleep waiting for some paths to finish processing") time.sleep(0.5) - rec = {"path": str(path)} - process_paths.add(str(path)) + rec = {"path": str(dfile.filepath)} + process_paths.add(str(dfile.filepath)) try: - relpath = path.absolute().relative_to(dandiset.path) + relpath = dfile.filepath.relative_to(dandiset.path) rec["path"] = str(relpath) if devel_debug: # DEBUG: do serially - for v in process_path(path, relpath): + for v in process_path(dfile, relpath): print(str(v), flush=True) else: - rec[tuple(rec_fields[1:])] = process_path(path, relpath) + rec[tuple(rec_fields[1:])] = process_path(dfile, relpath) except ValueError as exc: if "does not start with" in str(exc): # if top_path is not the top path for the path From bf1fa3392dd13d2b1b7203c36470b172456c1355 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 12:53:26 -0500 Subject: [PATCH 07/56] Rename get_metadata()'s `allow_any_path` to `ignore_errors` --- dandi/files.py | 14 +++++++------- dandi/upload.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 38339a62f..b30c5b30c 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -36,7 +36,7 @@ class DandiFile(ABC): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> CommonModel: ... @@ -53,7 +53,7 @@ class DandisetMetadataFile(DandiFile): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> DandisetMeta: with open(self.filepath) as f: meta = yaml_load(f, typ="safe") @@ -111,7 +111,7 @@ class LocalAsset(DandiFile): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> BareAsset: ... @@ -179,7 +179,7 @@ class NWBAsset(LocalFileAsset): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> BareAsset: try: metadata = nwb2asset(self.filepath, digest=digest) @@ -190,7 +190,7 @@ def get_metadata( type(e).__name__, str(e), ) - if allow_any_path: + if ignore_errors: metadata = get_default_metadata(self.filepath, digest=digest) else: raise @@ -216,7 +216,7 @@ class GenericAsset(LocalFileAsset): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> BareAsset: metadata = get_default_metadata(self.filepath, digest=digest) metadata.path = self.path @@ -233,7 +233,7 @@ class ZarrAsset(LocalDirectoryAsset): def get_metadata( self, digest: Optional[Digest] = None, - allow_any_path: bool = True, + ignore_errors: bool = True, ) -> BareAsset: raise NotImplementedError diff --git a/dandi/upload.py b/dandi/upload.py index 49bb405ae..32da10bdf 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -239,7 +239,7 @@ def process_path(dfile, relpath): try: metadata = dfile.get_metadata( digest=Digest.dandi_etag(file_etag), - allow_any_path=allow_any_path, + ignore_errors=allow_any_path, ).json_dict() except Exception as e: yield skip_file("failed to extract metadata: %s" % str(e)) From f5f84d89f684909a196abdb70cc305eba11d14dd Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 14:12:48 -0500 Subject: [PATCH 08/56] Abstract out the calculation of local assets' sizes & mtimes --- dandi/files.py | 22 ++++++++++++++++++++-- dandi/upload.py | 5 ++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index b30c5b30c..41ac1528e 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections import deque from dataclasses import dataclass +from datetime import datetime import os from pathlib import Path from typing import Iterator, List, Optional, Union @@ -17,7 +18,7 @@ from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate -from .utils import yaml_load +from .utils import ensure_datetime, yaml_load from .validate import _check_required_fields lgr = get_logger() @@ -32,6 +33,13 @@ class DandiFile(ABC): #: Path to node on disk filepath: Path + def get_size(self) -> int: + return self.filepath.stat().st_size + + def get_mtime(self) -> datetime: + # TODO: Should this be overridden for LocalDirectoryAsset? + return ensure_datetime(self.filepath.stat().st_mtime) + @abstractmethod def get_metadata( self, @@ -224,7 +232,17 @@ def get_metadata( class LocalDirectoryAsset(LocalAsset): - pass + def iterfiles(self) -> Iterator[Path]: + dirs = deque([self.filepath]) + while dirs: + for p in dirs.popleft().iterdir(): + if p.is_dir(): + dirs.append(p) + else: + yield p + + def get_size(self) -> int: + return sum(p.stat().st_size for p in self.iterfiles()) class ZarrAsset(LocalDirectoryAsset): diff --git a/dandi/upload.py b/dandi/upload.py index 32da10bdf..2f4bad4b4 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -115,8 +115,7 @@ def process_path(dfile, relpath): relpath = PurePosixPath(relpath) try: try: - path_stat = path.stat() - yield {"size": path_stat.st_size} + yield {"size": dfile.get_size()} except FileNotFoundError: yield skip_file("ERROR: File not found") return @@ -178,7 +177,7 @@ def process_path(dfile, relpath): extant = None else: metadata = extant.get_raw_metadata() - local_mtime = ensure_datetime(path_stat.st_mtime) + local_mtime = dfile.get_mtime() remote_mtime_str = metadata.get("blobDateModified") d = metadata.get("digest", {}) if "dandi:dandi-etag" in d: From 46f7aff1f71b94ca00d9f5c4d5598358a6fd2287 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 14:27:30 -0500 Subject: [PATCH 09/56] Abstract out calculation of local assets' etags --- dandi/files.py | 12 +++++++++++- dandi/upload.py | 22 ++++++++-------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 41ac1528e..71520977e 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -18,6 +18,7 @@ from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate +from .support.digests import get_digest from .utils import ensure_datetime, yaml_load from .validate import _check_required_fields @@ -115,6 +116,10 @@ class LocalAsset(DandiFile): #: Forward-slash-separated path relative to root of Dandiset path: str + @abstractmethod + def get_etag(self) -> Digest: + ... + @abstractmethod def get_metadata( self, @@ -178,7 +183,9 @@ def get_validation_errors( class LocalFileAsset(LocalAsset): - pass + def get_etag(self) -> Digest: + value = get_digest(self.filepath, digest="dandi-etag") + return Digest.dandi_etag(value) class NWBAsset(LocalFileAsset): @@ -248,6 +255,9 @@ def get_size(self) -> int: class ZarrAsset(LocalDirectoryAsset): EXTENSIONS = [".ngff", ".zarr"] + def get_etag(self) -> Digest: + raise NotImplementedError + def get_metadata( self, digest: Optional[Digest] = None, diff --git a/dandi/upload.py b/dandi/upload.py index 2f4bad4b4..367a9815f 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -10,7 +10,6 @@ from .consts import DRAFT, dandiset_identifier_regex, dandiset_metadata_file from .exceptions import NotFoundError from .files import DandisetMetadataFile, LocalAsset, find_dandi_files -from .misctypes import Digest from .utils import ensure_datetime, get_instance, pluralize @@ -29,7 +28,6 @@ def upload( ): from .dandiapi import DandiAPIClient from .dandiset import APIDandiset, Dandiset - from .support.digests import get_digest dandiset = Dandiset.find(dandiset_path) if not dandiset: @@ -166,7 +164,7 @@ def process_path(dfile, relpath): # yield {"status": "digesting"} try: - file_etag = get_digest(path, digest="dandi-etag") + file_etag = dfile.get_etag() except Exception as exc: yield skip_file("failed to compute digest: %s" % str(exc)) return @@ -179,17 +177,14 @@ def process_path(dfile, relpath): metadata = extant.get_raw_metadata() local_mtime = dfile.get_mtime() remote_mtime_str = metadata.get("blobDateModified") - d = metadata.get("digest", {}) - if "dandi:dandi-etag" in d: - extant_etag = d["dandi:dandi-etag"] - else: - # TODO: Should this error instead? - extant_etag = None + # TODO: Should this error if the digest is missing? + extant_etag = metadata.get("digest", {}).get(file_etag.algorithm.value) if remote_mtime_str is not None: remote_mtime = ensure_datetime(remote_mtime_str) remote_file_status = ( "same" - if extant_etag == file_etag and remote_mtime == local_mtime + if extant_etag == file_etag.value + and remote_mtime == local_mtime else ( "newer" if remote_mtime > local_mtime @@ -210,11 +205,11 @@ def process_path(dfile, relpath): return # Logic below only for overwrite and reupload if existing == "overwrite": - if extant_etag == file_etag: + if extant_etag == file_etag.value: yield skip_file(exists_msg) return elif existing == "refresh": - if extant_etag == file_etag: + if extant_etag == file_etag.value: yield skip_file("file exists") return elif remote_mtime is not None and remote_mtime >= local_mtime: @@ -237,8 +232,7 @@ def process_path(dfile, relpath): yield {"status": "extracting metadata"} try: metadata = dfile.get_metadata( - digest=Digest.dandi_etag(file_etag), - ignore_errors=allow_any_path, + digest=file_etag, ignore_errors=allow_any_path ).json_dict() except Exception as e: yield skip_file("failed to extract metadata: %s" % str(e)) From 19b1fd62b4eba92499803f173230397998271c7b Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 14:30:46 -0500 Subject: [PATCH 10/56] Calculating Zarr metadata --- dandi/consts.py | 3 +++ dandi/files.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dandi/consts.py b/dandi/consts.py index 984dcf880..f508a8e11 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -145,3 +145,6 @@ class EmbargoStatus(Enum): #: Maximum allowed depth of a Zarr directory tree MAX_ZARR_DEPTH = 5 + +#: MIME type assigned to & used to identify Zarr assets +ZARR_MIME_TYPE = "application/x-zarr" diff --git a/dandi/files.py b/dandi/files.py index 71520977e..32e173714 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -13,7 +13,7 @@ import zarr from . import get_logger -from .consts import MAX_ZARR_DEPTH, dandiset_metadata_file +from .consts import MAX_ZARR_DEPTH, ZARR_MIME_TYPE, dandiset_metadata_file from .exceptions import UnknownSuffixError from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest @@ -263,7 +263,10 @@ def get_metadata( digest: Optional[Digest] = None, ignore_errors: bool = True, ) -> BareAsset: - raise NotImplementedError + metadata = get_default_metadata(self.filepath, digest=digest) + metadata.encodingFormat = ZARR_MIME_TYPE + metadata.path = self.path + return metadata def get_validation_errors( self, From 7c19cfc4c1281dc6598a0cd975c9133a97f6ee3e Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 14:49:48 -0500 Subject: [PATCH 11/56] Some code cleanup --- dandi/upload.py | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/dandi/upload.py b/dandi/upload.py index 367a9815f..ca076a08a 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -1,6 +1,6 @@ from functools import reduce import os.path -from pathlib import Path, PurePosixPath +from pathlib import Path import re import time @@ -94,23 +94,19 @@ def skip_file(msg): # TODO: we might want to always yield a full record so no field is not # provided to pyout to cause it to halt - def process_path(dfile, relpath): + def process_path(dfile): """ Parameters ---------- dfile: DandiFile - relpath: - For location on server. Will be cast to PurePosixPath Yields ------ dict Records for pyout """ - # Ensure consistent types - path = dfile.filepath - relpath = PurePosixPath(relpath) + strpath = str(dfile.filepath) try: try: yield {"size": dfile.get_size()} @@ -170,7 +166,7 @@ def process_path(dfile, relpath): return try: - extant = remote_dandiset.get_asset_by_path(str(relpath)) + extant = remote_dandiset.get_asset_by_path(dfile.path) except NotFoundError: extant = None else: @@ -244,11 +240,11 @@ def process_path(dfile, relpath): yield {"status": "uploading"} validating = False for r in remote_dandiset.iter_upload_raw_asset( - path, metadata, jobs=jobs_per_file, replace_asset=extant + dfile.filepath, metadata, jobs=jobs_per_file, replace_asset=extant ): r.pop("asset", None) # to keep pyout from choking if r["status"] == "uploading": - uploaded_paths[str(path)]["size"] = r.pop("current") + uploaded_paths[strpath]["size"] = r.pop("current") yield r elif r["status"] == "post-validating": # Only yield the first "post-validating" status @@ -262,14 +258,14 @@ def process_path(dfile, relpath): except Exception as exc: if devel_debug: raise - lgr.exception("Error uploading %s:", relpath) + lgr.exception("Error uploading %s:", strpath) # Custom formatting for some exceptions we know to extract # user-meaningful message message = str(exc) - uploaded_paths[str(path)]["errors"].append(message) + uploaded_paths[strpath]["errors"].append(message) yield {"status": "ERROR", "message": message} finally: - process_paths.remove(str(path)) + process_paths.remove(strpath) # We will again use pyout to provide a neat table summarizing our progress # with upload etc @@ -302,26 +298,23 @@ def upload_agg(*ignored): lgr.log(2, "Sleep waiting for some paths to finish processing") time.sleep(0.5) - rec = {"path": str(dfile.filepath)} process_paths.add(str(dfile.filepath)) - try: - relpath = dfile.filepath.relative_to(dandiset.path) + if isinstance(dfile, DandisetMetadataFile): + rec = {"path": dandiset_metadata_file} + else: + assert isinstance(dfile, LocalAsset) + rec = {"path": dfile.path} - rec["path"] = str(relpath) + try: if devel_debug: # DEBUG: do serially - for v in process_path(dfile, relpath): + for v in process_path(dfile): print(str(v), flush=True) else: - rec[tuple(rec_fields[1:])] = process_path(dfile, relpath) + rec[tuple(rec_fields[1:])] = process_path(dfile) except ValueError as exc: - if "does not start with" in str(exc): - # if top_path is not the top path for the path - # Provide more concise specific message without path details - rec.update(skip_file("must be a child of top path")) - else: - rec.update(skip_file(exc)) + rec.update(skip_file(exc)) out(rec) if sync: From 891feb10eed6a60e9100b200871782b4bd8361cc Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 7 Jan 2022 15:36:36 -0500 Subject: [PATCH 12/56] Move core asset upload code to files.py --- dandi/dandiapi.py | 210 +++-------------------------- dandi/files.py | 265 ++++++++++++++++++++++++++++++++++++- dandi/tests/test_upload.py | 16 +-- dandi/upload.py | 4 +- 4 files changed, 290 insertions(+), 205 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 8869f94aa..79eb0413b 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -44,18 +44,15 @@ } """ -from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime import json import os.path from pathlib import Path import re -from threading import Lock from time import sleep, time from types import TracebackType from typing import ( Any, - BinaryIO, Callable, ClassVar, Dict, @@ -69,11 +66,9 @@ cast, ) from urllib.parse import urlparse, urlunparse -from xml.etree.ElementTree import fromstring import click from dandischema import models -from dandischema.digests.dandietag import DandiETag from pydantic import BaseModel, Field, PrivateAttr import requests import tenacity @@ -1032,6 +1027,10 @@ def upload_raw_asset( this version of the Dandiset and return the resulting asset. Blocks until the upload is complete. + .. deprecated:: 0.35.0 + Use the ``upload()`` method of `~dandi.files.LocalAsset` instances + instead + :param filepath: the path to the local file to upload :type filepath: str or PathLike :param dict asset_metadata: @@ -1042,12 +1041,11 @@ def upload_raw_asset( :param RemoteAsset replace_asset: If set, replace the given asset, which must have the same path as the new asset """ - for status in self.iter_upload_raw_asset( - filepath, asset_metadata, jobs=jobs, replace_asset=replace_asset - ): - if status["status"] == "done": - return status["asset"] - raise RuntimeError("iter_upload_raw_asset() finished without returning 'done'") + from .files import dandi_file + + return dandi_file(filepath).upload( + self, metadata=asset_metadata, jobs=jobs, replacing=replace_asset + ) def iter_upload_raw_asset( self, @@ -1061,6 +1059,10 @@ def iter_upload_raw_asset( this version of the Dandiset, returning a generator of status `dict`\\s. + .. deprecated:: 0.35.0 + Use the ``iter_upload()`` method of `~dandi.files.LocalAsset` + instances instead + :param filepath: the path to the local file to upload :type filepath: str or PathLike :param dict asset_metadata: @@ -1077,130 +1079,11 @@ def iter_upload_raw_asset( ``"done"`` and an ``"asset"`` key containing the resulting `RemoteAsset`. """ - from .support.digests import get_dandietag - - asset_path = asset_metadata["path"] - yield {"status": "calculating etag"} - etagger = get_dandietag(filepath) - filetag = etagger.as_str() - lgr.debug("Calculated dandi-etag of %s for %s", filetag, filepath) - digest = asset_metadata.get("digest", {}) - if "dandi:dandi-etag" in digest: - if digest["dandi:dandi-etag"] != filetag: - raise RuntimeError( - f"{filepath}: File etag changed; was originally" - f" {digest['dandi:dandi-etag']} but is now {filetag}" - ) - yield {"status": "initiating upload"} - lgr.debug("%s: Beginning upload", asset_path) - total_size = os.path.getsize(filepath) - try: - resp = self.client.post( - "/uploads/initialize/", - json={ - "contentSize": total_size, - "digest": { - "algorithm": "dandi:dandi-etag", - "value": filetag, - }, - "dandiset": self.identifier, - }, - ) - except requests.HTTPError as e: - if e.response.status_code == 409: - lgr.debug("%s: Blob already exists on server", asset_path) - blob_id = e.response.headers["Location"] - else: - raise - else: - upload_id = resp["upload_id"] - parts = resp["parts"] - if len(parts) != etagger.part_qty: - raise RuntimeError( - f"Server and client disagree on number of parts for upload;" - f" server says {len(parts)}, client says {etagger.part_qty}" - ) - parts_out = [] - bytes_uploaded = 0 - lgr.debug("Uploading %s in %d parts", filepath, len(parts)) - with RESTFullAPIClient("http://nil.nil") as storage: - with open(filepath, "rb") as fp: - with ThreadPoolExecutor(max_workers=jobs or 5) as executor: - lock = Lock() - futures = [ - executor.submit( - _upload_part, - storage_session=storage, - fp=fp, - lock=lock, - etagger=etagger, - asset_path=asset_path, - part=part, - ) - for part in parts - ] - for fut in as_completed(futures): - out_part = fut.result() - bytes_uploaded += out_part["size"] - yield { - "status": "uploading", - "upload": 100 * bytes_uploaded / total_size, - "current": bytes_uploaded, - } - parts_out.append(out_part) - lgr.debug("%s: Completing upload", asset_path) - resp = self.client.post( - f"/uploads/{upload_id}/complete/", - json={"parts": parts_out}, - ) - lgr.debug( - "%s: Announcing completion to %s", - asset_path, - resp["complete_url"], - ) - r = storage.post( - resp["complete_url"], data=resp["body"], json_resp=False - ) - lgr.debug( - "%s: Upload completed. Response content: %s", - asset_path, - r.content, - ) - rxml = fromstring(r.text) - m = re.match(r"\{.+?\}", rxml.tag) - ns = m.group(0) if m else "" - final_etag = rxml.findtext(f"{ns}ETag") - if final_etag is not None: - final_etag = final_etag.strip('"') - if final_etag != filetag: - raise RuntimeError( - "Server and client disagree on final ETag of uploaded file;" - f" server says {final_etag}, client says {filetag}" - ) - # else: Error? Warning? - resp = self.client.post(f"/uploads/{upload_id}/validate/") - blob_id = resp["blob_id"] - lgr.debug("%s: Assigning asset blob to dandiset & version", asset_path) - yield {"status": "producing asset"} - if replace_asset is not None: - lgr.debug("%s: Replacing pre-existing asset") - a = RemoteAsset.from_data( - self, - self.client.put( - replace_asset.api_path, - json={"metadata": asset_metadata, "blob_id": blob_id}, - ), - ) - else: - a = RemoteAsset.from_data( - self, - self.client.post( - f"{self.version_api_path}assets/", - json={"metadata": asset_metadata, "blob_id": blob_id}, - ), - ) - lgr.info("%s: Asset successfully uploaded", asset_path) - yield {"status": "done", "asset": a} + from .files import dandi_file + + return dandi_file(filepath).iter_upload( + self, metadata=asset_metadata, jobs=jobs, replacing=replace_asset + ) class BaseRemoteAsset(APIBase): @@ -1501,60 +1384,3 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: def delete(self) -> None: """Delete the asset""" self.client.delete(self.api_path) - - -def _upload_part( - storage_session: RESTFullAPIClient, - fp: BinaryIO, - lock: Lock, - etagger: DandiETag, - asset_path: str, - part: dict, -) -> dict: - etag_part = etagger.get_part(part["part_number"]) - if part["size"] != etag_part.size: - raise RuntimeError( - f"Server and client disagree on size of upload part" - f" {part['part_number']}; server says {part['size']}," - f" client says {etag_part.size}" - ) - with lock: - fp.seek(etag_part.offset) - chunk = fp.read(part["size"]) - if len(chunk) != part["size"]: - raise RuntimeError( - f"End of file {fp.name} reached unexpectedly early:" - f" read {len(chunk)} bytes of out of an expected {part['size']}" - ) - lgr.debug( - "%s: Uploading part %d/%d (%d bytes)", - asset_path, - part["part_number"], - etagger.part_qty, - part["size"], - ) - r = storage_session.put( - part["upload_url"], - data=chunk, - json_resp=False, - retry_statuses=[500], - ) - server_etag = r.headers["ETag"].strip('"') - lgr.debug( - "%s: Part upload finished ETag=%s Content-Length=%s", - asset_path, - server_etag, - r.headers.get("Content-Length"), - ) - client_etag = etagger.get_part_etag(etag_part) - if server_etag != client_etag: - raise RuntimeError( - f"Server and client disagree on ETag of upload part" - f" {part['part_number']}; server says" - f" {server_etag}, client says {client_etag}" - ) - return { - "part_number": part["part_number"], - "size": part["size"], - "etag": server_etag, - } diff --git a/dandi/files.py b/dandi/files.py index 32e173714..42e9274ab 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,24 +1,31 @@ from abc import ABC, abstractmethod from collections import deque +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass from datetime import datetime import os from pathlib import Path -from typing import Iterator, List, Optional, Union +import re +from threading import Lock +from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Union +from xml.etree.ElementTree import fromstring +from dandischema.digests.dandietag import DandiETag from dandischema.models import BareAsset, CommonModel from dandischema.models import Dandiset as DandisetMeta from dandischema.models import get_schema_version from pydantic import ValidationError +import requests import zarr from . import get_logger from .consts import MAX_ZARR_DEPTH, ZARR_MIME_TYPE, dandiset_metadata_file +from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient from .exceptions import UnknownSuffixError from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate -from .support.digests import get_digest +from .support.digests import get_dandietag, get_digest from .utils import ensure_datetime, yaml_load from .validate import _check_required_fields @@ -35,7 +42,7 @@ class DandiFile(ABC): filepath: Path def get_size(self) -> int: - return self.filepath.stat().st_size + return os.path.getsize(self.filepath) def get_mtime(self) -> datetime: # TODO: Should this be overridden for LocalDirectoryAsset? @@ -181,12 +188,198 @@ def get_validation_errors( return [f"Failed to read metadata: {e}"] return _check_required_fields(meta, _required_nwb_metadata_fields) + def upload( + self, + dandiset: RemoteDandiset, + metadata: Dict[str, Any], + jobs: Optional[int] = None, + replacing: Optional[RemoteAsset] = None, + ) -> RemoteAsset: + """ + Upload the file as an asset with the given metadata to the given + Dandiset and return the resulting asset. Blocks until the upload is + complete. + + :dandiset RemoteDandiset: + the Dandiset to which the file will be uploaded + :param dict metadata: + Metadata for the uploaded asset. The "path" field will be set to + the value of the instance's ``path`` attribute if no such field is + already present. + :param int jobs: Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: If set, replace the given asset, which + must have the same path as the new asset + :rtype: RemoteAsset + """ + for status in self.iter_upload( + dandiset, metadata, jobs=jobs, replacing=replacing + ): + if status["status"] == "done": + return status["asset"] + raise AssertionError("iter_upload() finished without returning 'done'") + + @abstractmethod + def iter_upload( + self, + dandiset: RemoteDandiset, + metadata: Dict[str, Any], + jobs: Optional[int] = None, + replacing: Optional[RemoteAsset] = None, + ) -> Iterator[dict]: + ... + class LocalFileAsset(LocalAsset): def get_etag(self) -> Digest: value = get_digest(self.filepath, digest="dandi-etag") return Digest.dandi_etag(value) + def iter_upload( + self, + dandiset: RemoteDandiset, + metadata: Dict[str, Any], + jobs: Optional[int] = None, + replacing: Optional[RemoteAsset] = None, + ) -> Iterator[dict]: + """ + Upload the file as an asset with the given metadata to the given + Dandiset, returning a generator of status `dict`\\s. + + :dandiset RemoteDandiset: + the Dandiset to which the file will be uploaded + :param dict metadata: + Metadata for the uploaded asset. The "path" field will be set to + the value of the instance's ``path`` attribute if no such field is + already present. + :param int jobs: + Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: If set, replace the given asset, which + must have the same path as the new asset + :returns: + A generator of `dict`\\s containing at least a ``"status"`` key. + Upon successful upload, the last `dict` will have a status of + ``"done"`` and an ``"asset"`` key containing the resulting + `RemoteAsset`. + """ + asset_path = metadata.setdefault("path", self.path) + client = dandiset.client + yield {"status": "calculating etag"} + etagger = get_dandietag(self.filepath) + filetag = etagger.as_str() + lgr.debug("Calculated dandi-etag of %s for %s", filetag, self.filepath) + digest = metadata.get("digest", {}) + if "dandi:dandi-etag" in digest: + if digest["dandi:dandi-etag"] != filetag: + raise RuntimeError( + f"{self.filepath}: File etag changed; was originally" + f" {digest['dandi:dandi-etag']} but is now {filetag}" + ) + yield {"status": "initiating upload"} + lgr.debug("%s: Beginning upload", asset_path) + total_size = self.get_size() + try: + resp = client.post( + "/uploads/initialize/", + json={ + "contentSize": total_size, + "digest": { + "algorithm": "dandi:dandi-etag", + "value": filetag, + }, + "dandiset": dandiset.identifier, + }, + ) + except requests.HTTPError as e: + if e.response.status_code == 409: + lgr.debug("%s: Blob already exists on server", asset_path) + blob_id = e.response.headers["Location"] + else: + raise + else: + upload_id = resp["upload_id"] + parts = resp["parts"] + if len(parts) != etagger.part_qty: + raise RuntimeError( + f"Server and client disagree on number of parts for upload;" + f" server says {len(parts)}, client says {etagger.part_qty}" + ) + parts_out = [] + bytes_uploaded = 0 + lgr.debug("Uploading %s in %d parts", self.filepath, len(parts)) + with RESTFullAPIClient("http://nil.nil") as storage: + with self.filepath.open("rb") as fp: + with ThreadPoolExecutor(max_workers=jobs or 5) as executor: + lock = Lock() + futures = [ + executor.submit( + _upload_part, + storage_session=storage, + fp=fp, + lock=lock, + etagger=etagger, + asset_path=asset_path, + part=part, + ) + for part in parts + ] + for fut in as_completed(futures): + out_part = fut.result() + bytes_uploaded += out_part["size"] + yield { + "status": "uploading", + "upload": 100 * bytes_uploaded / total_size, + "current": bytes_uploaded, + } + parts_out.append(out_part) + lgr.debug("%s: Completing upload", asset_path) + resp = client.post( + f"/uploads/{upload_id}/complete/", + json={"parts": parts_out}, + ) + lgr.debug( + "%s: Announcing completion to %s", + asset_path, + resp["complete_url"], + ) + r = storage.post( + resp["complete_url"], data=resp["body"], json_resp=False + ) + lgr.debug( + "%s: Upload completed. Response content: %s", + asset_path, + r.content, + ) + rxml = fromstring(r.text) + m = re.match(r"\{.+?\}", rxml.tag) + ns = m.group(0) if m else "" + final_etag = rxml.findtext(f"{ns}ETag") + if final_etag is not None: + final_etag = final_etag.strip('"') + if final_etag != filetag: + raise RuntimeError( + "Server and client disagree on final ETag of uploaded file;" + f" server says {final_etag}, client says {filetag}" + ) + # else: Error? Warning? + resp = client.post(f"/uploads/{upload_id}/validate/") + blob_id = resp["blob_id"] + lgr.debug("%s: Assigning asset blob to dandiset & version", asset_path) + yield {"status": "producing asset"} + if replacing is not None: + lgr.debug("%s: Replacing pre-existing asset") + r = client.put( + replacing.api_path, + json={"metadata": metadata, "blob_id": blob_id}, + ) + else: + r = client.post( + f"{dandiset.version_api_path}assets/", + json={"metadata": metadata, "blob_id": blob_id}, + ) + a = RemoteAsset.from_data(dandiset, r) + lgr.info("%s: Asset successfully uploaded", asset_path) + yield {"status": "done", "asset": a} + class NWBAsset(LocalFileAsset): EXTENSIONS = [".nwb"] @@ -307,6 +500,15 @@ def get_validation_errors( schema_version=schema_version, devel_debug=devel_debug ) + def iter_upload( + self, + dandiset: RemoteDandiset, + metadata: Dict[str, Any], + jobs: Optional[int] = None, + replacing: Optional[RemoteAsset] = None, + ) -> Iterator[dict]: + raise NotImplementedError + def find_dandi_files( *paths: Union[str, Path], @@ -379,3 +581,60 @@ def dandi_file( if filepath.suffix in fileclass.EXTENSIONS: return fileclass(filepath=filepath, path=path) return GenericAsset(filepath=filepath, path=path) + + +def _upload_part( + storage_session: RESTFullAPIClient, + fp: BinaryIO, + lock: Lock, + etagger: DandiETag, + asset_path: str, + part: dict, +) -> dict: + etag_part = etagger.get_part(part["part_number"]) + if part["size"] != etag_part.size: + raise RuntimeError( + f"Server and client disagree on size of upload part" + f" {part['part_number']}; server says {part['size']}," + f" client says {etag_part.size}" + ) + with lock: + fp.seek(etag_part.offset) + chunk = fp.read(part["size"]) + if len(chunk) != part["size"]: + raise RuntimeError( + f"End of file {fp.name} reached unexpectedly early:" + f" read {len(chunk)} bytes of out of an expected {part['size']}" + ) + lgr.debug( + "%s: Uploading part %d/%d (%d bytes)", + asset_path, + part["part_number"], + etagger.part_qty, + part["size"], + ) + r = storage_session.put( + part["upload_url"], + data=chunk, + json_resp=False, + retry_statuses=[500], + ) + server_etag = r.headers["ETag"].strip('"') + lgr.debug( + "%s: Part upload finished ETag=%s Content-Length=%s", + asset_path, + server_etag, + r.headers.get("Content-Length"), + ) + client_etag = etagger.get_part_etag(etag_part) + if server_etag != client_etag: + raise RuntimeError( + f"Server and client disagree on ETag of upload part" + f" {part['part_number']}; server says" + f" {server_etag}, client says {client_etag}" + ) + return { + "part_number": part["part_number"], + "size": part["size"], + "etag": server_etag, + } diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 2881c9de7..667240d16 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -5,9 +5,9 @@ import pytest from ..consts import DRAFT, dandiset_metadata_file -from ..dandiapi import RemoteDandiset from ..download import download from ..exceptions import NotFoundError +from ..files import LocalFileAsset from ..pynwb_utils import make_nwb_file from ..upload import upload from ..utils import list_paths @@ -54,21 +54,21 @@ def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tm def test_new_upload_extant_existing(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") with pytest.raises(FileExistsError): text_dandiset["reupload"](existing="error") iter_upload_spy.assert_not_called() def test_new_upload_extant_skip(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing="skip") iter_upload_spy.assert_not_called() @pytest.mark.parametrize("existing", ["overwrite", "refresh"]) def test_new_upload_extant_eq_overwrite(existing, mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing=existing) iter_upload_spy.assert_not_called() @@ -77,7 +77,7 @@ def test_new_upload_extant_eq_overwrite(existing, mocker, text_dandiset): def test_new_upload_extant_neq_overwrite(existing, mocker, text_dandiset, tmp_path): dandiset_id = text_dandiset["dandiset_id"] (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing=existing) iter_upload_spy.assert_called() download(text_dandiset["dandiset"].version_api_url, tmp_path) @@ -89,19 +89,19 @@ def test_new_upload_extant_neq_overwrite(existing, mocker, text_dandiset, tmp_pa def test_new_upload_extant_old_refresh(mocker, text_dandiset): (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") os.utime(text_dandiset["dspath"] / "file.txt", times=(0, 0)) - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing="refresh") iter_upload_spy.assert_not_called() def test_new_upload_extant_force(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing="force") iter_upload_spy.assert_called() def test_new_upload_extant_bad_existing(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") text_dandiset["reupload"](existing="foobar") iter_upload_spy.assert_not_called() diff --git a/dandi/upload.py b/dandi/upload.py index ca076a08a..9cdd74080 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -239,8 +239,8 @@ def process_path(dfile): # yield {"status": "uploading"} validating = False - for r in remote_dandiset.iter_upload_raw_asset( - dfile.filepath, metadata, jobs=jobs_per_file, replace_asset=extant + for r in dfile.iter_upload( + remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant ): r.pop("asset", None) # to keep pyout from choking if r["status"] == "uploading": From 0bbe8add73f84fd33b9d56883bc7990d2d56d652 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 09:18:00 -0500 Subject: [PATCH 13/56] Add RemoteBlobAsset and RemoteZarrAsset classes --- dandi/dandiapi.py | 69 ++++++++++++++++++++++++++++-------- dandi/tests/test_dandiapi.py | 1 + 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 79eb0413b..881b7ecea 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -44,6 +44,7 @@ } """ +from abc import ABC, abstractmethod from datetime import datetime import json import os.path @@ -1283,7 +1284,7 @@ def download( fp.write(chunk) -class RemoteAsset(BaseRemoteAsset): +class RemoteAsset(ABC, BaseRemoteAsset): """ Subclass of `BaseRemoteAsset` that includes information about the Dandiset to which the asset belongs. @@ -1307,7 +1308,7 @@ class RemoteAsset(BaseRemoteAsset): @classmethod def from_data( - self, + cls, dandiset: RemoteDandiset, data: Dict[str, Any], metadata: Optional[Dict[str, Any]] = None, @@ -1320,7 +1321,17 @@ def from_data( This is a low-level method that non-developers would normally only use when acquiring data using means outside of this library. """ - return RemoteAsset( + if data.get("blob") is not None: + klass = RemoteBlobAsset + if data.pop("zarr", None) is not None: + raise ValueError("Asset data contains both `blob` and `zarr`'") + elif data.get("zarr") is not None: + klass = RemoteZarrAsset + if data.pop("blob", None) is not None: + raise ValueError("Asset data contains both `blob` and `zarr`'") + else: + raise ValueError("Asset data contains neither `blob` nor `zarr`") + return klass( client=dandiset.client, dandiset_id=dandiset.identifier, version_id=dandiset.version_id, @@ -1359,21 +1370,32 @@ def set_metadata(self, metadata: models.Asset) -> None: """ return self.set_raw_metadata(metadata.json_dict()) + @abstractmethod def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ Set the metadata for the asset on the server to the given value and update the `RemoteAsset` in place. """ - try: - etag = metadata["digest"]["dandi:dandi-etag"] - except KeyError: - raise ValueError("dandi-etag digest not set in new asset metadata") - r = self.client.post( - "/blobs/digest/", - json={"algorithm": "dandi:dandi-etag", "value": etag}, - ) + ... + + def delete(self) -> None: + """Delete the asset""" + self.client.delete(self.api_path) + + +class RemoteBlobAsset(RemoteAsset): + """A `RemoteAsset` whose actual data is a blob resource""" + + #: The ID of the underlying blob resource + blob: str + + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: + """ + Set the metadata for the asset on the server to the given value and + update the `RemoteBlobAsset` in place. + """ data = self.client.put( - self.api_path, json={"metadata": metadata, "blob_id": r["blob_id"]} + self.api_path, json={"metadata": metadata, "blob_id": self.blob} ) self.identifier = data["asset_id"] self.path = data["path"] @@ -1381,6 +1403,23 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: self.modified = ensure_datetime(data["modified"]) self._metadata = data["metadata"] - def delete(self) -> None: - """Delete the asset""" - self.client.delete(self.api_path) + +class RemoteZarrAsset(RemoteAsset): + """A `RemoteAsset` whose actual data is a Zarr resource""" + + #: The ID of the underlying Zarr resource + zarr: str + + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: + """ + Set the metadata for the asset on the server to the given value and + update the `RemoteZarrAsset` in place. + """ + data = self.client.put( + self.api_path, json={"metadata": metadata, "zarr_id": self.zarr} + ) + self.identifier = data["asset_id"] + self.path = data["path"] + self.size = int(data["size"]) + self.modified = ensure_datetime(data["modified"]) + self._metadata = data["metadata"] diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index dc3cddd94..04062e5d2 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -312,6 +312,7 @@ def test_remote_asset_json_dict(text_dandiset): "created": anys.ANY_AWARE_DATETIME_STR, "path": anys.ANY_STR, "size": anys.ANY_INT, + "blob": anys.ANY_STR, } From 362a43207e618634ade38627230f12474a8f5a74 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 09:33:02 -0500 Subject: [PATCH 14/56] Implementation of Zarr checksumming --- dandi/files.py | 1 + dandi/misctypes.py | 4 ++++ dandi/support/digests.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/dandi/files.py b/dandi/files.py index 42e9274ab..1215142cc 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -450,6 +450,7 @@ class ZarrAsset(LocalDirectoryAsset): def get_etag(self) -> Digest: raise NotImplementedError + # return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) def get_metadata( self, diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 86e1bbf7c..10fd9e152 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -15,6 +15,10 @@ class Digest: def dandi_etag(cls, value: str) -> Digest: return cls(algorithm=DigestType.dandi_etag, value=value) + # @classmethod + # def dandi_zarr(cls, value: str) -> Digest: + # return cls(algorithm=DigestType.dandi_zarr_checksum, value=value) + def asdict(self) -> Dict[DigestType, str]: return {self.algorithm: self.value} diff --git a/dandi/support/digests.py b/dandi/support/digests.py index 6656697c9..d57fa9ae2 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -10,7 +10,11 @@ """ import hashlib +import json import logging +from operator import itemgetter +from pathlib import Path +from typing import Optional from dandischema.digests.dandietag import DandiETag from fscacher import PersistentCache @@ -87,3 +91,32 @@ def get_digest(filepath, digest="sha256") -> str: @checksums.memoize_path def get_dandietag(filepath) -> DandiETag: return DandiETag.from_file(filepath) + + +def get_zarr_checksum(dirpath: Path, basepath: Optional[Path] = None) -> str: + if basepath is None: + basepath = dirpath + dirs = [] + files = [] + for p in dirpath.iterdir(): + if p.is_dir(): + dirs.append( + { + "md5": get_zarr_checksum(p, basepath), + "path": p.relative_to(basepath).as_posix(), + } + ) + else: + files.append( + { + "md5": get_digest(p, "md5"), + "path": p.relative_to(basepath).as_posix(), + } + ) + data = { + "directories": sorted(dirs, key=itemgetter("path")), + "files": sorted(files, key=itemgetter("path")), + } + return hashlib.md5( + json.dumps(data, sort_keys=True, ensure_ascii=True).encode("utf-8") + ).hexdigest() From 9b4d78c0c44a48484d776f948f5ad68b81f42bcd Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 09:54:48 -0500 Subject: [PATCH 15/56] Give `BaseRemoteAsset` `is_blob()` and `is_zarr()` methods --- dandi/dandiapi.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 881b7ecea..dd957e4be 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -79,6 +79,7 @@ DRAFT, MAX_CHUNK_SIZE, RETRY_STATUSES, + ZARR_MIME_TYPE, DandiInstance, EmbargoStatus, known_instances, @@ -1283,6 +1284,14 @@ def download( for chunk in downloader(): fp.write(chunk) + def is_blob(self) -> bool: + """Returns true if the asset's actual data is a blob resource""" + return self.get_raw_metadata().get("encodingFormat") != ZARR_MIME_TYPE + + def is_zarr(self) -> bool: + """Returns true if the asset's actual data is a Zarr resource""" + return self.get_raw_metadata().get("encodingFormat") == ZARR_MIME_TYPE + class RemoteAsset(ABC, BaseRemoteAsset): """ @@ -1389,6 +1398,14 @@ class RemoteBlobAsset(RemoteAsset): #: The ID of the underlying blob resource blob: str + def is_blob(self) -> bool: + """Returns true if the asset's actual data is a blob resource""" + return True + + def is_zarr(self) -> bool: + """Returns true if the asset's actual data is a Zarr resource""" + return False + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ Set the metadata for the asset on the server to the given value and @@ -1410,6 +1427,14 @@ class RemoteZarrAsset(RemoteAsset): #: The ID of the underlying Zarr resource zarr: str + def is_blob(self) -> bool: + """Returns true if the asset's actual data is a blob resource""" + return False + + def is_zarr(self) -> bool: + """Returns true if the asset's actual data is a Zarr resource""" + return True + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ Set the metadata for the asset on the server to the given value and From c3c51022a3af07569dc70b0b0a7694cb9ffcf801 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 11:30:32 -0500 Subject: [PATCH 16/56] download --sync: Use new `find_dandi_files()` --- dandi/download.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index f3ac67053..65f2f5c1c 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -16,12 +16,12 @@ from .dandiarchive import DandisetURL, MultiAssetURL, SingleAssetURL, parse_dandi_url from .dandiset import Dandiset from .exceptions import NotFoundError +from .files import DandisetMetadataFile, find_dandi_files from .support.digests import get_digest from .support.pyout import naturalsize from .utils import ( abbrev_prompt, ensure_datetime, - find_files, flattened, is_same_time, on_windows, @@ -127,14 +127,14 @@ def download( f"Unexpected URL type {type(parsed_url).__name__}" ) to_delete = [] - for p in find_files(".*", download_dir, exclude_datalad=True): - if p == op.join(output_path, dandiset_metadata_file): + for df in find_dandi_files(download_dir, allow_all=True): + if isinstance(df, DandisetMetadataFile): continue - a_path = op.normpath(op.join(prefix, op.relpath(p, download_dir))) + a_path = op.normpath(op.join(prefix, df.path)) if on_windows: a_path = a_path.replace("\\", "/") if a_path not in asset_paths: - to_delete.append(p) + to_delete.append(df.filepath) if to_delete: while True: opt = abbrev_prompt( From 580b9cdd2a949666cfdaee25983a25fcf0c619fc Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 09:49:57 -0500 Subject: [PATCH 17/56] dandi_zarr_checksum is now a digest type --- dandi/files.py | 5 ++--- dandi/misctypes.py | 6 +++--- setup.cfg | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 1215142cc..e10ea1415 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -25,7 +25,7 @@ from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate -from .support.digests import get_dandietag, get_digest +from .support.digests import get_dandietag, get_digest, get_zarr_checksum from .utils import ensure_datetime, yaml_load from .validate import _check_required_fields @@ -449,8 +449,7 @@ class ZarrAsset(LocalDirectoryAsset): EXTENSIONS = [".ngff", ".zarr"] def get_etag(self) -> Digest: - raise NotImplementedError - # return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) + return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) def get_metadata( self, diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 10fd9e152..ad4c81e7b 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -15,9 +15,9 @@ class Digest: def dandi_etag(cls, value: str) -> Digest: return cls(algorithm=DigestType.dandi_etag, value=value) - # @classmethod - # def dandi_zarr(cls, value: str) -> Digest: - # return cls(algorithm=DigestType.dandi_zarr_checksum, value=value) + @classmethod + def dandi_zarr(cls, value: str) -> Digest: + return cls(algorithm=DigestType.dandi_zarr_checksum, value=value) def asdict(self) -> Dict[DigestType, str]: return {self.algorithm: self.value} diff --git a/setup.cfg b/setup.cfg index 20d16f184..dbc4b9165 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ install_requires = appdirs click click-didyoumean - dandischema ~= 0.4.1 + dandischema ~= 0.5.0 etelemetry >= 0.2.2 fasteners fscacher From 52995616863062c7d960d776acfa1e47a0dd9394 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 10:02:13 -0500 Subject: [PATCH 18/56] `BaseRemoteAsset.get_etag()` --- dandi/dandiapi.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index dd957e4be..ae6c8919f 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -87,6 +87,7 @@ ) from .exceptions import NotFoundError, SchemaVersionError from .keyring import keyring_lookup +from .misctypes import Digest from .utils import USER_AGENT, check_dandi_version, ensure_datetime, is_interactive lgr = get_logger() @@ -1202,6 +1203,18 @@ def get_digest( except KeyError: raise NotFoundError(f"No {digest_type} digest found in metadata") + def get_etag(self) -> Digest: + """ + Retrieves the DANDI etag digest of the appropriate type for the asset: + a dandi-etag digest for blob resources or a dandi-zarr-checksum for + Zarr resources + """ + if self.is_zarr(): + algorithm = models.DigestType.dandi_zarr_checksum + else: + algorithm = models.DigestType.dandi_etag + return Digest(algorithm=algorithm, value=self.get_digest(algorithm)) + def get_content_url( self, regex: str = r".*", From f1f4e0ff090ca053b4133c1929fade8e00738990 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 10 Jan 2022 12:25:31 -0500 Subject: [PATCH 19/56] `ZarrAsset.iter_upload()` --- dandi/consts.py | 3 ++ dandi/files.py | 139 ++++++++++++++++++++++++++++++++++++++++++++---- dandi/utils.py | 21 +++++++- 3 files changed, 151 insertions(+), 12 deletions(-) diff --git a/dandi/consts.py b/dandi/consts.py index f508a8e11..13642b3ef 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -148,3 +148,6 @@ class EmbargoStatus(Enum): #: MIME type assigned to & used to identify Zarr assets ZARR_MIME_TYPE = "application/x-zarr" + +#: Maximum number of Zarr directory entries to upload at once +ZARR_UPLOAD_BATCH_SIZE = 255 diff --git a/dandi/files.py b/dandi/files.py index e10ea1415..82161c4de 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -19,14 +19,19 @@ import zarr from . import get_logger -from .consts import MAX_ZARR_DEPTH, ZARR_MIME_TYPE, dandiset_metadata_file +from .consts import ( + MAX_ZARR_DEPTH, + ZARR_MIME_TYPE, + ZARR_UPLOAD_BATCH_SIZE, + dandiset_metadata_file, +) from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient from .exceptions import UnknownSuffixError from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate from .support.digests import get_dandietag, get_digest, get_zarr_checksum -from .utils import ensure_datetime, yaml_load +from .utils import chunked, ensure_datetime, pluralize, yaml_load from .validate import _check_required_fields lgr = get_logger() @@ -207,8 +212,9 @@ def upload( the value of the instance's ``path`` attribute if no such field is already present. :param int jobs: Number of threads to use for uploading; defaults to 5 - :param RemoteAsset replacing: If set, replace the given asset, which - must have the same path as the new asset + :param RemoteAsset replacing: + If set, replace the given asset, which must have the same path as + the new asset :rtype: RemoteAsset """ for status in self.iter_upload( @@ -251,10 +257,10 @@ def iter_upload( Metadata for the uploaded asset. The "path" field will be set to the value of the instance's ``path`` attribute if no such field is already present. - :param int jobs: - Number of threads to use for uploading; defaults to 5 - :param RemoteAsset replacing: If set, replace the given asset, which - must have the same path as the new asset + :param int jobs: Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: + If set, replace the given asset, which must have the same path as + the new asset :returns: A generator of `dict`\\s containing at least a ``"status"`` key. Upon successful upload, the last `dict` will have a status of @@ -312,7 +318,7 @@ def iter_upload( lock = Lock() futures = [ executor.submit( - _upload_part, + _upload_blob_part, storage_session=storage, fp=fp, lock=lock, @@ -507,7 +513,110 @@ def iter_upload( jobs: Optional[int] = None, replacing: Optional[RemoteAsset] = None, ) -> Iterator[dict]: - raise NotImplementedError + """ + Upload the Zarr directory as an asset with the given metadata to the + given Dandiset, returning a generator of status `dict`\\s. + + :dandiset RemoteDandiset: + the Dandiset to which the Zarr will be uploaded + :param dict metadata: + Metadata for the uploaded asset. The "path" field will be set to + the value of the instance's ``path`` attribute if no such field is + already present. + :param int jobs: Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: + If set, replace the given asset, which must have the same path as + the new asset + :returns: + A generator of `dict`\\s containing at least a ``"status"`` key. + Upon successful upload, the last `dict` will have a status of + ``"done"`` and an ``"asset"`` key containing the resulting + `RemoteAsset`. + """ + # TODO: Only iterate over the filetree once and save the results in + # memory + asset_path = metadata.setdefault("path", self.path) + client = dandiset.client + yield {"status": "calculating etag"} + filetag = self.get_etag().value + lgr.debug("Calculated dandi-zarr-checksum of %s for %s", filetag, self.filepath) + digest = metadata.get("digest", {}) + if "dandi:dandi-zarr-checksum" in digest: + if digest["dandi:dandi-zarr-checksum"] != filetag: + raise RuntimeError( + f"{self.filepath}: Zarr etag changed; was originally" + f" {digest['dandi:dandi-zarr-checksum']} but is now {filetag}" + ) + yield {"status": "initiating upload"} + lgr.debug("%s: Beginning upload", asset_path) + total_size = self.get_size() + bytes_uploaded = 0 + r = client.post("/zarr/", json={"name": self.filepath.name}) + zarr_id = r["zarr_id"] + with RESTFullAPIClient( + "http://nil.nil", + headers={"X-Amz-ACL": "bucket-owner-full-control"}, + ) as storage: + for i, filebatch in enumerate( + chunked(self.iterfiles(), ZARR_UPLOAD_BATCH_SIZE), start=1 + ): + upload_body = [ + { + "path": p.relative_to(self.filepath).as_posix(), + "etag": get_digest(p, "md5"), + } + for p in filebatch + ] + lgr.debug( + "%s: Uploading Zarr file batch #%d (%s)", + asset_path, + i, + pluralize(len(filebatch), "file"), + ) + r = client.post(f"/zarr/{zarr_id}/upload/", json=upload_body) + with ThreadPoolExecutor(max_workers=jobs or 5) as executor: + futures = [ + executor.submit( + _upload_zarr_file, + storage_session=storage, + path=self.filepath / upspec["path"], + upload_url=upspec["upload_url"], + ) + for upspec in r + ] + for fut in as_completed(futures): + size = fut.result() + bytes_uploaded += size + yield { + "status": "uploading", + "upload": 100 * bytes_uploaded / total_size, + "current": bytes_uploaded, + } + lgr.debug("%s: Completing upload of batch #%d", asset_path, i) + r = client.post(f"/zarr/{zarr_id}/upload/complete/") + lgr.debug("%s: Upload completed", asset_path) + r = client.get(f"/zarr/{zarr_id}/") + if r["checksum"] != filetag: + raise RuntimeError( + "Server and client disagree on final ETag of uploaded Zarr;" + f" server says {r['checksum']}, client says {filetag}" + ) + lgr.debug("%s: Assigning asset blob to dandiset & version", asset_path) + yield {"status": "producing asset"} + if replacing is not None: + lgr.debug("%s: Replacing pre-existing asset") + r = client.put( + replacing.api_path, + json={"metadata": metadata, "zarr_id": zarr_id}, + ) + else: + r = client.post( + f"{dandiset.version_api_path}assets/", + json={"metadata": metadata, "zarr_id": zarr_id}, + ) + a = RemoteAsset.from_data(dandiset, r) + lgr.info("%s: Asset successfully uploaded", asset_path) + yield {"status": "done", "asset": a} def find_dandi_files( @@ -583,7 +692,7 @@ def dandi_file( return GenericAsset(filepath=filepath, path=path) -def _upload_part( +def _upload_blob_part( storage_session: RESTFullAPIClient, fp: BinaryIO, lock: Lock, @@ -638,3 +747,11 @@ def _upload_part( "size": part["size"], "etag": server_etag, } + + +def _upload_zarr_file( + storage_session: RESTFullAPIClient, path: Path, upload_url: str +) -> int: + with path.open("rb") as fp: + storage_session.put(upload_url, data=fp, json_resp=False) + return path.stat().st_size diff --git a/dandi/utils.py b/dandi/utils.py index c0d9c171b..9f25469ef 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -19,7 +19,7 @@ import subprocess import sys import types -from typing import List, Optional, Union +from typing import Iterable, Iterator, List, Optional, TypeVar, Union import dateutil.parser import requests @@ -708,3 +708,22 @@ def check_dandi_version(): exc, ) os.environ["DANDI_NO_ET"] = "1" + + +T = TypeVar("T") + + +def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]: + # cf. chunked() from more-itertools + i = iter(iterable) + while True: + xs = [] + for _ in range(size): + try: + xs.append(next(i)) + except StopIteration: + if xs: + break + else: + return + yield xs From 67a2bb5536d03bb90539eed9798fa377f48266fc Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 11 Jan 2022 12:26:16 -0500 Subject: [PATCH 20/56] Basic test of uploading a Zarr --- dandi/tests/test_files.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index f9fe7a1ff..a57628975 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -2,8 +2,11 @@ from pathlib import Path from dandischema.models import get_schema_version +import numpy as np +import zarr -from ..consts import dandiset_metadata_file +from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file +from ..dandiapi import RemoteZarrAsset from ..files import ( DandisetMetadataFile, GenericAsset, @@ -111,3 +114,23 @@ def test_validate_bogus(tmp_path): # ATM we would get 2 errors -- since could not be open in two places, # but that would be too rigid to test. Let's just see that we have expected errors assert any(e.startswith("Failed to read metadata") for e in errors) + + +def test_upload_zarr(local_dandi_api, tmp_path): + filepath = tmp_path / "example.zarr" + zarr.save(filepath, np.arange(1000), np.arange(1000, 0, -1)) + zf = dandi_file(filepath) + assert isinstance(zf, ZarrAsset) + d = local_dandi_api["client"].create_dandiset("Zarr Dandiset", {}) + asset = zf.upload(d, {"description": "A test Zarr"}) + assert isinstance(asset, RemoteZarrAsset) + assert asset.is_zarr() + assert not asset.is_blob() + assert asset.path == "example.zarr" + md = asset.get_raw_metadata() + assert md["encodingFormat"] == ZARR_MIME_TYPE + assert md["description"] == "A test Zarr" + md["description"] = "A modified Zarr" + asset.set_raw_metadata(md) + md = asset.get_raw_metadata() + assert md["description"] == "A modified Zarr" From f660050fe54c4b377e6bbb3cf62e234804aab21a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 09:49:43 -0500 Subject: [PATCH 21/56] Fix --- dandi/support/digests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dandi/support/digests.py b/dandi/support/digests.py index d57fa9ae2..8d9db0476 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -118,5 +118,7 @@ def get_zarr_checksum(dirpath: Path, basepath: Optional[Path] = None) -> str: "files": sorted(files, key=itemgetter("path")), } return hashlib.md5( - json.dumps(data, sort_keys=True, ensure_ascii=True).encode("utf-8") + json.dumps( + data, sort_keys=True, ensure_ascii=True, separators=(",", ":") + ).encode("utf-8") ).hexdigest() From 8c8e87ef0de81307cc3e30d5b7d92b0b466eacc2 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 10:14:50 -0500 Subject: [PATCH 22/56] Test & fix --- dandi/files.py | 48 ++++++++++++++++++++++---------------- dandi/tests/test_upload.py | 22 +++++++++++++++++ 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 82161c4de..e7bc71ebb 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -177,21 +177,8 @@ def get_validation_errors( return [f"Failed to read metadata: {e}"] return [] else: - # TODO: Only do this for NWB files - # make sure that we have some basic metadata fields we require - try: - meta = get_metadata(self.filepath) - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Failed to read metadata in %s: %s", - self.filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to read metadata: {e}"] - return _check_required_fields(meta, _required_nwb_metadata_fields) + # TODO: Do something else? + return [] def upload( self, @@ -417,11 +404,32 @@ def get_validation_errors( schema_version: Optional[str] = None, devel_debug: bool = False, ) -> List[str]: - return pynwb_validate( - self.filepath, devel_debug=devel_debug - ) + super().get_validation_errors( - schema_version=schema_version, devel_debug=devel_debug - ) + errors = pynwb_validate(self.filepath, devel_debug=devel_debug) + if schema_version is not None: + errors.extend( + super().get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ) + ) + else: + # make sure that we have some basic metadata fields we require + try: + meta = get_metadata(self.filepath) + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Failed to read metadata in %s: %s", + self.filepath, + e, + extra={"validating": True}, + ) + errors.append(f"Failed to read metadata: {e}") + else: + errors.extend( + _check_required_fields(meta, _required_nwb_metadata_fields) + ) + return errors class GenericAsset(LocalFileAsset): diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 667240d16..07255e930 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -1,10 +1,13 @@ import os from pathlib import Path +import numpy as np import pynwb import pytest +import zarr from ..consts import DRAFT, dandiset_metadata_file +from ..dandiapi import RemoteZarrAsset from ..download import download from ..exceptions import NotFoundError from ..files import LocalFileAsset @@ -190,3 +193,22 @@ def test_upload_invalid_metadata( upload(paths=[], dandi_instance=local_dandi_api["instance_id"], devel_debug=True) with pytest.raises(NotFoundError): d.get_asset_by_path(nwb_file) + + +def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): + d = local_dandi_api["client"].create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + (tmp_path / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(tmp_path / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + upload( + paths=[], + dandiset_path=tmp_path, + dandi_instance=local_dandi_api["instance_id"], + devel_debug=True, + ) + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.is_zarr() + assert not asset.is_blob() + assert asset.path == "sample.zarr" From f503ebab66c2ce0e03ce54cd16d5b6badf50d212 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 10:38:48 -0500 Subject: [PATCH 23/56] Make LGTM happy --- dandi/files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandi/files.py b/dandi/files.py index e7bc71ebb..76c94a0bb 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -601,7 +601,7 @@ def iter_upload( "current": bytes_uploaded, } lgr.debug("%s: Completing upload of batch #%d", asset_path, i) - r = client.post(f"/zarr/{zarr_id}/upload/complete/") + client.post(f"/zarr/{zarr_id}/upload/complete/") lgr.debug("%s: Upload completed", asset_path) r = client.get(f"/zarr/{zarr_id}/") if r["checksum"] != filetag: From ab9ffb0b775723751498d2cea0d8a3fea90750ce Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 10:44:34 -0500 Subject: [PATCH 24/56] Add a zarr_dandiset fixture --- dandi/tests/fixtures.py | 118 +++++++++++++++++++++------- dandi/tests/test_dandiapi.py | 104 ++++++++++++------------- dandi/tests/test_dandiarchive.py | 61 +++++++-------- dandi/tests/test_delete.py | 128 +++++++++++++++---------------- dandi/tests/test_download.py | 68 ++++++++-------- dandi/tests/test_files.py | 2 +- dandi/tests/test_keyring.py | 6 +- dandi/tests/test_upload.py | 70 ++++++++--------- 8 files changed, 301 insertions(+), 256 deletions(-) diff --git a/dandi/tests/fixtures.py b/dandi/tests/fixtures.py index 5c4c5844c..a6bdfe9a9 100644 --- a/dandi/tests/fixtures.py +++ b/dandi/tests/fixtures.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass, field from datetime import datetime import logging import os @@ -7,20 +8,23 @@ from subprocess import DEVNULL, check_output, run import tempfile from time import sleep +from typing import Any, Dict from uuid import uuid4 from click.testing import CliRunner from dandischema.consts import DANDI_SCHEMA_VERSION from dateutil.tz import tzutc +import numpy as np import pynwb import pytest import requests +import zarr from .skip import skipif from .. import get_logger from ..cli.command import organize -from ..consts import dandiset_metadata_file, known_instances -from ..dandiapi import DandiAPIClient +from ..consts import DandiInstance, dandiset_metadata_file, known_instances +from ..dandiapi import DandiAPIClient, RemoteDandiset from ..pynwb_utils import make_nwb_file, metadata_nwb_file_fields from ..upload import upload @@ -241,23 +245,59 @@ def docker_compose_setup(): run(["docker-compose", "down", "-v"], cwd=str(LOCAL_DOCKER_DIR), check=True) +@dataclass +class DandiAPI: + api_key: str + client: DandiAPIClient + instance: DandiInstance + instance_id: str + + @property + def api_url(self) -> str: + return self.instance.api + + @pytest.fixture(scope="session") def local_dandi_api(docker_compose_setup): instance_id = "dandi-api-local-docker-tests" instance = known_instances[instance_id] api_key = docker_compose_setup["django_api_key"] with DandiAPIClient(api_url=instance.api, token=api_key) as client: - yield { - "api_key": api_key, - "client": client, - "instance": instance, - "instance_id": instance_id, - } + yield DandiAPI( + api_key=api_key, + client=client, + instance=instance, + instance_id=instance_id, + ) + + +@dataclass +class SampleDandiset: + api: DandiAPI + dspath: Path + dandiset: RemoteDandiset + dandiset_id: str + upload_kwargs: Dict[str, Any] = field(default_factory=dict) + + @property + def client(self) -> DandiAPIClient: + return self.api.client + + def upload(self, paths=None, **kwargs) -> None: + with pytest.MonkeyPatch().context() as m: + m.setenv("DANDI_API_KEY", self.api.api_key) + upload( + paths=paths or [], + dandiset_path=self.dspath, + dandi_instance=self.api.instance_id, + devel_debug=True, + **{**self.upload_kwargs, **kwargs}, + ) @pytest.fixture() def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): - d = local_dandi_api["client"].create_dandiset( + d = local_dandi_api.client.create_dandiset( "Text Dandiset", { "schemaKey": "Dandiset", @@ -283,28 +323,48 @@ def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): (dspath / "subdir2").mkdir() (dspath / "subdir2" / "banana.txt").write_text("Banana\n") (dspath / "subdir2" / "coconut.txt").write_text("Coconut\n") + td = SampleDandiset( + api=local_dandi_api, + dspath=dspath, + dandiset=d, + dandiset_id=dandiset_id, + upload_kwargs={"allow_any_path": True, "validation": "skip"}, + ) + td.upload() + return td - def upload_dandiset(paths=None, **kwargs): - with monkeypatch.context() as m: - m.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - upload( - paths=paths or [], - dandiset_path=dspath, - dandi_instance=local_dandi_api["instance_id"], - devel_debug=True, - allow_any_path=True, - validation="skip", - **kwargs, - ) - upload_dandiset() - return { - "client": local_dandi_api["client"], - "dspath": dspath, - "dandiset": d, - "dandiset_id": dandiset_id, - "reupload": upload_dandiset, - } +@pytest.fixture() +def zarr_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): + d = local_dandi_api.client.create_dandiset( + "Zarr Dandiset", + { + "schemaKey": "Dandiset", + "name": "Zarr Dandiset", + "description": "A test Zarr Dandiset", + "contributor": [ + { + "schemaKey": "Person", + "name": "Wodder, John", + "roleName": ["dcite:Author", "dcite:ContactPerson"], + } + ], + "license": ["spdx:CC0-1.0"], + "manifestLocation": ["https://github.com/dandi/dandi-cli"], + }, + ) + dandiset_id = d.identifier + dspath = tmp_path_factory.mktemp("zarr_dandiset") + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + td = SampleDandiset( + api=local_dandi_api, + dspath=dspath, + dandiset=d, + dandiset_id=dandiset_id, + ) + td.upload() + return td @pytest.fixture() diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index 04062e5d2..ad7a1fa45 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -28,7 +28,7 @@ def test_upload(local_dandi_api, simple1_nwb, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset(name="Upload Test", metadata={}) assert d.version_id == DRAFT d.upload_raw_asset(simple1_nwb, {"path": "testing/simple1.nwb"}) @@ -41,7 +41,7 @@ def test_upload(local_dandi_api, simple1_nwb, tmp_path): def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset( "Test Dandiset", { @@ -67,10 +67,10 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir").mkdir() (upload_dir / "subdir" / "file.txt").write_text("This is test text.\n") monkeypatch.chdir(upload_dir) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, validation="skip", @@ -96,7 +96,7 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir" / "file.txt").write_text("This is different text.\n") upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, validation="skip", @@ -109,7 +109,7 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir" / "file2.txt").write_text("This is more text.\n") upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, validation="skip", @@ -147,7 +147,7 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): def test_get_asset_metadata(local_dandi_api, simple1_nwb): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset(name="Include Metadata Test", metadata={}) d.upload_raw_asset(simple1_nwb, {"path": "testing/simple1.nwb", "foo": "bar"}) (asset,) = d.get_assets() @@ -158,7 +158,7 @@ def test_get_asset_metadata(local_dandi_api, simple1_nwb): def test_large_upload(local_dandi_api, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client asset_file = tmp_path / "asset.dat" meg = bytes(random.choices(range(256), k=1 << 20)) with asset_file.open("wb") as fp: @@ -169,9 +169,9 @@ def test_large_upload(local_dandi_api, tmp_path): def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatch): - good_key = local_dandi_api["api_key"] + good_key = local_dandi_api.api_key bad_key = "1234567890" - client_name = local_dandi_api["instance_id"] + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -186,7 +186,7 @@ def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatc monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -201,8 +201,8 @@ def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatc def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): - good_key = local_dandi_api["api_key"] - client_name = local_dandi_api["instance_id"] + good_key = local_dandi_api.api_key + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -215,7 +215,7 @@ def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -230,9 +230,9 @@ def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): def test_authenticate_bad_key_keyring_good_key_input( local_dandi_api, mocker, monkeypatch ): - good_key = local_dandi_api["api_key"] + good_key = local_dandi_api.api_key bad_key = "1234567890" - client_name = local_dandi_api["instance_id"] + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -247,7 +247,7 @@ def test_authenticate_bad_key_keyring_good_key_input( monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -305,7 +305,7 @@ def test_get_content_url_follow_one_redirects_strip_query(): def test_remote_asset_json_dict(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") assert asset.json_dict() == { "asset_id": anys.ANY_STR, "modified": anys.ANY_AWARE_DATETIME_STR, @@ -343,16 +343,14 @@ def test_check_schema_version_mismatch(): def test_get_dandisets(text_dandiset): - dandisets = list(text_dandiset["client"].get_dandisets()) - assert ( - sum(1 for d in dandisets if d.identifier == text_dandiset["dandiset_id"]) == 1 - ) + dandisets = list(text_dandiset.client.get_dandisets()) + assert sum(1 for d in dandisets if d.identifier == text_dandiset.dandiset_id) == 1 def test_get_dandiset_lazy(mocker, text_dandiset): - client = text_dandiset["client"] + client = text_dandiset.client get_spy = mocker.spy(client, "get") - dandiset = client.get_dandiset(text_dandiset["dandiset_id"], DRAFT, lazy=True) + dandiset = client.get_dandiset(text_dandiset.dandiset_id, DRAFT, lazy=True) get_spy.assert_not_called() assert dandiset.version_id == DRAFT get_spy.assert_not_called() @@ -370,9 +368,9 @@ def test_get_dandiset_lazy(mocker, text_dandiset): def test_get_dandiset_non_lazy(mocker, text_dandiset): - client = text_dandiset["client"] + client = text_dandiset.client get_spy = mocker.spy(client, "get") - dandiset = client.get_dandiset(text_dandiset["dandiset_id"], DRAFT, lazy=False) + dandiset = client.get_dandiset(text_dandiset.dandiset_id, DRAFT, lazy=False) get_spy.assert_called_once() get_spy.reset_mock() assert dandiset.version_id == DRAFT @@ -391,9 +389,7 @@ def test_get_dandiset_non_lazy(mocker, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_no_version_id(lazy, text_dandiset): - dandiset = text_dandiset["client"].get_dandiset( - text_dandiset["dandiset_id"], lazy=lazy - ) + dandiset = text_dandiset.client.get_dandiset(text_dandiset.dandiset_id, lazy=lazy) assert dandiset.version_id == DRAFT assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -410,10 +406,10 @@ def test_get_dandiset_no_version_id(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, v, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, v, lazy=lazy) assert dandiset.version_id == v assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -431,10 +427,10 @@ def test_get_dandiset_published(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_no_version_id(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, lazy=lazy) assert dandiset.version_id == v assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -452,10 +448,10 @@ def test_get_dandiset_published_no_version_id(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_draft(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, DRAFT, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, DRAFT, lazy=lazy) assert dandiset.version_id == DRAFT assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -473,17 +469,17 @@ def test_get_dandiset_published_draft(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_other_version(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v1 = d.publish().version.identifier - (text_dandiset["dspath"] / "file2.txt").write_text("This is more text.\n") - text_dandiset["reupload"]() + (text_dandiset.dspath / "file2.txt").write_text("This is more text.\n") + text_dandiset.upload() d.wait_until_valid() v2 = d.publish().version.identifier assert v1 != v2 - dandiset = text_dandiset["client"].get_dandiset(d.identifier, v1, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, v1, lazy=lazy) assert dandiset.version_id == v1 assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -501,7 +497,7 @@ def test_get_dandiset_published_other_version(lazy, text_dandiset): def test_set_asset_metadata(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") md = asset.get_metadata() md.blobDateModified = datetime(2038, 1, 19, 3, 14, 7, tzinfo=timezone.utc) asset.set_metadata(md) @@ -509,7 +505,7 @@ def test_set_asset_metadata(text_dandiset): def test_remote_dandiset_json_dict(text_dandiset): - data = text_dandiset["dandiset"].json_dict() + data = text_dandiset.dandiset.json_dict() assert data == { "identifier": anys.AnyFullmatch(dandiset_identifier_regex), "created": anys.ANY_AWARE_DATETIME_STR, @@ -531,7 +527,7 @@ def test_remote_dandiset_json_dict(text_dandiset): def test_set_dandiset_metadata(text_dandiset): - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset md = dandiset.get_metadata() md.description = "A test Dandiset with altered metadata" dandiset.set_metadata(md) @@ -549,19 +545,19 @@ def test_set_dandiset_metadata(text_dandiset): ], ) def test_get_digest(digest_type, digest_regex, text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") d = asset.get_digest(digest_type) assert re.fullmatch(digest_regex, d) def test_get_digest_nonexistent(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") with pytest.raises(NotFoundError): asset.get_digest("md5") def test_refresh(text_dandiset): - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset mtime = dandiset.version.modified md = dandiset.get_metadata() md.description = "A test Dandiset with altered metadata" @@ -577,12 +573,12 @@ def test_refresh(text_dandiset): def test_get_asset_with_and_without_metadata(mocker, text_dandiset): - path_asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") - id_asset = text_dandiset["dandiset"].get_asset(path_asset.identifier) + path_asset = text_dandiset.dandiset.get_asset_by_path("file.txt") + id_asset = text_dandiset.dandiset.get_asset(path_asset.identifier) assert path_asset == id_asset assert path_asset._metadata is None assert id_asset._metadata is not None - get_spy = mocker.spy(text_dandiset["client"], "get") + get_spy = mocker.spy(text_dandiset.client, "get") id_metadata = id_asset.get_raw_metadata() get_spy.assert_not_called() path_metadata = path_asset.get_raw_metadata() @@ -618,31 +614,31 @@ def test_retry_logging(caplog): def test_get_assets_order(text_dandiset): assert [ - asset.path for asset in text_dandiset["dandiset"].get_assets(order="path") + asset.path for asset in text_dandiset.dandiset.get_assets(order="path") ] == ["file.txt", "subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert [ - asset.path for asset in text_dandiset["dandiset"].get_assets(order="-path") + asset.path for asset in text_dandiset.dandiset.get_assets(order="-path") ] == ["subdir2/coconut.txt", "subdir2/banana.txt", "subdir1/apple.txt", "file.txt"] def test_get_assets_with_path_prefix(text_dandiset): assert sorted( asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix("subdir") + for asset in text_dandiset.dandiset.get_assets_with_path_prefix("subdir") ) == ["subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert sorted( asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix("subdir2") + for asset in text_dandiset.dandiset.get_assets_with_path_prefix("subdir2") ) == ["subdir2/banana.txt", "subdir2/coconut.txt"] assert [ asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix( + for asset in text_dandiset.dandiset.get_assets_with_path_prefix( "subdir", order="path" ) ] == ["subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert [ asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix( + for asset in text_dandiset.dandiset.get_assets_with_path_prefix( "subdir", order="-path" ) ] == ["subdir2/coconut.txt", "subdir2/banana.txt", "subdir1/apple.txt"] diff --git a/dandi/tests/test_dandiarchive.py b/dandi/tests/test_dandiarchive.py index 46828d30b..d5194c86b 100644 --- a/dandi/tests/test_dandiarchive.py +++ b/dandi/tests/test_dandiarchive.py @@ -360,9 +360,9 @@ def test_parse_gui_new_redirect(): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset(local_dandi_api, version_suffix): - url = f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}" + url = f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client parsed_url.get_dandiset(client) # No error with pytest.raises(NotFoundError) as excinfo: parsed_url.get_dandiset(client, lazy=False) @@ -376,41 +376,38 @@ def test_get_nonexistent_dandiset(local_dandi_api, version_suffix): @pytest.mark.parametrize("version", ["draft", "0.999999.9999"]) def test_get_nonexistent_dandiset_asset_id(local_dandi_api, version): url = ( - f"{local_dandi_api['instance'].api}/dandisets/999999/versions/{version}" + f"{local_dandi_api.api_url}/dandisets/999999/versions/{version}" "/assets/00000000-0000-0000-0000-000000000000/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_dandiset_nonexistent_asset_id(local_dandi_api, text_dandiset): +def test_get_dandiset_nonexistent_asset_id(text_dandiset): url = ( - f"{local_dandi_api['instance'].api}/dandisets/" - f"{text_dandiset['dandiset_id']}/versions/draft/assets/" + f"{text_dandiset.api.api_url}/dandisets/" + f"{text_dandiset.dandiset_id}/versions/draft/assets/" "00000000-0000-0000-0000-000000000000/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == ( "No such asset: '00000000-0000-0000-0000-000000000000' for" - f" DANDI-API-LOCAL-DOCKER-TESTS:{text_dandiset['dandiset_id']}/draft" + f" DANDI-API-LOCAL-DOCKER-TESTS:{text_dandiset.dandiset_id}/draft" ) def test_get_nonexistent_asset_id(local_dandi_api): - url = ( - f"{local_dandi_api['instance'].api}/assets/" - "00000000-0000-0000-0000-000000000000/" - ) + url = f"{local_dandi_api.api_url}/assets/00000000-0000-0000-0000-000000000000/" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) @@ -419,22 +416,22 @@ def test_get_nonexistent_asset_id(local_dandi_api): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset_asset_path(local_dandi_api, version_suffix): - url = f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}/does/not/exist" + url = f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}/does/not/exist" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_path(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_path(text_dandiset): url = ( - f"dandi://{local_dandi_api['instance_id']}/" - f"{text_dandiset['dandiset_id']}/does/not/exist" + f"dandi://{text_dandiset.api.instance_id}/" + f"{text_dandiset.dandiset_id}/does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) @@ -444,24 +441,24 @@ def test_get_nonexistent_asset_path(local_dandi_api, text_dandiset): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset_asset_folder(local_dandi_api, version_suffix): url = ( - f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}" + f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}" "/does/not/exist/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_folder(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_folder(text_dandiset): url = ( - f"dandi://{local_dandi_api['instance_id']}/" - f"{text_dandiset['dandiset_id']}/does/not/exist/" + f"dandi://{text_dandiset.api.instance_id}/" + f"{text_dandiset.dandiset_id}/does/not/exist/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] assert list(parsed_url.get_assets(client, strict=True)) == [] @@ -469,23 +466,23 @@ def test_get_nonexistent_asset_folder(local_dandi_api, text_dandiset): @pytest.mark.parametrize("version", ["draft", "0.999999.9999"]) def test_get_nonexistent_dandiset_asset_prefix(local_dandi_api, version): url = ( - f"{local_dandi_api['instance'].api}/dandisets/999999/versions/{version}" + f"{local_dandi_api.api_url}/dandisets/999999/versions/{version}" "/assets/?path=does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_prefix(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_prefix(text_dandiset): url = ( - f"{local_dandi_api['instance'].api}/dandisets/" - f"{text_dandiset['dandiset_id']}/versions/draft/assets/?path=does/not/exist" + f"{text_dandiset.api.api_url}/dandisets/" + f"{text_dandiset.dandiset_id}/versions/draft/assets/?path=does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] assert list(parsed_url.get_assets(client, strict=True)) == [] diff --git a/dandi/tests/test_delete.py b/dandi/tests/test_delete.py index e5fa7128b..087e56a12 100644 --- a/dandi/tests/test_delete.py +++ b/dandi/tests/test_delete.py @@ -54,13 +54,11 @@ ), ], ) -def test_delete_paths( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path, paths, remainder -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_paths(mocker, monkeypatch, text_dandiset, tmp_path, paths, remainder): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [p.format(instance=instance, dandiset_id=dandiset_id) for p in paths], @@ -69,20 +67,18 @@ def test_delete_paths( force=True, ) delete_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / f for f in ["dandiset.yaml"] + remainder ] @pytest.mark.parametrize("confirm", [True, False]) -def test_delete_path_confirm( - confirm, local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_path_confirm(confirm, mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") confirm_mock = mocker.patch("click.confirm", return_value=confirm) delete(["subdir2/coconut.txt"], dandi_instance=instance, devel_debug=True) @@ -95,10 +91,10 @@ def test_delete_path_confirm( delete_spy.assert_not_called() -def test_delete_path_pyout(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] +def test_delete_path_pyout(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete(["subdir2/coconut.txt"], dandi_instance=instance, force=True) delete_spy.assert_called() @@ -120,11 +116,11 @@ def test_delete_path_pyout(local_dandi_api, mocker, monkeypatch, text_dandiset): ], ], ) -def test_delete_dandiset(local_dandi_api, mocker, monkeypatch, text_dandiset, paths): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset(mocker, monkeypatch, text_dandiset, paths): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [p.format(instance=instance, dandiset_id=dandiset_id) for p in paths], @@ -134,17 +130,15 @@ def test_delete_dandiset(local_dandi_api, mocker, monkeypatch, text_dandiset, pa ) delete_spy.assert_called() with pytest.raises(NotFoundError): - local_dandi_api["client"].get_dandiset(dandiset_id, DRAFT, lazy=False) + text_dandiset.client.get_dandiset(dandiset_id, DRAFT, lazy=False) @pytest.mark.parametrize("confirm", [True, False]) -def test_delete_dandiset_confirm( - confirm, local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset_confirm(confirm, mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") confirm_mock = mocker.patch("click.confirm", return_value=confirm) delete( @@ -157,11 +151,11 @@ def test_delete_dandiset_confirm( delete_spy.assert_not_called() -def test_delete_dandiset_mismatch(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset_mismatch(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id not_dandiset = str(int(dandiset_id) - 1).zfill(6) delete_spy = mocker.spy(RESTFullAPIClient, "delete") for paths in [ @@ -182,11 +176,11 @@ def test_delete_dandiset_mismatch(local_dandi_api, mocker, monkeypatch, text_dan delete_spy.assert_not_called() -def test_delete_instance_mismatch(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_instance_mismatch(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") for paths in [ [ @@ -208,8 +202,8 @@ def test_delete_instance_mismatch(local_dandi_api, mocker, monkeypatch, text_dan def test_delete_nonexistent_dandiset(local_dandi_api, mocker, monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: delete( @@ -223,8 +217,8 @@ def test_delete_nonexistent_dandiset(local_dandi_api, mocker, monkeypatch): def test_delete_nonexistent_dandiset_skip_missing(local_dandi_api, mocker, monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [f"dandi://{instance}/999999/subdir1/apple.txt"], @@ -236,10 +230,10 @@ def test_delete_nonexistent_dandiset_skip_missing(local_dandi_api, mocker, monke delete_spy.assert_not_called() -def test_delete_nonexistent_asset(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_nonexistent_asset(mocker, monkeypatch, text_dandiset): + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: delete( @@ -259,11 +253,11 @@ def test_delete_nonexistent_asset(local_dandi_api, mocker, monkeypatch, text_dan def test_delete_nonexistent_asset_skip_missing( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path + mocker, monkeypatch, text_dandiset, tmp_path ): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [ @@ -276,7 +270,7 @@ def test_delete_nonexistent_asset_skip_missing( skip_missing=True, ) delete_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / "dandiset.yaml", tmp_path / dandiset_id / "subdir1" / "apple.txt", @@ -285,12 +279,10 @@ def test_delete_nonexistent_asset_skip_missing( ] -def test_delete_nonexistent_asset_folder( - local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_nonexistent_asset_folder(mocker, monkeypatch, text_dandiset): + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: delete( @@ -310,11 +302,11 @@ def test_delete_nonexistent_asset_folder( def test_delete_nonexistent_asset_folder_skip_missing( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path + mocker, monkeypatch, text_dandiset, tmp_path ): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [ @@ -327,7 +319,7 @@ def test_delete_nonexistent_asset_folder_skip_missing( skip_missing=True, ) delete_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / "dandiset.yaml", tmp_path / dandiset_id / "file.txt", @@ -337,8 +329,8 @@ def test_delete_nonexistent_asset_folder_skip_missing( def test_delete_version(local_dandi_api, mocker, monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotImplementedError) as excinfo: delete( diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 7e76ebf14..c89ab0e9e 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -118,23 +118,23 @@ def test_download_000027_resume(tmp_path, resizer, version): def test_download_newest_version(text_dandiset, tmp_path): - dandiset = text_dandiset["dandiset"] - dandiset_id = text_dandiset["dandiset_id"] + dandiset = text_dandiset.dandiset + dandiset_id = text_dandiset.dandiset_id download(dandiset.api_url, tmp_path) assert (tmp_path / dandiset_id / "file.txt").read_text() == "This is test text.\n" dandiset.wait_until_valid() dandiset.publish() - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - text_dandiset["reupload"]() + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") + text_dandiset.upload() rmtree(tmp_path / dandiset_id) download(dandiset.api_url, tmp_path) assert (tmp_path / dandiset_id / "file.txt").read_text() == "This is test text.\n" -def test_download_folder(local_dandi_api, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] +def test_download_folder(text_dandiset, tmp_path): + dandiset_id = text_dandiset.dandiset_id download( - f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}/subdir2/", tmp_path + f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/", tmp_path ) assert list_paths(tmp_path, dirs=True) == [ tmp_path / "subdir2", @@ -145,10 +145,10 @@ def test_download_folder(local_dandi_api, text_dandiset, tmp_path): assert (tmp_path / "subdir2" / "coconut.txt").read_text() == "Coconut\n" -def test_download_item(local_dandi_api, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] +def test_download_item(text_dandiset, tmp_path): + dandiset_id = text_dandiset.dandiset_id download( - f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}/subdir2/coconut.txt", + f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/coconut.txt", tmp_path, ) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] @@ -156,29 +156,29 @@ def test_download_item(local_dandi_api, text_dandiset, tmp_path): def test_download_asset_id(text_dandiset, tmp_path): - asset = text_dandiset["dandiset"].get_asset_by_path("subdir2/coconut.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir2/coconut.txt") download(asset.download_url, tmp_path) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] assert (tmp_path / "coconut.txt").read_text() == "Coconut\n" def test_download_asset_id_only(text_dandiset, tmp_path): - asset = text_dandiset["dandiset"].get_asset_by_path("subdir2/coconut.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir2/coconut.txt") download(asset.base_download_url, tmp_path) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] assert (tmp_path / "coconut.txt").read_text() == "Coconut\n" @pytest.mark.parametrize("confirm", [True, False]) -def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path): - text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - dspath = tmp_path / text_dandiset["dandiset_id"] - os.rename(text_dandiset["dspath"], dspath) +def test_download_sync(confirm, mocker, text_dandiset, tmp_path): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + dspath = tmp_path / text_dandiset.dandiset_id + os.rename(text_dandiset.dspath, dspath) confirm_mock = mocker.patch( "dandi.download.abbrev_prompt", return_value="yes" if confirm else "no" ) download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}", + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, existing="overwrite", sync=True, @@ -190,28 +190,28 @@ def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path assert (dspath / "file.txt").exists() -def test_download_sync_folder(local_dandi_api, mocker, text_dandiset): - text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - text_dandiset["dandiset"].get_asset_by_path("subdir2/banana.txt").delete() +def test_download_sync_folder(mocker, text_dandiset): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt").delete() confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes") download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}/subdir2/", - text_dandiset["dspath"], + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}/subdir2/", + text_dandiset.dspath, existing="overwrite", sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") - assert (text_dandiset["dspath"] / "file.txt").exists() - assert not (text_dandiset["dspath"] / "subdir2" / "banana.txt").exists() + assert (text_dandiset.dspath / "file.txt").exists() + assert not (text_dandiset.dspath / "subdir2" / "banana.txt").exists() -def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_path): - text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - dspath = tmp_path / text_dandiset["dandiset_id"] - os.rename(text_dandiset["dspath"], dspath) +def test_download_sync_list(capsys, mocker, text_dandiset, tmp_path): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + dspath = tmp_path / text_dandiset.dandiset_id + os.rename(text_dandiset.dspath, dspath) input_mock = mocker.patch("dandi.utils.input", side_effect=["list", "yes"]) download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}", + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, existing="overwrite", sync=True, @@ -228,7 +228,7 @@ def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_ def test_download_no_blobDateModified(text_dandiset, tmp_path): # Regression test for #806 responses.add_passthru(re.compile("^http")) - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset asset = dandiset.get_asset_by_path("file.txt") metadata = asset.get_raw_metadata() del metadata["blobDateModified"] @@ -239,14 +239,14 @@ def test_download_no_blobDateModified(text_dandiset, tmp_path): @responses.activate def test_download_metadata404(text_dandiset, tmp_path): responses.add_passthru(re.compile("^http")) - asset = text_dandiset["dandiset"].get_asset_by_path("subdir1/apple.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir1/apple.txt") responses.add(responses.GET, asset.api_url, status=404) statuses = list( download_generator( DandisetURL( - api_url=text_dandiset["client"].api_url, - dandiset_id=text_dandiset["dandiset"].identifier, - version_id=text_dandiset["dandiset"].version_id, + api_url=text_dandiset.client.api_url, + dandiset_id=text_dandiset.dandiset.identifier, + version_id=text_dandiset.dandiset.version_id, ), tmp_path, ) diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index a57628975..8c42f5560 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -121,7 +121,7 @@ def test_upload_zarr(local_dandi_api, tmp_path): zarr.save(filepath, np.arange(1000), np.arange(1000, 0, -1)) zf = dandi_file(filepath) assert isinstance(zf, ZarrAsset) - d = local_dandi_api["client"].create_dandiset("Zarr Dandiset", {}) + d = local_dandi_api.client.create_dandiset("Zarr Dandiset", {}) asset = zf.upload(d, {"description": "A test Zarr"}) assert isinstance(asset, RemoteZarrAsset) assert asset.is_zarr() diff --git a/dandi/tests/test_keyring.py b/dandi/tests/test_keyring.py index 5d3400a33..50fea4912 100644 --- a/dandi/tests/test_keyring.py +++ b/dandi/tests/test_keyring.py @@ -25,11 +25,11 @@ def test_dandi_authenticate_no_env_var(local_dandi_api, monkeypatch, mocker): monkeypatch.delenv("DANDI_API_KEY", raising=False) monkeypatch.setenv("PYTHON_KEYRING_BACKEND", "keyring.backends.null.Keyring") inputmock = mocker.patch( - "dandi.dandiapi.input", return_value=local_dandi_api["api_key"] + "dandi.dandiapi.input", return_value=local_dandi_api.api_key ) - DandiAPIClient(local_dandi_api["instance"].api).dandi_authenticate() + DandiAPIClient(local_dandi_api.api_url).dandi_authenticate() inputmock.assert_called_once_with( - "Please provide API Key for {}: ".format(local_dandi_api["instance_id"]) + "Please provide API Key for {}: ".format(local_dandi_api.instance_id) ) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 07255e930..eb7ded18d 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -17,15 +17,15 @@ def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tmp_path): - d = local_dandi_api["client"].create_dandiset("Test Dandiset", {}) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) dandiset_id = d.identifier (nwb_file,) = organized_nwb_dir.glob(f"*{os.sep}*.nwb") (organized_nwb_dir / dandiset_metadata_file).write_text( f"identifier: '{dandiset_id}'\n" ) monkeypatch.chdir(organized_nwb_dir) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - upload(paths=[], dandi_instance=local_dandi_api["instance_id"], devel_debug=True) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + upload(paths=[], dandi_instance=local_dandi_api.instance_id, devel_debug=True) download(d.version_api_url, tmp_path) (nwb_file2,) = tmp_path.glob(f"{dandiset_id}{os.sep}*{os.sep}*.nwb") assert nwb_file.name == nwb_file2.name @@ -47,65 +47,65 @@ def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tm Path(dandiset_metadata_file).write_text(yaml_dump(ds_metadata)) upload( paths=[dandiset_metadata_file], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, upload_dandiset_metadata=True, ) - d = local_dandi_api["client"].get_dandiset(dandiset_id, DRAFT) + d = local_dandi_api.client.get_dandiset(dandiset_id, DRAFT) assert d.version.name == "shorty" def test_new_upload_extant_existing(mocker, text_dandiset): iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") with pytest.raises(FileExistsError): - text_dandiset["reupload"](existing="error") + text_dandiset.upload(existing="error") iter_upload_spy.assert_not_called() def test_new_upload_extant_skip(mocker, text_dandiset): iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing="skip") + text_dandiset.upload(existing="skip") iter_upload_spy.assert_not_called() @pytest.mark.parametrize("existing", ["overwrite", "refresh"]) def test_new_upload_extant_eq_overwrite(existing, mocker, text_dandiset): iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing=existing) + text_dandiset.upload(existing=existing) iter_upload_spy.assert_not_called() @pytest.mark.parametrize("existing", ["overwrite", "refresh"]) def test_new_upload_extant_neq_overwrite(existing, mocker, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") + dandiset_id = text_dandiset.dandiset_id + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing=existing) + text_dandiset.upload(existing=existing) iter_upload_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert ( tmp_path / dandiset_id / "file.txt" ).read_text() == "This is different text.\n" def test_new_upload_extant_old_refresh(mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - os.utime(text_dandiset["dspath"] / "file.txt", times=(0, 0)) + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") + os.utime(text_dandiset.dspath / "file.txt", times=(0, 0)) iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing="refresh") + text_dandiset.upload(existing="refresh") iter_upload_spy.assert_not_called() def test_new_upload_extant_force(mocker, text_dandiset): iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing="force") + text_dandiset.upload(existing="force") iter_upload_spy.assert_called() def test_new_upload_extant_bad_existing(mocker, text_dandiset): iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") - text_dandiset["reupload"](existing="foobar") + text_dandiset.upload(existing="foobar") iter_upload_spy.assert_not_called() @@ -122,18 +122,18 @@ def test_new_upload_extant_bad_existing(mocker, text_dandiset): ], ) def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset("Small Dandiset", {}) dandiset_id = d.identifier dspath = tmp_path / "upload" dspath.mkdir() (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") (dspath / "file.txt").write_bytes(contents) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) upload( paths=[], dandiset_path=dspath, - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, validation="skip", @@ -150,34 +150,34 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_ @pytest.mark.parametrize("confirm", [True, False]) def test_upload_sync(confirm, mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").unlink() + (text_dandiset.dspath / "file.txt").unlink() confirm_mock = mocker.patch("click.confirm", return_value=confirm) - text_dandiset["reupload"](sync=True) + text_dandiset.upload(sync=True) confirm_mock.assert_called_with("Delete 1 asset on server?") if confirm: with pytest.raises(NotFoundError): - text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") else: - text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") def test_upload_sync_folder(mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").unlink() - (text_dandiset["dspath"] / "subdir2" / "banana.txt").unlink() + (text_dandiset.dspath / "file.txt").unlink() + (text_dandiset.dspath / "subdir2" / "banana.txt").unlink() confirm_mock = mocker.patch("click.confirm", return_value=True) - text_dandiset["reupload"](paths=[text_dandiset["dspath"] / "subdir2"], sync=True) + text_dandiset.upload(paths=[text_dandiset.dspath / "subdir2"], sync=True) confirm_mock.assert_called_with("Delete 1 asset on server?") - text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") with pytest.raises(NotFoundError): - text_dandiset["dandiset"].get_asset_by_path("subdir2/banana.txt") + text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt") def test_upload_invalid_metadata( local_dandi_api, monkeypatch, simple1_nwb_metadata, tmp_path ): monkeypatch.chdir(tmp_path) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - d = local_dandi_api["client"].create_dandiset("Broken Dandiset", {}) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Broken Dandiset", {}) nwb_file = "broken.nwb" make_nwb_file( nwb_file, @@ -190,21 +190,21 @@ def test_upload_invalid_metadata( **simple1_nwb_metadata, ) Path(dandiset_metadata_file).write_text(f"identifier: '{d.identifier}'\n") - upload(paths=[], dandi_instance=local_dandi_api["instance_id"], devel_debug=True) + upload(paths=[], dandi_instance=local_dandi_api.instance_id, devel_debug=True) with pytest.raises(NotFoundError): d.get_asset_by_path(nwb_file) def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): - d = local_dandi_api["client"].create_dandiset("Test Dandiset", {}) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) dandiset_id = d.identifier (tmp_path / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") zarr.save(tmp_path / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) upload( paths=[], dandiset_path=tmp_path, - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, ) (asset,) = d.get_assets() From 4e5d10c47a7e39272ed6c95220e9e9741c366249 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 10:56:14 -0500 Subject: [PATCH 25/56] Test replacing an uploaded Zarr with a non-Zarr at the same path --- dandi/tests/test_upload.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index eb7ded18d..96fc4a680 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -1,5 +1,6 @@ import os from pathlib import Path +from shutil import rmtree import numpy as np import pynwb @@ -7,7 +8,7 @@ import zarr from ..consts import DRAFT, dandiset_metadata_file -from ..dandiapi import RemoteZarrAsset +from ..dandiapi import RemoteBlobAsset, RemoteZarrAsset from ..download import download from ..exceptions import NotFoundError from ..files import LocalFileAsset @@ -212,3 +213,19 @@ def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): assert asset.is_zarr() assert not asset.is_blob() assert asset.path == "sample.zarr" + + +def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + (zarr_dandiset.dspath / "sample.zarr").write_text("This is not a Zarr.\n") + zarr_dandiset.upload(allow_any_path=True) + (asset,) = zarr_dandiset.dandiset.get_assets() + assert isinstance(asset, RemoteBlobAsset) + assert asset.is_blob() + assert not asset.is_zarr() + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert ( + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + ).read_text() == "This is not a Zarr.\n" From 25176da2413c6247872b7779cdd37cc4d3507216 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 11:56:50 -0500 Subject: [PATCH 26/56] Add xfailing tests involving Zarr downloads --- dandi/tests/test_download.py | 62 +++++++++++++++++++++++++++++++++- dandi/tests/test_upload.py | 65 ++++++++++++++++++++++++++++++++++-- dandi/utils.py | 20 +++++++++++ 3 files changed, 144 insertions(+), 3 deletions(-) diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index c89ab0e9e..9a2c746b5 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -4,14 +4,17 @@ import re from shutil import rmtree +import numpy as np import pytest import responses +import zarr from .skip import mark from ..consts import DRAFT, dandiset_metadata_file from ..dandiarchive import DandisetURL from ..download import download, download_generator -from ..utils import list_paths +from ..upload import upload +from ..utils import assert_dirtrees_eq, list_paths # both urls point to 000027 (lean test dataset), and both draft and "released" @@ -266,3 +269,60 @@ def test_download_metadata404(text_dandiset, tmp_path): tmp_path / "subdir2" / "banana.txt", tmp_path / "subdir2" / "coconut.txt", ] + + +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_download_zarr(tmp_path, zarr_dandiset): + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_download_different_zarr(tmp_path, zarr_dandiset): + dd = tmp_path / zarr_dandiset.dandiset_id + dd.mkdir() + zarr.save(dd / "sample.zarr", np.eye(5)) + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_download_zarr_to_nonzarr_path(tmp_path, zarr_dandiset): + dd = tmp_path / zarr_dandiset.dandiset_id + dd.mkdir() + (dd / "sample.zarr").write_text("This is not a Zarr.\n") + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + (dspath / "sample.zarr").write_text("This is not a Zarr.\n") + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + dd = tmp_path / "download" / dandiset_id + dd.mkdir(parents=True, exist_ok=True) + zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + download(d.version_api_url, tmp_path / "download") + assert (dd / "sample.zarr").is_file() + assert (dd / "sample.zarr").read_text() == "This is not a Zarr.\n" diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 96fc4a680..8b9c4de62 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -7,14 +7,14 @@ import pytest import zarr -from ..consts import DRAFT, dandiset_metadata_file +from ..consts import DRAFT, ZARR_MIME_TYPE, dandiset_metadata_file from ..dandiapi import RemoteBlobAsset, RemoteZarrAsset from ..download import download from ..exceptions import NotFoundError from ..files import LocalFileAsset from ..pynwb_utils import make_nwb_file from ..upload import upload -from ..utils import list_paths +from ..utils import assert_dirtrees_eq, list_paths def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tmp_path): @@ -215,6 +215,18 @@ def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): assert asset.path == "sample.zarr" +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_upload_different_zarr(tmp_path, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + zarr.save(tmp_path / "sample.zarr", np.eye(5)) + zarr_dandiset.upload() + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): rmtree(zarr_dandiset.dspath / "sample.zarr") (zarr_dandiset.dspath / "sample.zarr").write_text("This is not a Zarr.\n") @@ -229,3 +241,52 @@ def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): assert ( tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" ).read_text() == "This is not a Zarr.\n" + + +@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + (dspath / "sample.zarr").write_text("This is not a Zarr.\n") + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + + (asset,) = d.get_assets() + assert isinstance(asset, RemoteBlobAsset) + assert asset.is_blob() + assert not asset.is_zarr() + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" + + (dspath / "sample.zarr").unlink() + zarr.save(dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.is_zarr() + assert not asset.is_blob() + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == ZARR_MIME_TYPE + + (tmp_path / "download").mkdir() + download(d.version_api_url, tmp_path / "download") + assert_dirtrees_eq( + dspath / "sample.zarr", + tmp_path / "download" / dandiset_id / "sample.zarr", + ) diff --git a/dandi/utils.py b/dandi/utils.py index 9f25469ef..b318c519c 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -10,6 +10,7 @@ import io import itertools from mimetypes import guess_type +from operator import attrgetter import os import os.path as op from pathlib import Path @@ -727,3 +728,22 @@ def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]: else: return yield xs + + +def assert_dirtrees_eq(tree1: Path, tree2: Path) -> None: + """Assert that the file trees at the given paths are equal""" + assert sorted(map(attrgetter("name"), tree1.iterdir())) == sorted( + map(attrgetter("name"), tree2.iterdir()) + ) + for p1 in tree1.iterdir(): + p2 = tree2 / p1.name + assert p1.is_dir() == p2.is_dir() + if p1.is_dir(): + assert_dirtrees_eq(p1, p2) + # TODO: Considering using the identify library to test for binary-ness. + # (We can't use mimetypes, as .json maps to application/json instead of + # text/json.) + elif p1.suffix in {".txt", ".py", ".json"}: + assert p1.read_text() == p2.read_text() + else: + assert p1.read_bytes() == p2.read_bytes() From 692eebfd205c5d1e3b86f02368d2a88b042bc13c Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 12 Jan 2022 12:32:52 -0500 Subject: [PATCH 27/56] Error on attempting to upload a Zarr to an embargoed Dandiset --- dandi/files.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 76c94a0bb..e2b7951ff 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -23,6 +23,7 @@ MAX_ZARR_DEPTH, ZARR_MIME_TYPE, ZARR_UPLOAD_BATCH_SIZE, + EmbargoStatus, dandiset_metadata_file, ) from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient @@ -541,11 +542,17 @@ def iter_upload( ``"done"`` and an ``"asset"`` key containing the resulting `RemoteAsset`. """ - # TODO: Only iterate over the filetree once and save the results in - # memory + # So that older clients don't get away with doing the wrong thing once + # Zarr upload to embargoed Dandisets is implemented in the API: + if dandiset.embargo_status is EmbargoStatus.EMBARGOED: + raise NotImplementedError( + "Uploading Zarr assets to embargoed Dandisets is currently not implemented" + ) asset_path = metadata.setdefault("path", self.path) client = dandiset.client yield {"status": "calculating etag"} + # TODO: Only iterate over the filetree once and save the results in + # memory filetag = self.get_etag().value lgr.debug("Calculated dandi-zarr-checksum of %s for %s", filetag, self.filepath) digest = metadata.get("digest", {}) From b0796a4271550e043ad192ef31ba688a79487b82 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 13 Jan 2022 09:44:42 -0500 Subject: [PATCH 28/56] Add some docstrings --- dandi/files.py | 115 +++++++++++++++++++++++++++++-- dandi/misctypes.py | 17 +++++ docs/source/modref/files.rst | 4 ++ docs/source/modref/index.rst | 2 + docs/source/modref/misctypes.rst | 4 ++ 5 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 docs/source/modref/files.rst create mode 100644 docs/source/modref/misctypes.rst diff --git a/dandi/files.py b/dandi/files.py index e2b7951ff..4d91d8046 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,3 +1,12 @@ +""" +This module defines functionality for working with local files & directories +(as opposed to remote resources on a DANDI Archive server) that are of interest +to DANDI. The classes for such files & directories all inherit from +`DandiFile`, which has two immediate subclasses: `DandisetMetadataFile`, for +representing :file:`dandiset.yaml` files, and `LocalAsset`, for representing +files that can be uploaded as assets to DANDI Archive. +""" + from abc import ABC, abstractmethod from collections import deque from concurrent.futures import ThreadPoolExecutor, as_completed @@ -44,13 +53,17 @@ @dataclass class DandiFile(ABC): - #: Path to node on disk + """Base class for local files & directories of interest to DANDI""" + + #: The path to the actual file or directory on disk filepath: Path def get_size(self) -> int: + """Return the size of the file""" return os.path.getsize(self.filepath) def get_mtime(self) -> datetime: + """Return the time at which the file was last modified""" # TODO: Should this be overridden for LocalDirectoryAsset? return ensure_datetime(self.filepath.stat().st_mtime) @@ -72,6 +85,8 @@ def get_validation_errors( class DandisetMetadataFile(DandiFile): + """Representation of a :file:`dandiset.yaml` file""" + def get_metadata( self, digest: Optional[Digest] = None, @@ -126,11 +141,21 @@ def get_validation_errors( @dataclass class LocalAsset(DandiFile): - #: Forward-slash-separated path relative to root of Dandiset + """ + Representation of a file or directory that can be uploaded to a DANDI + Archive as an asset of a Dandiset + """ + + #: The foward-slash-separated path to the asset within its local Dandiset + #: (i.e., relative to the Dandiset's root) path: str @abstractmethod def get_etag(self) -> Digest: + """ + Calculate an etag digest for the asset using the appropriate algorithm + for its type + """ ... @abstractmethod @@ -193,7 +218,7 @@ def upload( Dandiset and return the resulting asset. Blocks until the upload is complete. - :dandiset RemoteDandiset: + :param RemoteDandiset dandiset: the Dandiset to which the file will be uploaded :param dict metadata: Metadata for the uploaded asset. The "path" field will be set to @@ -220,11 +245,37 @@ def iter_upload( jobs: Optional[int] = None, replacing: Optional[RemoteAsset] = None, ) -> Iterator[dict]: + """ + Upload the asset with the given metadata to the given Dandiset, + returning a generator of status `dict`\\s. + + :param RemoteDandiset dandiset: + the Dandiset to which the asset will be uploaded + :param dict metadata: + Metadata for the uploaded asset. The "path" field will be set to + the value of the instance's ``path`` attribute if no such field is + already present. + :param int jobs: Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: + If set, replace the given asset, which must have the same path as + the new asset + :returns: + A generator of `dict`\\s containing at least a ``"status"`` key. + Upon successful upload, the last `dict` will have a status of + ``"done"`` and an ``"asset"`` key containing the resulting + `RemoteAsset`. + """ ... class LocalFileAsset(LocalAsset): + """ + Representation of a regular file that can be uploaded to a DANDI Archive as + an asset of a Dandiset + """ + def get_etag(self) -> Digest: + """Calculate a dandi-etag digest for the asset""" value = get_digest(self.filepath, digest="dandi-etag") return Digest.dandi_etag(value) @@ -239,7 +290,7 @@ def iter_upload( Upload the file as an asset with the given metadata to the given Dandiset, returning a generator of status `dict`\\s. - :dandiset RemoteDandiset: + :param RemoteDandiset dandiset: the Dandiset to which the file will be uploaded :param dict metadata: Metadata for the uploaded asset. The "path" field will be set to @@ -376,6 +427,8 @@ def iter_upload( class NWBAsset(LocalFileAsset): + """Representation of a local NWB file""" + EXTENSIONS = [".nwb"] def get_metadata( @@ -434,6 +487,10 @@ def get_validation_errors( class GenericAsset(LocalFileAsset): + """ + Representation of a generic regular file, one that is not of any known type + """ + EXTENSIONS = [] def get_metadata( @@ -447,7 +504,13 @@ def get_metadata( class LocalDirectoryAsset(LocalAsset): + """ + Representation of a directory that can be uploaded to a DANDI Archive as + a single asset of a Dandiset + """ + def iterfiles(self) -> Iterator[Path]: + """Yield all files within the directory""" dirs = deque([self.filepath]) while dirs: for p in dirs.popleft().iterdir(): @@ -457,13 +520,17 @@ def iterfiles(self) -> Iterator[Path]: yield p def get_size(self) -> int: + """Return the total size of the files in the directory""" return sum(p.stat().st_size for p in self.iterfiles()) class ZarrAsset(LocalDirectoryAsset): + """Representation of a local Zarr directory""" + EXTENSIONS = [".ngff", ".zarr"] def get_etag(self) -> Digest: + """Calculate a dandi-zarr-checksum digest for the asset""" return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) def get_metadata( @@ -526,7 +593,7 @@ def iter_upload( Upload the Zarr directory as an asset with the given metadata to the given Dandiset, returning a generator of status `dict`\\s. - :dandiset RemoteDandiset: + :param RemoteDandiset dandiset: the Dandiset to which the Zarr will be uploaded :param dict metadata: Metadata for the uploaded asset. The "path" field will be set to @@ -640,6 +707,28 @@ def find_dandi_files( allow_all: bool = False, include_metadata: bool = False, ) -> Iterator[DandiFile]: + """ + Yield all DANDI files at or under the paths in ``paths`` (which may be + either files or directories). Files & directories whose names start with a + period are ignored. Directories are only included in the return value if + they are of a type represented by a `LocalDirectoryAsset` subclass, in + which case they are not recursed into. + + :param dandiset_path: + The path to the root of the Dandiset in which the paths are located. + All paths in ``paths`` must be equal to or subpaths of + ``dandiset_path``. Can only be omitted when ``paths`` is a single + directory, in which case ``dandiset_path`` is set to that directory. + :param allow_all: + If true, unrecognized assets and the Dandiset's :file:`dandiset.yaml` + file are returned as `GenericAsset` and `DandisetMetadataFile` + instances, respectively. If false, they are not returned at all. + :param include_metadata: + If true, the Dandiset's :file:`dandiset.yaml` file is returned as a + `DandisetMetadataFile` instance. If false, it is not returned at all + (unless ``allow_all`` is true). + """ + if dandiset_path is None: if len(paths) == 1 and os.path.isdir(paths[0]): dandiset_path = paths[0] @@ -686,6 +775,22 @@ def find_dandi_files( def dandi_file( filepath: Union[str, Path], dandiset_path: Optional[Union[str, Path]] = None ) -> DandiFile: + """ + Return a `DandiFile` instance of the appropriate type for the file at + ``filepath`` inside the Dandiset rooted at ``dandiset_path``. If + ``dandiset_path`` is not set, it will default to ``filepath``'s parent + directory. + + If ``filepath`` is a directory, it must be of a type represented by a + `LocalDirectoryAsset` subclass; otherwise, an `UnknownSuffixError` + exception will be raised. + + A regular file named :file:`dandiset.yaml` will only be represented by a + `DandisetMetadataFile` instance if it is at the root of the Dandiset. + + A regular file that is not of a known type will be represented by a + `GenericAsset` instance. + """ filepath = Path(filepath) if dandiset_path is not None: path = filepath.relative_to(dandiset_path).as_posix() diff --git a/dandi/misctypes.py b/dandi/misctypes.py index ad4c81e7b..1e5e0d064 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -8,18 +8,35 @@ @dataclass class Digest: + """A computed digest for a file or directory""" + + #: The digest algorithm used algorithm: DigestType + + #: The digest itself value: str @classmethod def dandi_etag(cls, value: str) -> Digest: + """ + Construct a `Digest` with the given value and a ``algorithm`` of + ``DigestType.dandi_etag`` + """ return cls(algorithm=DigestType.dandi_etag, value=value) @classmethod def dandi_zarr(cls, value: str) -> Digest: + """ + Construct a `Digest` with the given value and a ``algorithm`` of + ``DigestType.dandi_zarr_checksum`` + """ return cls(algorithm=DigestType.dandi_zarr_checksum, value=value) def asdict(self) -> Dict[DigestType, str]: + """ + Convert the instance to a single-item `dict` mapping the digest + algorithm to the digest value + """ return {self.algorithm: self.value} diff --git a/docs/source/modref/files.rst b/docs/source/modref/files.rst new file mode 100644 index 000000000..cd80ddc23 --- /dev/null +++ b/docs/source/modref/files.rst @@ -0,0 +1,4 @@ +``dandi.files`` +=============== + +.. automodule:: dandi.files diff --git a/docs/source/modref/index.rst b/docs/source/modref/index.rst index 96e4cdd43..6d6e78e8d 100644 --- a/docs/source/modref/index.rst +++ b/docs/source/modref/index.rst @@ -22,6 +22,8 @@ Mid-level user interfaces .. toctree:: dandiapi + files + misctypes Support functionality ===================== diff --git a/docs/source/modref/misctypes.rst b/docs/source/modref/misctypes.rst new file mode 100644 index 000000000..d1ee6a0b1 --- /dev/null +++ b/docs/source/modref/misctypes.rst @@ -0,0 +1,4 @@ +``dandi.misctypes`` +=================== + +.. automodule:: dandi.misctypes From 6b8f07a79ae7f20070b612129402427c7e61a970 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 13 Jan 2022 10:48:55 -0500 Subject: [PATCH 29/56] Rename `get_size()` and `get_mtime()` to `size` and `modified` For consistency with the RemoteAsset API --- dandi/files.py | 19 +++++++++++-------- dandi/upload.py | 4 ++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 4d91d8046..82d57fef3 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -58,12 +58,14 @@ class DandiFile(ABC): #: The path to the actual file or directory on disk filepath: Path - def get_size(self) -> int: - """Return the size of the file""" + @property + def size(self) -> int: + """The size of the file""" return os.path.getsize(self.filepath) - def get_mtime(self) -> datetime: - """Return the time at which the file was last modified""" + @property + def modified(self) -> datetime: + """The time at which the file was last modified""" # TODO: Should this be overridden for LocalDirectoryAsset? return ensure_datetime(self.filepath.stat().st_mtime) @@ -321,7 +323,7 @@ def iter_upload( ) yield {"status": "initiating upload"} lgr.debug("%s: Beginning upload", asset_path) - total_size = self.get_size() + total_size = self.size try: resp = client.post( "/uploads/initialize/", @@ -519,8 +521,9 @@ def iterfiles(self) -> Iterator[Path]: else: yield p - def get_size(self) -> int: - """Return the total size of the files in the directory""" + @property + def size(self) -> int: + """The total size of the files in the directory""" return sum(p.stat().st_size for p in self.iterfiles()) @@ -631,7 +634,7 @@ def iter_upload( ) yield {"status": "initiating upload"} lgr.debug("%s: Beginning upload", asset_path) - total_size = self.get_size() + total_size = self.size bytes_uploaded = 0 r = client.post("/zarr/", json={"name": self.filepath.name}) zarr_id = r["zarr_id"] diff --git a/dandi/upload.py b/dandi/upload.py index 9cdd74080..da12929a1 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -109,7 +109,7 @@ def process_path(dfile): strpath = str(dfile.filepath) try: try: - yield {"size": dfile.get_size()} + yield {"size": dfile.size} except FileNotFoundError: yield skip_file("ERROR: File not found") return @@ -171,7 +171,7 @@ def process_path(dfile): extant = None else: metadata = extant.get_raw_metadata() - local_mtime = dfile.get_mtime() + local_mtime = dfile.modified remote_mtime_str = metadata.get("blobDateModified") # TODO: Should this error if the digest is missing? extant_etag = metadata.get("digest", {}).get(file_etag.algorithm.value) From 193d27c5378fee9c5baa74c68b689a7ba68be456 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 13 Jan 2022 14:33:40 -0500 Subject: [PATCH 30/56] Raise NotImplementedError when trying to download a Zarr --- dandi/download.py | 10 +++++++++- dandi/tests/test_download.py | 23 ++++++++++++++++------- dandi/tests/test_upload.py | 8 ++++++-- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 65f2f5c1c..9115d9da1 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -197,6 +197,10 @@ def download_generator( return for asset in assets: + if asset.is_zarr(): + raise NotImplementedError( + "Download of Zarr assets is not yet implemented" + ) path = asset.path.lstrip("/") # make into relative path path = op.normpath(path) if not isinstance(parsed_url, DandisetURL): @@ -681,7 +685,11 @@ def __exit__(self, exc_type, exc_value, traceback): self.fp.close() try: if exc_type is None: - self.writefile.replace(self.filepath) + try: + self.writefile.replace(self.filepath) + except IsADirectoryError: + rmtree(self.filepath) + self.writefile.replace(self.filepath) finally: self.lock.release() if exc_type is None: diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 9a2c746b5..cb2d4f273 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -271,7 +271,9 @@ def test_download_metadata404(text_dandiset, tmp_path): ] -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +@pytest.mark.xfail( + reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True +) def test_download_zarr(tmp_path, zarr_dandiset): download(zarr_dandiset.dandiset.version_api_url, tmp_path) assert_dirtrees_eq( @@ -280,31 +282,38 @@ def test_download_zarr(tmp_path, zarr_dandiset): ) -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +@pytest.mark.xfail( + reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True +) def test_download_different_zarr(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() zarr.save(dd / "sample.zarr", np.eye(5)) - download(zarr_dandiset.dandiset.version_api_url, tmp_path) + download( + zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + ) assert_dirtrees_eq( zarr_dandiset.dspath / "sample.zarr", tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", ) -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +@pytest.mark.xfail( + reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True +) def test_download_zarr_to_nonzarr_path(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() (dd / "sample.zarr").write_text("This is not a Zarr.\n") - download(zarr_dandiset.dandiset.version_api_url, tmp_path) + download( + zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + ) assert_dirtrees_eq( zarr_dandiset.dspath / "sample.zarr", tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", ) -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) @@ -323,6 +332,6 @@ def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): dd = tmp_path / "download" / dandiset_id dd.mkdir(parents=True, exist_ok=True) zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) - download(d.version_api_url, tmp_path / "download") + download(d.version_api_url, tmp_path / "download", existing="overwrite-different") assert (dd / "sample.zarr").is_file() assert (dd / "sample.zarr").read_text() == "This is not a Zarr.\n" diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 8b9c4de62..243488f20 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -215,7 +215,9 @@ def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): assert asset.path == "sample.zarr" -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +@pytest.mark.xfail( + reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True +) def test_upload_different_zarr(tmp_path, zarr_dandiset): rmtree(zarr_dandiset.dspath / "sample.zarr") zarr.save(tmp_path / "sample.zarr", np.eye(5)) @@ -243,7 +245,9 @@ def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): ).read_text() == "This is not a Zarr.\n" -@pytest.mark.xfail(reason="Zarr download not implemented yet", strict=True) +@pytest.mark.xfail( + reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True +) def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) From 448fb4724d94b48bba7f520c4d14da93d1aeccda Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 14 Jan 2022 08:15:10 -0500 Subject: [PATCH 31/56] Don't recognize empty directories as Zarrs --- dandi/exceptions.py | 2 +- dandi/files.py | 24 +++++++++++++----------- dandi/tests/test_files.py | 4 ++++ 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/dandi/exceptions.py b/dandi/exceptions.py index 0afbb3b73..c23c5c94b 100644 --- a/dandi/exceptions.py +++ b/dandi/exceptions.py @@ -66,5 +66,5 @@ class SchemaVersionError(Exception): pass -class UnknownSuffixError(ValueError): +class UnknownAssetError(ValueError): pass diff --git a/dandi/files.py b/dandi/files.py index 82d57fef3..b8ec0dbdd 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -36,7 +36,7 @@ dandiset_metadata_file, ) from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient -from .exceptions import UnknownSuffixError +from .exceptions import UnknownAssetError from .metadata import get_default_metadata, get_metadata, nwb2asset from .misctypes import DUMMY_DIGEST, Digest from .pynwb_utils import validate as pynwb_validate @@ -756,13 +756,13 @@ def find_dandi_files( if p.is_dir(): if p.is_symlink(): lgr.warning("%s: Ignoring unsupported symbolic link to directory", p) - continue - try: - df = dandi_file(p, dandiset_path) - except UnknownSuffixError: - path_queue.extend(p.iterdir()) - else: - yield df + elif any(p.iterdir()): + try: + df = dandi_file(p, dandiset_path) + except UnknownAssetError: + path_queue.extend(p.iterdir()) + else: + yield df else: df = dandi_file(p, dandiset_path) if isinstance(df, GenericAsset) and not allow_all: @@ -785,8 +785,8 @@ def dandi_file( directory. If ``filepath`` is a directory, it must be of a type represented by a - `LocalDirectoryAsset` subclass; otherwise, an `UnknownSuffixError` - exception will be raised. + `LocalDirectoryAsset` subclass; otherwise, an `UnknownAssetError` exception + will be raised. A regular file named :file:`dandiset.yaml` will only be represented by a `DandisetMetadataFile` instance if it is at the root of the Dandiset. @@ -800,10 +800,12 @@ def dandi_file( else: path = filepath.name if filepath.is_dir(): + if not any(filepath.iterdir()): + raise UnknownAssetError("Empty directories cannot be assets") for dirclass in LocalDirectoryAsset.__subclasses__(): if filepath.suffix in dirclass.EXTENSIONS: return dirclass(filepath=filepath, path=path) - raise UnknownSuffixError( + raise UnknownAssetError( f"Directory has unrecognized suffix {filepath.suffix!r}" ) elif path == dandiset_metadata_file: diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index 8c42f5560..d96d23fb2 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -32,6 +32,10 @@ def test_find_dandi_files(tmp_path: Path) -> None: (tmp_path / "subdir" / "sample04.zarr" / "baz").touch() (tmp_path / "subdir" / "gnusto").touch() (tmp_path / "subdir" / "cleesh.txt").touch() + (tmp_path / "empty.zarr").mkdir() + (tmp_path / ".ignored").touch() + (tmp_path / ".ignored.dir").mkdir() + (tmp_path / ".ignored.dir" / "ignored.nwb").touch() files = sorted(find_dandi_files(tmp_path), key=attrgetter("filepath")) assert files == [ From b2711b2242953ec34448cc67be50441ab1f8b377 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 17 Jan 2022 00:42:36 -0500 Subject: [PATCH 32/56] Validation no longer needs to be skipped when uploading non-NWB files --- dandi/tests/fixtures.py | 2 +- dandi/tests/test_dandiapi.py | 3 --- dandi/tests/test_upload.py | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/dandi/tests/fixtures.py b/dandi/tests/fixtures.py index a6bdfe9a9..1c2209e78 100644 --- a/dandi/tests/fixtures.py +++ b/dandi/tests/fixtures.py @@ -328,7 +328,7 @@ def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): dspath=dspath, dandiset=d, dandiset_id=dandiset_id, - upload_kwargs={"allow_any_path": True, "validation": "skip"}, + upload_kwargs={"allow_any_path": True}, ) td.upload() return td diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index ad7a1fa45..f7ffce2fb 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -73,7 +73,6 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) d.wait_until_valid() @@ -99,7 +98,6 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) rmtree(download_dir / dandiset_id) download(dv.version_api_url, download_dir) @@ -112,7 +110,6 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) rmtree(download_dir / dandiset_id) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 243488f20..f2f603949 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -137,7 +137,6 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_ dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) download_dir = tmp_path / "download" download_dir.mkdir() From 335f6d7f16f6e283bddcf076659134ec73e8d5a4 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 14 Jan 2022 09:25:45 -0500 Subject: [PATCH 33/56] Rewrite get_zarr_checksum() using dandischema --- dandi/support/digests.py | 36 +++++++++--------------------------- setup.cfg | 2 +- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/dandi/support/digests.py b/dandi/support/digests.py index 8d9db0476..e643a82af 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -10,13 +10,12 @@ """ import hashlib -import json import logging -from operator import itemgetter from pathlib import Path from typing import Optional from dandischema.digests.dandietag import DandiETag +from dandischema.digests.zarr import get_checksum from fscacher import PersistentCache from ..utils import auto_repr @@ -96,29 +95,12 @@ def get_dandietag(filepath) -> DandiETag: def get_zarr_checksum(dirpath: Path, basepath: Optional[Path] = None) -> str: if basepath is None: basepath = dirpath - dirs = [] - files = [] + dirs = {} + files = {} for p in dirpath.iterdir(): - if p.is_dir(): - dirs.append( - { - "md5": get_zarr_checksum(p, basepath), - "path": p.relative_to(basepath).as_posix(), - } - ) - else: - files.append( - { - "md5": get_digest(p, "md5"), - "path": p.relative_to(basepath).as_posix(), - } - ) - data = { - "directories": sorted(dirs, key=itemgetter("path")), - "files": sorted(files, key=itemgetter("path")), - } - return hashlib.md5( - json.dumps( - data, sort_keys=True, ensure_ascii=True, separators=(",", ":") - ).encode("utf-8") - ).hexdigest() + path = p.relative_to(basepath).as_posix() + if not p.is_dir(): + files[path] = get_digest(p, "md5") + elif any(p.iterdir()): + dirs[path] = get_zarr_checksum(p, basepath) + return get_checksum(files, dirs) diff --git a/setup.cfg b/setup.cfg index dbc4b9165..1182e7038 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ install_requires = appdirs click click-didyoumean - dandischema ~= 0.5.0 + dandischema ~= 0.5.1 etelemetry >= 0.2.2 fasteners fscacher From 5e561e93b62210cf43f4b3826990e5aa050350d0 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 13 Jan 2022 12:03:52 -0500 Subject: [PATCH 34/56] Classes for files & directories inside Zarrs --- dandi/dandiapi.py | 164 +++++++++++++++++++++++++++++++++++++- dandi/files.py | 138 +++++++++++++++++++++++++++----- dandi/misctypes.py | 137 ++++++++++++++++++++++++++++++- dandi/tests/test_files.py | 30 +++++++ 4 files changed, 446 insertions(+), 23 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index ae6c8919f..ea8b77978 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -45,10 +45,12 @@ """ from abc import ABC, abstractmethod +from collections import deque +from dataclasses import dataclass, field, replace from datetime import datetime import json import os.path -from pathlib import Path +from pathlib import Path, PurePosixPath import re from time import sleep, time from types import TracebackType @@ -59,6 +61,7 @@ Dict, FrozenSet, Iterator, + List, Optional, Sequence, Type, @@ -66,11 +69,12 @@ Union, cast, ) -from urllib.parse import urlparse, urlunparse +from urllib.parse import unquote, urlparse, urlunparse import click from dandischema import models -from pydantic import BaseModel, Field, PrivateAttr +import dateutil.parser +from pydantic import AnyHttpUrl, BaseModel, Field, PrivateAttr import requests import tenacity @@ -87,7 +91,7 @@ ) from .exceptions import NotFoundError, SchemaVersionError from .keyring import keyring_lookup -from .misctypes import Digest +from .misctypes import BasePath, Digest from .utils import USER_AGENT, check_dandi_version, ensure_datetime, is_interactive lgr = get_logger() @@ -1461,3 +1465,155 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: self.size = int(data["size"]) self.modified = ensure_datetime(data["modified"]) self._metadata = data["metadata"] + + @property + def filetree(self) -> "RemoteZarrEntry": + return RemoteZarrEntry( + client=self.client, zarr_id=self.zarr, parts=(), _known_dir=True + ) + + def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: + dirs = deque([self.filetree]) + while dirs: + for p in dirs.popleft().iterdir(): + if p.is_dir(): + dirs.append(p) + if include_dirs: + yield p + else: + yield p + + +class ZarrListing(BaseModel): + # Not for public use + + directories: List[AnyHttpUrl] + files: List[AnyHttpUrl] + + @property + def dirnames(self) -> List[str]: + return [PurePosixPath(unquote(url.path)).name for url in self.directories] + + @property + def filenames(self) -> List[str]: + return [PurePosixPath(unquote(url.path)).name for url in self.files] + + +@dataclass +class RemoteZarrEntry(BasePath): + client: DandiAPIClient + zarr_id: str + _known_dir: Optional[bool] = field(default=None, compare=False, repr=False) + + def _get_subpath( + self, name: str, isdir: Optional[bool] = None + ) -> "RemoteZarrEntry": + if not name or "/" in name: + raise ValueError(f"Invalid path component: {name!r}") + elif name == ".": + return self + elif name == "..": + return self.parent + else: + return replace(self, parts=self.parts + (name,), _known_dir=isdir) + + @property + def parent(self) -> "RemoteZarrEntry": + if self.is_root(): + return self + else: + return replace( + self, + parts=self.parts[:-1], + _known_dir=True if self._known_dir is not None else None, + ) + + def _isdir(self) -> bool: + if self._known_dir is not None: + return self._known_dir + elif self.is_root(): + try: + self.client.get(f"/zarr/{self.zarr_id}/") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError(f"No such Zarr: {self.zarr_id!r}") + else: + raise + return True + else: + ppath = "".join(p + "/" for p in self.parts[:-1]) + try: + r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{ppath}") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError( + f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" + ) + else: + raise + listing = ZarrListing.parse_obj(r) + if self.name in listing.dirnames: + return True + elif self.name in listing.filenames: + return False + else: + raise NotFoundError( + f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" + ) + + def exists(self) -> bool: + try: + self._isdir() + except NotFoundError: + return False + else: + return True + + def is_file(self) -> bool: + try: + return not self._isdir() + except NotFoundError: + return False + + def is_dir(self) -> bool: + try: + return self._isdir() + except NotFoundError: + return False + + def iterdir(self) -> Iterator["RemoteZarrEntry"]: + path = "".join(p + "/" for p in self.parts) + try: + r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{path}") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" + " is not a directory" + ) + else: + raise + listing = ZarrListing.parse_obj(r) + for name in listing.dirnames: + if name == "." or name == "..": + continue + yield self._get_subpath(name, isdir=True) + for name in listing.filenames: + yield self._get_subpath(name, isdir=False) + + def get_etag(self) -> Digest: + # TODO + raise NotImplementedError + + @property + def modified(self) -> datetime: + if not self.is_file(): + # TODO: Should we forego this check and let queries on directories + # fail with KeyError? + raise RuntimeError("Directories in Zarrs do not have 'modified' timestamps") + r = self.client.get( + f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}", + headers={"Range": "bytes=0-0"}, + json_resp=False, + ) + return dateutil.parser.parse(r.headers["Last-Modified"]) diff --git a/dandi/files.py b/dandi/files.py index b8ec0dbdd..f1f8fa9ee 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -7,19 +7,22 @@ files that can be uploaded as assets to DANDI Archive. """ +from __future__ import annotations + from abc import ABC, abstractmethod from collections import deque from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass +from dataclasses import dataclass, replace from datetime import datetime import os from pathlib import Path import re from threading import Lock -from typing import Any, BinaryIO, Dict, Iterator, List, Optional, Union +from typing import Any, BinaryIO, Dict, Generic, Iterator, List, Optional, Union from xml.etree.ElementTree import fromstring from dandischema.digests.dandietag import DandiETag +from dandischema.digests.zarr import get_checksum from dandischema.models import BareAsset, CommonModel from dandischema.models import Dandiset as DandisetMeta from dandischema.models import get_schema_version @@ -38,7 +41,7 @@ from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient from .exceptions import UnknownAssetError from .metadata import get_default_metadata, get_metadata, nwb2asset -from .misctypes import DUMMY_DIGEST, Digest +from .misctypes import DUMMY_DIGEST, BasePath, Digest, P from .pynwb_utils import validate as pynwb_validate from .support.digests import get_dandietag, get_digest, get_zarr_checksum from .utils import chunked, ensure_datetime, pluralize, yaml_load @@ -505,19 +508,26 @@ def get_metadata( return metadata -class LocalDirectoryAsset(LocalAsset): +class LocalDirectoryAsset(LocalAsset, Generic[P]): """ Representation of a directory that can be uploaded to a DANDI Archive as a single asset of a Dandiset """ - def iterfiles(self) -> Iterator[Path]: + @property + @abstractmethod + def filetree(self) -> P: + ... + + def iterfiles(self, include_dirs: bool = False) -> Iterator[P]: """Yield all files within the directory""" - dirs = deque([self.filepath]) + dirs = deque([self.filetree]) while dirs: for p in dirs.popleft().iterdir(): if p.is_dir(): dirs.append(p) + if include_dirs: + yield p else: yield p @@ -527,11 +537,109 @@ def size(self) -> int: return sum(p.stat().st_size for p in self.iterfiles()) -class ZarrAsset(LocalDirectoryAsset): +@dataclass +class LocalZarrEntry(BasePath): + #: The path to the actual file or directory on disk + filepath: Path + #: The path to the root of the Zarr file tree + zarr_basepath: Path + + def _get_subpath(self, name: str) -> LocalZarrEntry: + if not name or "/" in name: + raise ValueError(f"Invalid path component: {name!r}") + elif name == ".": + return self + elif name == "..": + return self.parent + else: + return replace( + self, filepath=self.filepath / name, parts=self.parts + (name,) + ) + + @property + def parent(self) -> LocalZarrEntry: + if self.is_root(): + return self + else: + return replace(self, filepath=self.filepath.parent, parts=self.parts[:-1]) + + def exists(self) -> bool: + return self.filepath.exists() + + def is_file(self) -> bool: + return self.filepath.is_file() + + def is_dir(self) -> bool: + return self.filepath.is_dir() + + def iterdir(self) -> Iterator[LocalZarrEntry]: + for p in self.filepath.iterdir(): + if p.is_dir() and not any(p.iterdir()): + # Ignore empty directories + continue + yield self._get_subpath(p.name) + + def get_etag(self) -> Digest: + if self.is_dir(): + return get_zarr_checksum(self.filepath, basepath=self.zarr_basepath) + else: + return get_digest(self.filepath, "md5") + + @property + def size(self) -> int: + if self.is_dir(): + return sum(p.size for p in self.iterdir()) + else: + return os.path.getsize(self.filepath) + + @property + def modified(self) -> datetime: + # TODO: Should this be overridden for directories? + return ensure_datetime(self.filepath.stat().st_mtime) + + +@dataclass +class ZarrStat: + size: int + digest: Digest + files: List[LocalZarrEntry] # Unspecified order; does not include directories + + +class ZarrAsset(LocalDirectoryAsset[LocalZarrEntry]): """Representation of a local Zarr directory""" EXTENSIONS = [".ngff", ".zarr"] + @property + def filetree(self) -> LocalZarrEntry: + return LocalZarrEntry( + filepath=self.filepath, zarr_basepath=self.zarr_basepath, parts=() + ) + + def stat(self) -> ZarrStat: + def dirstat(self, dirpath: LocalZarrEntry) -> ZarrStat: + size = 0 + dir_md5s = {} + file_md5s = {} + files = [] + for p in dirpath.iterdir(): + if p.is_dir(): + st = dirstat(p) + size += st.size + dir_md5s[str(p)] = st.digest.value + files.extend(st.files) + else: + size += p.size + file_md5s[str(p)] = p.get_etag().value + files.append(p) + return ZarrStat( + size=size, + digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)), + files=files, + ) + + return dirstat(self.filetree) + def get_etag(self) -> Digest: """Calculate a dandi-zarr-checksum digest for the asset""" return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) @@ -621,9 +729,8 @@ def iter_upload( asset_path = metadata.setdefault("path", self.path) client = dandiset.client yield {"status": "calculating etag"} - # TODO: Only iterate over the filetree once and save the results in - # memory - filetag = self.get_etag().value + stat = self.stat() + filetag = stat.digest.value lgr.debug("Calculated dandi-zarr-checksum of %s for %s", filetag, self.filepath) digest = metadata.get("digest", {}) if "dandi:dandi-zarr-checksum" in digest: @@ -634,7 +741,6 @@ def iter_upload( ) yield {"status": "initiating upload"} lgr.debug("%s: Beginning upload", asset_path) - total_size = self.size bytes_uploaded = 0 r = client.post("/zarr/", json={"name": self.filepath.name}) zarr_id = r["zarr_id"] @@ -643,14 +749,10 @@ def iter_upload( headers={"X-Amz-ACL": "bucket-owner-full-control"}, ) as storage: for i, filebatch in enumerate( - chunked(self.iterfiles(), ZARR_UPLOAD_BATCH_SIZE), start=1 + chunked(stat.files, ZARR_UPLOAD_BATCH_SIZE), start=1 ): upload_body = [ - { - "path": p.relative_to(self.filepath).as_posix(), - "etag": get_digest(p, "md5"), - } - for p in filebatch + {"path": str(p), "etag": p.get_etag().value} for p in filebatch ] lgr.debug( "%s: Uploading Zarr file batch #%d (%s)", @@ -674,7 +776,7 @@ def iter_upload( bytes_uploaded += size yield { "status": "uploading", - "upload": 100 * bytes_uploaded / total_size, + "upload": 100 * bytes_uploaded / stat.size, "current": bytes_uploaded, } lgr.debug("%s: Completing upload of batch #%d", asset_path, i) diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 1e5e0d064..3243c972e 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -1,7 +1,9 @@ from __future__ import annotations +from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Dict +from fnmatch import fnmatchcase +from typing import Dict, Iterator, List, Tuple, TypeVar from dandischema.models import DigestType @@ -41,3 +43,136 @@ def asdict(self) -> Dict[DigestType, str]: DUMMY_DIGEST = Digest(algorithm=DigestType.dandi_etag, value=32 * "d" + "-1") + +P = TypeVar("P", bound="BasePath") + + +@dataclass +class BasePath(ABC): + parts: Tuple[str, ...] + + def __str__(self) -> str: + return "/".join(self.parts) + + @property + def name(self) -> str: + if self.is_root(): + return "" + else: + assert self.parts + return self.parts[-1] + + @abstractmethod + def _get_subpath(self: P, name: str) -> P: + ... + + def __truediv__(self: P, path: str) -> P: + p = self + for q in self._split_path(path): + p = p._get_subpath(q) + return p + + def joinpath(self: P, *paths: str) -> P: + p = self + for q in paths: + p /= q + return p + + @staticmethod + def _split_path(path: str) -> Tuple[str, ...]: + if path.startswith("/"): + raise ValueError(f"Absolute paths not allowed: {path!r}") + return tuple(q for q in path.split("/") if q) + + def is_root(self) -> bool: + return self.parts == () + + @property + def root_path(self: P) -> P: + p = self + while not p.is_root(): + p = p.parent + return p + + @property + @abstractmethod + def parent(self: P) -> P: + # The parent of the root of a filetree is itself + ... + + @property + def parents(self: P) -> Tuple[P, ...]: + ps: List[P] = [] + p = self + while not p.is_root(): + q = p.parent + ps.append(q) + p = q + return tuple(ps) + + def with_name(self: P, name: str) -> P: + return self.parent / name + + @property + def suffix(self) -> str: + i = self.name.rfind(".") + if 0 < i < len(self.name) - 1: + return self.name[i:] + else: + return "" + + @property + def suffixes(self) -> List[str]: + if self.name.endswith("."): + return [] + name = self.name.lstrip(".") + return ["." + suffix for suffix in name.split(".")[1:]] + + @property + def stem(self) -> str: + i = self.name.rfind(".") + if 0 < i < len(self.name) - 1: + return self.name[:i] + else: + return self.name + + def with_stem(self: P, stem: str) -> P: + return self.with_name(stem + self.suffix) + + def with_suffix(self: P, suffix: str) -> P: + if "/" in suffix or (suffix and not suffix.startswith(".")) or suffix == ".": + raise ValueError(f"Invalid suffix: {suffix!r}") + if not self.name: + raise ValueError("Path has an empty name") + if not self.suffix: + name = self.name + suffix + else: + name = self.name[: -len(self.suffix)] + suffix + return self.with_name(name) + + def match(self, pattern: str) -> bool: + patparts = self._split_path(pattern) + if not patparts: + raise ValueError("Empty pattern") + if len(patparts) > len(self.parts): + return False + for part, pat in zip(reversed(self.parts), reversed(patparts)): + if not fnmatchcase(part, pat): + return False + return True + + @abstractmethod + def exists(self) -> bool: + ... + + @abstractmethod + def is_file(self) -> bool: + ... + + @abstractmethod + def is_dir(self) -> bool: + ... + + @abstractmethod + def iterdir(self: P) -> Iterator[P]: + ... diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index d96d23fb2..3a535124a 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -138,3 +138,33 @@ def test_upload_zarr(local_dandi_api, tmp_path): asset.set_raw_metadata(md) md = asset.get_raw_metadata() assert md["description"] == "A modified Zarr" + + for file_src in [zf, asset]: + entries = sorted(file_src.iterfiles(include_dirs=True), key=attrgetter("parts")) + assert [str(e) for e in entries] == [ + ".zgroup", + "arr_0", + "arr_0/.zarray", + "arr_0/0", + "arr_1", + "arr_1/.zarray", + "arr_1/0", + ] + assert (file_src.filetree / ".zgroup").exists() + assert (file_src.filetree / ".zgroup").is_file() + assert not (file_src.filetree / ".zgroup").is_dir() + assert (file_src.filetree / "arr_0").exists() + assert not (file_src.filetree / "arr_0").is_file() + assert (file_src.filetree / "arr_0").is_dir() + assert not (file_src.filetree / "0").exists() + assert not (file_src.filetree / "0").is_file() + assert not (file_src.filetree / "0").is_dir() + assert not (file_src.filetree / "arr_0" / ".zgroup").exists() + assert not (file_src.filetree / "arr_0" / ".zgroup").is_file() + assert not (file_src.filetree / "arr_0" / ".zgroup").is_dir() + assert not (file_src.filetree / ".zgroup" / "0").exists() + assert not (file_src.filetree / ".zgroup" / "0").is_file() + assert not (file_src.filetree / ".zgroup" / "0").is_dir() + assert not (file_src.filetree / "arr_2" / "0").exists() + assert not (file_src.filetree / "arr_2" / "0").is_file() + assert not (file_src.filetree / "arr_2" / "0").is_dir() From e3642afa57b309f451b2e3bd2dec46d664b171d2 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 17 Jan 2022 11:35:22 -0500 Subject: [PATCH 35/56] Fixes --- dandi/dandiapi.py | 5 +++++ dandi/files.py | 26 +++++++++++++++----------- dandi/misctypes.py | 5 +++++ dandi/tests/test_files.py | 4 ++++ 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index ea8b77978..175b35780 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1605,6 +1605,11 @@ def get_etag(self) -> Digest: # TODO raise NotImplementedError + @property + def size(self) -> int: + # TODO + raise NotImplementedError + @property def modified(self) -> datetime: if not self.is_file(): diff --git a/dandi/files.py b/dandi/files.py index f1f8fa9ee..9687dbff5 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -25,7 +25,7 @@ from dandischema.digests.zarr import get_checksum from dandischema.models import BareAsset, CommonModel from dandischema.models import Dandiset as DandisetMeta -from dandischema.models import get_schema_version +from dandischema.models import DigestType, get_schema_version from pydantic import ValidationError import requests import zarr @@ -534,7 +534,7 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator[P]: @property def size(self) -> int: """The total size of the files in the directory""" - return sum(p.stat().st_size for p in self.iterfiles()) + return sum(p.size for p in self.iterfiles()) @dataclass @@ -581,9 +581,13 @@ def iterdir(self) -> Iterator[LocalZarrEntry]: def get_etag(self) -> Digest: if self.is_dir(): - return get_zarr_checksum(self.filepath, basepath=self.zarr_basepath) + return Digest.dandi_zarr( + get_zarr_checksum(self.filepath, basepath=self.zarr_basepath) + ) else: - return get_digest(self.filepath, "md5") + return Digest( + algorithm=DigestType.md5, value=get_digest(self.filepath, "md5") + ) @property def size(self) -> int: @@ -613,11 +617,11 @@ class ZarrAsset(LocalDirectoryAsset[LocalZarrEntry]): @property def filetree(self) -> LocalZarrEntry: return LocalZarrEntry( - filepath=self.filepath, zarr_basepath=self.zarr_basepath, parts=() + filepath=self.filepath, zarr_basepath=self.filepath, parts=() ) def stat(self) -> ZarrStat: - def dirstat(self, dirpath: LocalZarrEntry) -> ZarrStat: + def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: size = 0 dir_md5s = {} file_md5s = {} @@ -632,11 +636,11 @@ def dirstat(self, dirpath: LocalZarrEntry) -> ZarrStat: size += p.size file_md5s[str(p)] = p.get_etag().value files.append(p) - return ZarrStat( - size=size, - digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)), - files=files, - ) + return ZarrStat( + size=size, + digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)), + files=files, + ) return dirstat(self.filetree) diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 3243c972e..dc65b271c 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -176,3 +176,8 @@ def is_dir(self) -> bool: @abstractmethod def iterdir(self: P) -> Iterator[P]: ... + + @property + @abstractmethod + def size(self) -> int: + ... diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index 3a535124a..5efd1b092 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -5,6 +5,7 @@ import numpy as np import zarr +from .. import get_logger from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file from ..dandiapi import RemoteZarrAsset from ..files import ( @@ -16,6 +17,8 @@ find_dandi_files, ) +lgr = get_logger() + def test_find_dandi_files(tmp_path: Path) -> None: (tmp_path / dandiset_metadata_file).touch() @@ -140,6 +143,7 @@ def test_upload_zarr(local_dandi_api, tmp_path): assert md["description"] == "A modified Zarr" for file_src in [zf, asset]: + lgr.debug("Traversing %s", type(file_src).__name__) entries = sorted(file_src.iterfiles(include_dirs=True), key=attrgetter("parts")) assert [str(e) for e in entries] == [ ".zgroup", From 607455fce83293ea4bd4de44aa470291cadd7dbb Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 17 Jan 2022 12:07:39 -0500 Subject: [PATCH 36/56] Fetching Zarr entry checksum & stats Depends on: - https://github.com/dandi/dandi-archive/pull/802 - https://github.com/dandi/dandi-archive/pull/805 --- dandi/dandiapi.py | 80 +++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 175b35780..0fe6432cc 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1489,6 +1489,8 @@ class ZarrListing(BaseModel): directories: List[AnyHttpUrl] files: List[AnyHttpUrl] + checksums: Dict[str, str] + checksum: str @property def dirnames(self) -> List[str]: @@ -1499,6 +1501,12 @@ def filenames(self) -> List[str]: return [PurePosixPath(unquote(url.path)).name for url in self.files] +@dataclass +class ZarrEntryStat: + size: int + modified: datetime + + @dataclass class RemoteZarrEntry(BasePath): client: DandiAPIClient @@ -1541,17 +1549,7 @@ def _isdir(self) -> bool: raise return True else: - ppath = "".join(p + "/" for p in self.parts[:-1]) - try: - r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{ppath}") - except requests.HTTPError as e: - if e.response.status_code == 404: - raise NotFoundError( - f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" - ) - else: - raise - listing = ZarrListing.parse_obj(r) + listing = self.parent.get_listing() if self.name in listing.dirnames: return True elif self.name in listing.filenames: @@ -1582,18 +1580,7 @@ def is_dir(self) -> bool: return False def iterdir(self) -> Iterator["RemoteZarrEntry"]: - path = "".join(p + "/" for p in self.parts) - try: - r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{path}") - except requests.HTTPError as e: - if e.response.status_code == 404: - raise NotFoundError( - f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" - " is not a directory" - ) - else: - raise - listing = ZarrListing.parse_obj(r) + listing = self.get_listing() for name in listing.dirnames: if name == "." or name == "..": continue @@ -1602,23 +1589,56 @@ def iterdir(self) -> Iterator["RemoteZarrEntry"]: yield self._get_subpath(name, isdir=False) def get_etag(self) -> Digest: - # TODO - raise NotImplementedError + if self.is_root(): + algorithm = models.DigestType.dandi_zarr_checksum + value = self.get_listing().checksum + else: + listing = self.parent.get_listing() + if self.name in listing.dirnames: + algorithm = models.DigestType.dandi_zarr_checksum + elif self.name in listing.filenames: + algorithm = models.DigestType.md5 + else: + raise NotFoundError( + f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" + ) + value = listing.checksums[self.name] + return Digest(algorithm=algorithm, value=value) @property def size(self) -> int: - # TODO - raise NotImplementedError + return self.stat().size @property def modified(self) -> datetime: + return self.stat().modified + + def stat(self) -> ZarrEntryStat: if not self.is_file(): # TODO: Should we forego this check and let queries on directories # fail with KeyError? raise RuntimeError("Directories in Zarrs do not have 'modified' timestamps") - r = self.client.get( + r = self.client.request( + "HEAD", f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}", - headers={"Range": "bytes=0-0"}, json_resp=False, + allow_redirects=True, + ) + return ZarrEntryStat( + size=int(r.headers["Content-Length"]), + modified=dateutil.parser.parse(r.headers["Last-Modified"]), ) - return dateutil.parser.parse(r.headers["Last-Modified"]) + + def get_listing(self) -> ZarrListing: + path = "".join(p + "/" for p in self.parts) + try: + r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{path}") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" + " is not a directory" + ) + else: + raise + return ZarrListing.parse_obj(r) From e2ca2670646a397cc33f5d1c4ab42a6dbde390b7 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 12:22:42 -0500 Subject: [PATCH 37/56] Test uploading a Zarr containing an empty directory --- dandi/tests/test_upload.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index f2f603949..927352460 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -293,3 +293,24 @@ def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): dspath / "sample.zarr", tmp_path / "download" / dandiset_id / "sample.zarr", ) + + +def test_upload_zarr_with_empty_dir(local_dandi_api, monkeypatch, tmp_path): + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + (tmp_path / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(tmp_path / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + (tmp_path / "sample.zarr" / "empty").mkdir() + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + upload( + paths=[], + dandiset_path=tmp_path, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + ) + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.is_zarr() + assert not asset.is_blob() + assert asset.path == "sample.zarr" + assert not (asset.filetree / "empty").exists() From e179a49bc394421fee15e07b6abc9da97e1de980 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 12:32:01 -0500 Subject: [PATCH 38/56] Test `upload --sync` with Zarrs --- dandi/tests/test_upload.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 927352460..f8345ff2c 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -172,6 +172,17 @@ def test_upload_sync_folder(mocker, text_dandiset): text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt") +def test_upload_sync_zarr(mocker, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + zarr.save(zarr_dandiset.dspath / "identity.zarr", np.eye(5)) + confirm_mock = mocker.patch("click.confirm", return_value=True) + zarr_dandiset.upload(sync=True) + confirm_mock.assert_called_with("Delete 1 asset on server?") + zarr_dandiset.dandiset.get_asset_by_path("identity.zarr") + with pytest.raises(NotFoundError): + zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") + + def test_upload_invalid_metadata( local_dandi_api, monkeypatch, simple1_nwb_metadata, tmp_path ): From 0a24a8c33ba403460b6a24eec5ec927f1c267496 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 14:25:52 -0500 Subject: [PATCH 39/56] Docstrings --- dandi/dandiapi.py | 89 +++++++++++++++++++++++++++++++----- dandi/files.py | 40 ++++++++++++++-- dandi/misctypes.py | 56 ++++++++++++++++++++++- docs/source/modref/files.rst | 1 + 4 files changed, 170 insertions(+), 16 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 0fe6432cc..13ef408a4 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1468,11 +1468,20 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: @property def filetree(self) -> "RemoteZarrEntry": + """ + The `RemoteZarrEntry` for the root of the hierarchy of files within the + Zarr + """ return RemoteZarrEntry( client=self.client, zarr_id=self.zarr, parts=(), _known_dir=True ) def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: + """ + Returns a generator of all `RemoteZarrEntry`\\s within the Zarr. By + default, only instances for files are produced, unless ``include_dirs`` + is true. + """ dirs = deque([self.filetree]) while dirs: for p in dirs.popleft().iterdir(): @@ -1485,31 +1494,51 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: class ZarrListing(BaseModel): - # Not for public use + """Information about a directory within a `RemoteZarrAsset`""" + #: API URLs for the listings of the directory's subdirectories directories: List[AnyHttpUrl] + #: API URLs for downloading the files in the directory files: List[AnyHttpUrl] + #: The checksums (MD5 or Dandi Zarr checksum, as appropriate) for the + #: directory's entries, as a mapping from basenames to checksums checksums: Dict[str, str] + #: The Dandi Zarr checksum for the directory checksum: str @property def dirnames(self) -> List[str]: + """The basenames of the directory URLs in `directories`""" return [PurePosixPath(unquote(url.path)).name for url in self.directories] @property def filenames(self) -> List[str]: + """The basenames of the file URLs in `files`""" return [PurePosixPath(unquote(url.path)).name for url in self.files] @dataclass class ZarrEntryStat: + """ + Combined size & timestamp information for a file in a `RemoteZarrAsset` + """ + + #: The size of the file size: int + #: The time at which the file was last modified modified: datetime @dataclass class RemoteZarrEntry(BasePath): + """ + A file or directory within a `RemoteZarrAsset`. Implements + `~dandi.misctypes.BasePath`. + """ + + #: The `DandiAPIClient` instance used for API requests client: DandiAPIClient + #: The ID of the Zarr backing the asset zarr_id: str _known_dir: Optional[bool] = field(default=None, compare=False, repr=False) @@ -1589,6 +1618,13 @@ def iterdir(self) -> Iterator["RemoteZarrEntry"]: yield self._get_subpath(name, isdir=False) def get_etag(self) -> Digest: + """ + Retrieve the etag digest for the entry. If the entry is a directory, + the algorithm will be the Dandi Zarr checksum algorithm; if it is a + file, it will be MD5. + + :raises NotFoundError: if the path does not exist in the Zarr asset + """ if self.is_root(): algorithm = models.DigestType.dandi_zarr_checksum value = self.get_listing().checksum @@ -1607,37 +1643,66 @@ def get_etag(self) -> Digest: @property def size(self) -> int: + """ + The size of the entry, which must be a file + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ return self.stat().size @property def modified(self) -> datetime: + """ + The time at which the entry (which must be a file) was last modified + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ return self.stat().modified def stat(self) -> ZarrEntryStat: + """ + Return combined size & timestamp information for the entry, which must + be a file + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ if not self.is_file(): - # TODO: Should we forego this check and let queries on directories - # fail with KeyError? - raise RuntimeError("Directories in Zarrs do not have 'modified' timestamps") - r = self.client.request( - "HEAD", - f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}", - json_resp=False, - allow_redirects=True, - ) + raise ValueError("Cannot stat directories in Zarrs") + try: + r = self.client.request( + "HEAD", + f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}", + json_resp=False, + allow_redirects=True, + ) + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist" + ) + else: + raise return ZarrEntryStat( size=int(r.headers["Content-Length"]), modified=dateutil.parser.parse(r.headers["Last-Modified"]), ) def get_listing(self) -> ZarrListing: + """ + Return the `ZarrListing` for the entry, which must be a directory + + :raises NotFoundError: if the path does not exist in the Zarr asset + """ path = "".join(p + "/" for p in self.parts) try: r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{path}") except requests.HTTPError as e: if e.response.status_code == 404: raise NotFoundError( - f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" - " is not a directory" + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist" ) else: raise diff --git a/dandi/files.py b/dandi/files.py index 9687dbff5..4bd13d90f 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -56,7 +56,7 @@ @dataclass class DandiFile(ABC): - """Base class for local files & directories of interest to DANDI""" + """Abstract base class for local files & directories of interest to DANDI""" #: The path to the actual file or directory on disk filepath: Path @@ -78,6 +78,7 @@ def get_metadata( digest: Optional[Digest] = None, ignore_errors: bool = True, ) -> CommonModel: + """Return the Dandi metadata for the file""" ... @abstractmethod @@ -86,6 +87,9 @@ def get_validation_errors( schema_version: Optional[str] = None, devel_debug: bool = False, ) -> List[str]: + """ + Attempt to validate the file and return a list of errors encountered + """ ... @@ -97,6 +101,7 @@ def get_metadata( digest: Optional[Digest] = None, ignore_errors: bool = True, ) -> DandisetMeta: + """Return the Dandiset metadata inside the file""" with open(self.filepath) as f: meta = yaml_load(f, typ="safe") return DandisetMeta.unvalidated(**meta) @@ -169,6 +174,7 @@ def get_metadata( digest: Optional[Digest] = None, ignore_errors: bool = True, ) -> BareAsset: + """Return the Dandi metadata for the asset""" ... # TODO: @validate_cache.memoize_path @@ -511,12 +517,17 @@ def get_metadata( class LocalDirectoryAsset(LocalAsset, Generic[P]): """ Representation of a directory that can be uploaded to a DANDI Archive as - a single asset of a Dandiset + a single asset of a Dandiset. It is generic in ``P``, bound to + `dandi.misctypes.BasePath`. """ @property @abstractmethod def filetree(self) -> P: + """ + The path object for the root of the hierarchy of files within the + directory + """ ... def iterfiles(self, include_dirs: bool = False) -> Iterator[P]: @@ -539,6 +550,8 @@ def size(self) -> int: @dataclass class LocalZarrEntry(BasePath): + """A file or directory within a `ZarrAsset`""" + #: The path to the actual file or directory on disk filepath: Path #: The path to the root of the Zarr file tree @@ -580,6 +593,11 @@ def iterdir(self) -> Iterator[LocalZarrEntry]: yield self._get_subpath(p.name) def get_etag(self) -> Digest: + """ + Calculate the etag digest for the entry. If the entry is a directory, + the algorithm will be the Dandi Zarr checksum algorithm; if it is a + file, it will be MD5. + """ if self.is_dir(): return Digest.dandi_zarr( get_zarr_checksum(self.filepath, basepath=self.zarr_basepath) @@ -591,6 +609,10 @@ def get_etag(self) -> Digest: @property def size(self) -> int: + """ + The size of the entry. For a directory, this is the total size of all + entries within it. + """ if self.is_dir(): return sum(p.size for p in self.iterdir()) else: @@ -598,15 +620,21 @@ def size(self) -> int: @property def modified(self) -> datetime: + """The time at which the entry was last modified""" # TODO: Should this be overridden for directories? return ensure_datetime(self.filepath.stat().st_mtime) @dataclass class ZarrStat: + """Details about a Zarr asset""" + + #: The total size of the asset size: int + #: The Dandi Zarr checksum of the asset digest: Digest - files: List[LocalZarrEntry] # Unspecified order; does not include directories + #: A list of all files in the asset in unspecified order + files: List[LocalZarrEntry] class ZarrAsset(LocalDirectoryAsset[LocalZarrEntry]): @@ -616,11 +644,17 @@ class ZarrAsset(LocalDirectoryAsset[LocalZarrEntry]): @property def filetree(self) -> LocalZarrEntry: + """ + The `LocalZarrEntry` for the root of the hierarchy of files within the + Zarr asset + """ return LocalZarrEntry( filepath=self.filepath, zarr_basepath=self.filepath, parts=() ) def stat(self) -> ZarrStat: + """Return various details about the Zarr asset""" + def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: size = 0 dir_md5s = {} diff --git a/dandi/misctypes.py b/dandi/misctypes.py index dc65b271c..980b47083 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -1,3 +1,5 @@ +"""Miscellaneous public classes""" + from __future__ import annotations from abc import ABC, abstractmethod @@ -42,6 +44,8 @@ def asdict(self) -> Dict[DigestType, str]: return {self.algorithm: self.value} +#: Placeholder digest used in some situations where a digest is required but +#: not actually relevant and would be too expensive to calculate DUMMY_DIGEST = Digest(algorithm=DigestType.dandi_etag, value=32 * "d" + "-1") P = TypeVar("P", bound="BasePath") @@ -49,6 +53,15 @@ def asdict(self) -> Dict[DigestType, str]: @dataclass class BasePath(ABC): + """ + An abstract base class for path-like objects that can be traversed with the + ``/`` operator *à la* `pathlib.Path` (though, unlike `pathlib.Path` + instances, "dividing" by another non-string path is not allowed). All + paths are treated as forward-slash-separated relative paths under an + empty-name "root" path. + """ + + #: The path components of the object parts: Tuple[str, ...] def __str__(self) -> str: @@ -56,6 +69,10 @@ def __str__(self) -> str: @property def name(self) -> str: + """ + The basename of the path object. When the object represents the root + of a path hierarchy, this is the empty string. + """ if self.is_root(): return "" else: @@ -64,6 +81,12 @@ def name(self) -> str: @abstractmethod def _get_subpath(self: P, name: str) -> P: + """ + Return the path immediately under the instance with the given name. A + name of ``"."`` should cause ``self`` to be returned, and a name of + ``".."`` should cause ``self.parent`` to be returned. An empty name or + a name containing a forward slash should result in a `ValueError`. + """ ... def __truediv__(self: P, path: str) -> P: @@ -73,6 +96,10 @@ def __truediv__(self: P, path: str) -> P: return p def joinpath(self: P, *paths: str) -> P: + """ + Combine the path with each name or relative path in ``paths`` using the + ``/`` operator + """ p = self for q in paths: p /= q @@ -80,15 +107,20 @@ def joinpath(self: P, *paths: str) -> P: @staticmethod def _split_path(path: str) -> Tuple[str, ...]: + """Split a path into its path components""" if path.startswith("/"): raise ValueError(f"Absolute paths not allowed: {path!r}") return tuple(q for q in path.split("/") if q) def is_root(self) -> bool: + """ + Returns true if this path object represents the root of its hierarchy + """ return self.parts == () @property def root_path(self: P) -> P: + """The root of the path object's hierarchy""" p = self while not p.is_root(): p = p.parent @@ -97,11 +129,18 @@ def root_path(self: P) -> P: @property @abstractmethod def parent(self: P) -> P: - # The parent of the root of a filetree is itself + """ + The parent path of the object. The parent of the root of a path + hierarchy is itself. + """ ... @property def parents(self: P) -> Tuple[P, ...]: + """ + A tuple of the path's ancestors, starting at the parent and going up to + (but not including) the root of the hierarchy + """ ps: List[P] = [] p = self while not p.is_root(): @@ -111,10 +150,12 @@ def parents(self: P) -> Tuple[P, ...]: return tuple(ps) def with_name(self: P, name: str) -> P: + """Equivalent to ``p.parent / name``""" return self.parent / name @property def suffix(self) -> str: + """The final file extension of the basename, if any""" i = self.name.rfind(".") if 0 < i < len(self.name) - 1: return self.name[i:] @@ -123,6 +164,7 @@ def suffix(self) -> str: @property def suffixes(self) -> List[str]: + """A list of the basename's file extensions""" if self.name.endswith("."): return [] name = self.name.lstrip(".") @@ -130,6 +172,7 @@ def suffixes(self) -> List[str]: @property def stem(self) -> str: + """The basename without its final file extension, if any""" i = self.name.rfind(".") if 0 < i < len(self.name) - 1: return self.name[:i] @@ -137,9 +180,11 @@ def stem(self) -> str: return self.name def with_stem(self: P, stem: str) -> P: + """Returns a new path with the stem changed""" return self.with_name(stem + self.suffix) def with_suffix(self: P, suffix: str) -> P: + """Returns a new path with the final file extension changed""" if "/" in suffix or (suffix and not suffix.startswith(".")) or suffix == ".": raise ValueError(f"Invalid suffix: {suffix!r}") if not self.name: @@ -151,6 +196,7 @@ def with_suffix(self: P, suffix: str) -> P: return self.with_name(name) def match(self, pattern: str) -> bool: + """Tests whether the path matches the given glob pattern""" patparts = self._split_path(pattern) if not patparts: raise ValueError("Empty pattern") @@ -163,21 +209,29 @@ def match(self, pattern: str) -> bool: @abstractmethod def exists(self) -> bool: + """True iff the resource at the given path exists""" ... @abstractmethod def is_file(self) -> bool: + """True if the resource at the given path exists and is a file""" ... @abstractmethod def is_dir(self) -> bool: + """True if the resource at the given path exists and is a directory""" ... @abstractmethod def iterdir(self: P) -> Iterator[P]: + """ + Returns a generator of the paths under the instance, which must be a + directory + """ ... @property @abstractmethod def size(self) -> int: + """The size of the resource at the path""" ... diff --git a/docs/source/modref/files.rst b/docs/source/modref/files.rst index cd80ddc23..c5a9abe77 100644 --- a/docs/source/modref/files.rst +++ b/docs/source/modref/files.rst @@ -2,3 +2,4 @@ =============== .. automodule:: dandi.files + :show-inheritance: From 8aa0c4c289d35fa418422d719f1d06a077c10d5c Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 16:13:53 -0500 Subject: [PATCH 40/56] First complete draft of Zarr download --- dandi/cli/base.py | 19 ++ dandi/cli/cmd_download.py | 10 +- dandi/cli/cmd_upload.py | 20 +-- dandi/dandiapi.py | 31 ++++ dandi/download.py | 294 ++++++++++++++++++++++++++----- dandi/tests/test_download.py | 80 ++++++++- docs/source/cmdline/download.rst | 5 +- setup.cfg | 1 + 8 files changed, 391 insertions(+), 69 deletions(-) diff --git a/dandi/cli/base.py b/dandi/cli/base.py index e2a2a4238..bf2b890d5 100644 --- a/dandi/cli/base.py +++ b/dandi/cli/base.py @@ -11,6 +11,25 @@ # Aux common functionality +class IntColonInt(click.ParamType): + name = "int:int" + + def convert(self, value, param, ctx): + if isinstance(value, str): + v1, colon, v2 = value.partition(":") + try: + v1 = int(v1) + v2 = int(v2) if colon else None + except ValueError: + self.fail("Value must be of the form `N[:M]`", param, ctx) + return (v1, v2) + else: + return value + + def get_metavar(self, param): + return "N[:M]" + + # ???: could make them always available but hidden # via hidden=True. def devel_option(*args, **kwargs): diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py index 6355fcf35..39b1a0756 100644 --- a/dandi/cli/cmd_download.py +++ b/dandi/cli/cmd_download.py @@ -2,7 +2,7 @@ import click -from .base import instance_option, map_to_click_exceptions +from .base import IntColonInt, instance_option, map_to_click_exceptions from ..consts import known_instances, known_instances_rev from ..dandiarchive import _dandi_url_parser, parse_dandi_url @@ -70,8 +70,9 @@ def get_metavar(self, param): @click.option( "-J", "--jobs", - help="Number of parallel download jobs.", - default=6, # TODO: come up with smart auto-scaling etc + type=IntColonInt(), + help="Number of parallel download jobs and, optionally number of subjobs per Zarr asset", + default="6", # TODO: come up with smart auto-scaling etc show_default=True, ) @click.option( @@ -141,7 +142,8 @@ def download( output_dir, existing=existing, format=format, - jobs=jobs, + jobs=jobs[0], + jobs_per_zarr=jobs[1], get_metadata="dandiset.yaml" in download_types, get_assets="assets" in download_types, sync=sync, diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1139d01eb..ad200c8a7 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -1,6 +1,7 @@ import click from .base import ( + IntColonInt, devel_debug_option, devel_option, instance_option, @@ -8,25 +9,6 @@ ) -class IntColonInt(click.ParamType): - name = "int:int" - - def convert(self, value, param, ctx): - if isinstance(value, str): - v1, colon, v2 = value.partition(":") - try: - v1 = int(v1) - v2 = int(v2) if colon else None - except ValueError: - self.fail("Value must be of the form `N[:M]`", param, ctx) - return (v1, v2) - else: - return value - - def get_metavar(self, param): - return "N[:M]" - - @click.command() # @dandiset_path_option( # help="Top directory (local) of the dandiset. Files will be uploaded with " diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 13ef408a4..8d0ed3cf6 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1707,3 +1707,34 @@ def get_listing(self) -> ZarrListing: else: raise return ZarrListing.parse_obj(r) + + def get_download_file_iter( + self, chunk_size: int = MAX_CHUNK_SIZE + ) -> Callable[..., Iterator[bytes]]: + """ + Returns a function that when called (optionally with an offset into the + file to start downloading at) returns a generator of chunks of the file + """ + if not self.is_file(): + raise RuntimeError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" + " is not a file" + ) + + url = self.client.get_url(f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}") + + def downloader(start_at: int = 0) -> Iterator[bytes]: + lgr.debug("Starting download from %s", url) + headers = None + if start_at > 0: + headers = {"Range": f"bytes={start_at}-"} + result = self.client.session.get(url, stream=True, headers=headers) + # TODO: apparently we might need retries here as well etc + # if result.status_code not in (200, 201): + result.raise_for_status() + for chunk in result.iter_content(chunk_size=chunk_size): + if chunk: # could be some "keep alive"? + yield chunk + lgr.info("File %s in Zarr %s successfully downloaded", self, self.zarr_id) + + return downloader diff --git a/dandi/download.py b/dandi/download.py index 9115d9da1..e3c510948 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1,3 +1,6 @@ +from collections import Counter +from dataclasses import dataclass, field +from enum import Enum import hashlib import json import os @@ -7,12 +10,16 @@ from shutil import rmtree import sys import time +from typing import Dict, Iterator, Optional, Tuple +from dandischema.models import DigestType import humanize +from interleave import FINISH_CURRENT, interleave import requests from . import get_logger from .consts import RETRY_STATUSES, dandiset_metadata_file +from .dandiapi import RemoteZarrAsset from .dandiarchive import DandisetURL, MultiAssetURL, SingleAssetURL, parse_dandi_url from .dandiset import Dandiset from .exceptions import NotFoundError @@ -40,6 +47,7 @@ def download( format="pyout", existing="error", jobs=1, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -92,6 +100,7 @@ def download( existing=existing, get_metadata=get_metadata, get_assets=get_assets, + jobs_per_zarr=jobs_per_zarr, **kw, ) @@ -163,6 +172,7 @@ def download_generator( existing="error", get_metadata=True, get_assets=True, + jobs_per_zarr=None, ): """A generator for downloads of files, folders, or entire dandiset from DANDI (as identified by URL) @@ -197,10 +207,6 @@ def download_generator( return for asset in assets: - if asset.is_zarr(): - raise NotImplementedError( - "Download of Zarr assets is not yet implemented" - ) path = asset.path.lstrip("/") # make into relative path path = op.normpath(path) if not isinstance(parsed_url, DandisetURL): @@ -223,41 +229,54 @@ def download_generator( yield {"path": path, "status": "error", "message": str(e)} continue d = metadata.get("digest", {}) - if "dandi:dandi-etag" in d: - digests = {"dandi-etag": d["dandi:dandi-etag"]} - else: - raise RuntimeError( - f"dandi-etag not available for asset. Known digests: {d}" - ) - try: - digests["sha256"] = d["dandi:sha2-256"] - except KeyError: - pass - downloader = asset.get_download_file_iter() + if asset.is_blob(): + if "dandi:dandi-etag" in d: + digests = {"dandi-etag": d["dandi:dandi-etag"]} + else: + raise RuntimeError( + f"dandi-etag not available for asset. Known digests: {d}" + ) + try: + digests["sha256"] = d["dandi:sha2-256"] + except KeyError: + pass + try: + mtime = ensure_datetime(metadata["blobDateModified"]) + except KeyError: + mtime = None + if mtime is None: + lgr.warning( + "Asset %s is missing blobDateModified metadata field", + asset.path, + ) + mtime = asset.modified + _download_generator = _download_file( + asset.get_download_file_iter(), + download_path, + toplevel_path=output_path, + # size and modified generally should be there but better to + # redownload than to crash + size=asset.size, + mtime=mtime, + existing=existing, + digests=digests, + ) - try: - mtime = metadata["blobDateModified"] - except KeyError: - mtime = None - if mtime is None: - lgr.warning( - "Asset %s is missing blobDateModified metadata field", - asset.path, + else: + assert asset.is_zarr(), f"Asset {asset.path} is neither blob nor Zarr" + if not isinstance(asset, RemoteZarrAsset): + raise NotImplementedError( + "Downloading a Zarr asset identified by a URL without" + " Dandiset details is not yet implemented" + ) + _download_generator = _download_zarr( + asset, + download_path, + toplevel_path=output_path, + existing=existing, + jobs=jobs_per_zarr, ) - mtime = asset.modified - - _download_generator = _download_file( - downloader, - download_path, - toplevel_path=output_path, - # size and modified generally should be there but better to redownload - # than to crash - size=asset.size, - mtime=mtime, - existing=existing, - digests=digests, - ) if yield_generator_for_fields: yield {"path": path, yield_generator_for_fields: _download_generator} @@ -431,14 +450,21 @@ def _download_file( existing="error", digests=None, ): - """Common logic for downloading a single file + """ + Common logic for downloading a single file. - Generator downloader: + Yields progress records that take the following forms:: - TODO: describe expected records it should yield - - progress - - error - - completion + {"status": "skipped", "message": ""} + {"size": } + {"status": "downloading"} + {"done": [, "done%": ]} + {"status": "error", "message": ""} + {"checksum": "differs", "status": "error", "message": ""} + {"checksum": "ok"} + {"checksum": "-"} # No digests were provided + {"status": "setting mtime"} + {"status": "done"} Parameters ---------- @@ -484,7 +510,17 @@ def _download_file( "%s is in git-annex, and hash does not match hash on server; redownloading", path, ) - elif get_digest(path, "dandi-etag") == digests["dandi-etag"]: + elif ( + "dandi-etag" in digests + and get_digest(path, "dandi-etag") == digests["dandi-etag"] + ): + yield _skip_file("already exists") + return + elif ( + "dandi-etag" not in digests + and "md5" in digests + and get_digest(path, "md5") == digests["md5"] + ): yield _skip_file("already exists") return else: @@ -701,3 +737,175 @@ def __exit__(self, exc_type, exc_value, traceback): def append(self, blob): self.fp.write(blob) + + +def _download_zarr( + asset: RemoteZarrAsset, + download_path: str, + toplevel_path: str, + existing: str, + jobs: Optional[int] = None, +) -> Iterator[dict]: + download_gens = {} + for entry in asset.iterfiles(): + etag = entry.get_etag() + assert etag.algorithm is DigestType.md5 + stat = entry.stat() + download_gens[entry.path] = _download_file( + entry.get_download_file_iter(), + op.join(download_path, op.normpath(str(entry))), + toplevel_path=toplevel_path, + size=stat.size, + mtime=stat.modified, + existing=existing, + digests={"md5": etag.value}, + ) + pc = ProgressCombiner(zarr_size=asset.size, file_qty=len(download_gens)) + with interleave( + [pairing(p, gen) for p, gen in download_gens.items()], + onerror=FINISH_CURRENT, + max_workers=jobs or 4, + ) as it: + for path, status in it: + for out in pc.feed(path, status): + if out == {"status": "done"}: + break + else: + yield out + else: + return + # TODO: Delete local files not in remote Zarr + yield {"status": "done"} + + +def pairing(p: str, gen: Iterator[dict]) -> Iterator[Tuple[str, dict]]: + for d in gen: + yield (p, d) + + +DLState = Enum("DLState", "STARTING DOWNLOADING SKIPPED ERROR CHECKSUM_ERROR DONE") + + +@dataclass +class DownloadProgress: + state: DLState = DLState.STARTING + downloaded: int = 0 + size: Optional[int] = None + + +@dataclass +class ProgressCombiner: + zarr_size: int + file_qty: int + files: Dict[str, DownloadProgress] = field(default_factory=dict) + #: Total size of all files that were not skipped and did not error out + #: during download + maxsize: int = 0 + prev_status: str = "" + yielded_size: bool = False + + @property + def message(self) -> str: + done = 0 + errored = 0 + skipped = 0 + for s in self.files.values(): + if s.state is DLState.DONE: + done += 1 + elif s.state in (DLState.ERROR, DLState.CHECKSUM_ERROR): + errored += 1 + elif s.state is DLState.SKIPPED: + skipped += 1 + parts = [] + if done: + parts.append(f"{done} done") + if errored: + parts.append(f"{errored} errored") + if skipped: + parts.append(f"{skipped} skipped") + return ", ".join(parts) + + def get_done(self) -> dict: + total_downloaded = sum( + s.downloaded + for s in self.files.values() + if s.state in (DLState.DOWNLOADING, DLState.CHECKSUM_ERROR, DLState.DONE) + ) + return { + "done": total_downloaded, + "done%": total_downloaded / self.maxsize * 100, + } + + def set_status(self, statusdict: dict) -> None: + state_qtys = Counter(s.state for s in self.files.values()) + total = len(self.files) + if ( + total == self.file_qty + and state_qtys[DLState.STARTING] == state_qtys[DLState.DOWNLOADING] == 0 + ): + # All files have finished + if state_qtys[DLState.ERROR] or state_qtys[DLState.CHECKSUM_ERROR]: + new_status = "error" + elif state_qtys[DLState.DONE]: + new_status = "done" + else: + new_status = "skipped" + elif total - state_qtys[DLState.STARTING] - state_qtys[DLState.SKIPPED] > 0: + new_status = "downloading" + else: + new_status = "" + if new_status != self.prev_status: + statusdict["status"] = new_status + self.prev_status = new_status + + def feed(self, path: str, status: dict) -> Iterator[dict]: + keys = list(status.keys()) + self.files.setdefault(path, DownloadProgress()) + if status.get("status") == "skipped": + self.files[path].state = DLState.SKIPPED + out = {"message": self.message} + self.set_status(out) + yield out + elif keys == ["size"]: + if not self.yielded_size: + yield {"size": self.zarr_size} + self.yielded_size = True + self.files[path].size = status["size"] + self.maxsize += status["size"] + if any(s.state is DLState.DOWNLOADING for s in self.files.values()): + yield self.get_done() + elif status == {"status": "downloading"}: + self.files[path].state = DLState.DOWNLOADING + out = {} + self.set_status(out) + if out: + yield out + elif "done" in status: + self.files[path].downloaded = status["done"] + yield self.get_done() + elif status.get("status") == "error": + if "checksum" in status: + self.files[path].state = DLState.CHECKSUM_ERROR + out = {} + else: + self.files[path].state = DLState.ERROR + sz = self.files[path].size + if sz is not None: + self.maxsize -= sz + out = self.get_done() + out["message"] = self.message + self.set_status(out) + yield out + elif keys == ["checksum"]: + pass + elif status == {"status": "setting mtime"}: + pass + elif status == {"status": "done"}: + self.files[path].state = DLState.DONE + out = {"message": self.message} + self.set_status(out) + yield out + else: + lgr.warning( + "Unexpected download status dict for %r received: %r", path, status + ) diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index cb2d4f273..7e1db5e69 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -3,6 +3,7 @@ import os.path as op import re from shutil import rmtree +from typing import List, Tuple import numpy as np import pytest @@ -12,7 +13,7 @@ from .skip import mark from ..consts import DRAFT, dandiset_metadata_file from ..dandiarchive import DandisetURL -from ..download import download, download_generator +from ..download import ProgressCombiner, download, download_generator from ..upload import upload from ..utils import assert_dirtrees_eq, list_paths @@ -335,3 +336,80 @@ def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): download(d.version_api_url, tmp_path / "download", existing="overwrite-different") assert (dd / "sample.zarr").is_file() assert (dd / "sample.zarr").read_text() == "This is not a Zarr.\n" + + +@pytest.mark.parametrize( + "file_qty,inputs,expected", + [ + ( + 1, + [ + ("lonely.txt", {"size": 42}), + ("lonely.txt", {"status": "downloading"}), + ("lonely.txt", {"done": 0, "done%": 0.0}), + ("lonely.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("lonely.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("lonely.txt", {"done": 42, "done%": 100.0}), + ("lonely.txt", {"checksum": "ok"}), + ("lonely.txt", {"status": "setting mtime"}), + ("lonely.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "done", "message": "1 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("apple.txt", {"status": "setting mtime"}), + ("banana.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"message": "1 done"}, + {"status": "done", "message": "2 done"}, + ], + ), + ], +) +def test_progress_combiner( + file_qty: int, inputs: List[Tuple[str, dict]], expected: List[dict] +) -> None: + pc = ProgressCombiner(zarr_size=69105, file_qty=file_qty) + outputs = [] + for path, status in inputs: + outputs.extend(pc.feed(path, status)) + assert outputs == expected diff --git a/docs/source/cmdline/download.rst b/docs/source/cmdline/download.rst index e9001ced4..2536b71ff 100644 --- a/docs/source/cmdline/download.rst +++ b/docs/source/cmdline/download.rst @@ -31,9 +31,10 @@ Options DANDI instance to download from [default: ``dandi``] -.. option:: -J, --jobs +.. option:: -J, --jobs N[:M] - Number of parallel download jobs [default: 6] + Number of parallel download jobs and, optionally, number of upload subjobs + per Zarr asset job [default: 6:4] .. option:: -o, --output-dir diff --git a/setup.cfg b/setup.cfg index 1182e7038..73f0290c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ install_requires = #hdmf != 1.1.2 humanize importlib-metadata; python_version < "3.8" + interleave ~= 0.1 joblib keyring keyrings.alt From a22dd955f1d8f9dff55dc0622a89e8a49fa73b35 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 16:39:10 -0500 Subject: [PATCH 41/56] More tests of ProgressCombiner --- dandi/download.py | 14 ++- dandi/tests/test_download.py | 231 +++++++++++++++++++++++++++++++++++ 2 files changed, 239 insertions(+), 6 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index e3c510948..0e8b73949 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -768,7 +768,7 @@ def _download_zarr( ) as it: for path, status in it: for out in pc.feed(path, status): - if out == {"status": "done"}: + if out.get("status") == "done": break else: yield out @@ -886,16 +886,18 @@ def feed(self, path: str, status: dict) -> Iterator[dict]: elif status.get("status") == "error": if "checksum" in status: self.files[path].state = DLState.CHECKSUM_ERROR - out = {} + out = {"message": self.message} + self.set_status(out) + yield out else: self.files[path].state = DLState.ERROR + out = {"message": self.message} + self.set_status(out) + yield out sz = self.files[path].size if sz is not None: self.maxsize -= sz - out = self.get_done() - out["message"] = self.message - self.set_status(out) - yield out + yield self.get_done() elif keys == ["checksum"]: pass elif status == {"status": "setting mtime"}: diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 7e1db5e69..894d63324 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -403,6 +403,237 @@ def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): {"status": "done", "message": "2 done"}, ], ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"status": "setting mtime"}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 40, "done%": 40 / 169 * 100}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 82, "done%": 82 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"message": "1 done"}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 169 / 169 * 100}, + {"status": "done", "message": "2 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"size": 127}), + ("banana.txt", {"status": "downloading"}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 42 / 42 * 100}, + {"message": "1 done"}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 82, "done%": 82 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"status": "done", "message": "2 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"status": "error", "message": "Internet broke"}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"message": "1 errored"}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "error", "message": "1 done, 1 errored"}, + ], + ), + ( + 1, + [("lonely.txt", {"status": "skipped", "message": "already exists"})], + [{"status": "skipped", "message": "1 skipped"}], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"status": "skipped", "message": "already exists"}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"message": "1 skipped"}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "done", "message": "1 done, 1 skipped"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("apple.txt", {"status": "setting mtime"}), + ( + "banana.txt", + { + "checksum": "differs", + "status": "error", + "message": "Checksum differs", + }, + ), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"message": "1 errored"}, + {"status": "error", "message": "1 done, 1 errored"}, + ], + ), + ( + 3, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("coconut", {"status": "skipped", "message": "already exists"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ( + "apple.txt", + { + "checksum": "differs", + "status": "error", + "message": "Checksum differs", + }, + ), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"message": "1 skipped"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"message": "1 errored, 1 skipped"}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"status": "error", "message": "1 done, 1 errored, 1 skipped"}, + ], + ), ], ) def test_progress_combiner( From 5aa819d66155de5d202ba9404e95b930605ed8af Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Wed, 19 Jan 2022 17:05:31 -0500 Subject: [PATCH 42/56] Fixes --- dandi/cli/tests/test_download.py | 7 +++++++ dandi/download.py | 12 +++++++++--- dandi/tests/test_download.py | 10 +--------- dandi/tests/test_upload.py | 8 +------- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/dandi/cli/tests/test_download.py b/dandi/cli/tests/test_download.py index a128880c5..1fc5757b6 100644 --- a/dandi/cli/tests/test_download.py +++ b/dandi/cli/tests/test_download.py @@ -18,6 +18,7 @@ def test_download_defaults(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -34,6 +35,7 @@ def test_download_all_types(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -50,6 +52,7 @@ def test_download_metadata_only(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=False, sync=False, @@ -66,6 +69,7 @@ def test_download_assets_only(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=False, get_assets=True, sync=False, @@ -94,6 +98,7 @@ def test_download_gui_instance_in_dandiset(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -113,6 +118,7 @@ def test_download_api_instance_in_dandiset(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -136,6 +142,7 @@ def test_download_url_instance_match(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, diff --git a/dandi/download.py b/dandi/download.py index 0e8b73949..888418cc5 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -552,8 +552,14 @@ def _download_file( if size is not None: yield {"size": size} - destdir = op.dirname(path) - os.makedirs(destdir, exist_ok=True) + destdir = Path(op.dirname(path)) + for p in (destdir, *destdir.parents): + if p.is_file(): + p.unlink() + break + elif p.is_dir(): + break + destdir.mkdir(parents=True, exist_ok=True) yield {"status": "downloading"} @@ -751,7 +757,7 @@ def _download_zarr( etag = entry.get_etag() assert etag.algorithm is DigestType.md5 stat = entry.stat() - download_gens[entry.path] = _download_file( + download_gens[str(entry)] = _download_file( entry.get_download_file_iter(), op.join(download_path, op.normpath(str(entry))), toplevel_path=toplevel_path, diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 894d63324..f57693bbf 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -272,9 +272,6 @@ def test_download_metadata404(text_dandiset, tmp_path): ] -@pytest.mark.xfail( - reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True -) def test_download_zarr(tmp_path, zarr_dandiset): download(zarr_dandiset.dandiset.version_api_url, tmp_path) assert_dirtrees_eq( @@ -283,9 +280,7 @@ def test_download_zarr(tmp_path, zarr_dandiset): ) -@pytest.mark.xfail( - reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True -) +@pytest.mark.xfail(reason="Not implemented yet", strict=True) def test_download_different_zarr(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() @@ -299,9 +294,6 @@ def test_download_different_zarr(tmp_path, zarr_dandiset): ) -@pytest.mark.xfail( - reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True -) def test_download_zarr_to_nonzarr_path(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index f8345ff2c..55a98f0a2 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -225,12 +225,9 @@ def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): assert asset.path == "sample.zarr" -@pytest.mark.xfail( - reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True -) def test_upload_different_zarr(tmp_path, zarr_dandiset): rmtree(zarr_dandiset.dspath / "sample.zarr") - zarr.save(tmp_path / "sample.zarr", np.eye(5)) + zarr.save(zarr_dandiset.dspath / "sample.zarr", np.eye(5)) zarr_dandiset.upload() download(zarr_dandiset.dandiset.version_api_url, tmp_path) assert_dirtrees_eq( @@ -255,9 +252,6 @@ def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): ).read_text() == "This is not a Zarr.\n" -@pytest.mark.xfail( - reason="Zarr download not implemented yet", raises=NotImplementedError, strict=True -) def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) From a3d34791e4cc84f6dcb81e0f090cf20efebdb47e Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 08:40:03 -0500 Subject: [PATCH 43/56] Catch an error case --- dandi/files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dandi/files.py b/dandi/files.py index 4bd13d90f..475f1813a 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -937,6 +937,8 @@ def dandi_file( filepath = Path(filepath) if dandiset_path is not None: path = filepath.relative_to(dandiset_path).as_posix() + if path == ".": + raise ValueError("Dandi file path cannot equal Dandiset path") else: path = filepath.name if filepath.is_dir(): From 4fa239533f94f3c6a61fa49a2fa5c63dc67625db Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 09:55:35 -0500 Subject: [PATCH 44/56] Delete extra files in a local Zarr after downloading --- dandi/download.py | 68 ++++++++++++++++++++++++++++++------ dandi/files.py | 2 ++ dandi/tests/test_download.py | 27 ++++++++++++-- dandi/tests/test_helpers.py | 23 ++++++++++++ dandi/tests/test_upload.py | 3 +- dandi/utils.py | 20 ----------- 6 files changed, 109 insertions(+), 34 deletions(-) create mode 100644 dandi/tests/test_helpers.py diff --git a/dandi/download.py b/dandi/download.py index 888418cc5..ae56755fc 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1,4 +1,4 @@ -from collections import Counter +from collections import Counter, deque from dataclasses import dataclass, field from enum import Enum import hashlib @@ -9,6 +9,7 @@ import random from shutil import rmtree import sys +from threading import Lock import time from typing import Dict, Iterator, Optional, Tuple @@ -206,6 +207,7 @@ def download_generator( if not get_assets: return + lock = Lock() for asset in assets: path = asset.path.lstrip("/") # make into relative path path = op.normpath(path) @@ -261,6 +263,7 @@ def download_generator( mtime=mtime, existing=existing, digests=digests, + lock=lock, ) else: @@ -276,6 +279,7 @@ def download_generator( toplevel_path=output_path, existing=existing, jobs=jobs_per_zarr, + lock=lock, ) if yield_generator_for_fields: @@ -445,6 +449,7 @@ def _download_file( downloader, path, toplevel_path, + lock, size=None, mtime=None, existing="error", @@ -553,13 +558,14 @@ def _download_file( yield {"size": size} destdir = Path(op.dirname(path)) - for p in (destdir, *destdir.parents): - if p.is_file(): - p.unlink() - break - elif p.is_dir(): - break - destdir.mkdir(parents=True, exist_ok=True) + with lock: + for p in (destdir, *destdir.parents): + if p.is_file(): + p.unlink() + break + elif p.is_dir(): + break + destdir.mkdir(parents=True, exist_ok=True) yield {"status": "downloading"} @@ -750,10 +756,12 @@ def _download_zarr( download_path: str, toplevel_path: str, existing: str, + lock: Lock, jobs: Optional[int] = None, ) -> Iterator[dict]: download_gens = {} - for entry in asset.iterfiles(): + entries = list(asset.iterfiles()) + for entry in entries: etag = entry.get_etag() assert etag.algorithm is DigestType.md5 stat = entry.stat() @@ -765,8 +773,11 @@ def _download_zarr( mtime=stat.modified, existing=existing, digests={"md5": etag.value}, + lock=lock, ) + pc = ProgressCombiner(zarr_size=asset.size, file_qty=len(download_gens)) + final_out: Optional[dict] = None with interleave( [pairing(p, gen) for p, gen in download_gens.items()], onerror=FINISH_CURRENT, @@ -775,12 +786,47 @@ def _download_zarr( for path, status in it: for out in pc.feed(path, status): if out.get("status") == "done": - break + final_out = out else: yield out + if final_out is not None: + break else: return - # TODO: Delete local files not in remote Zarr + + yield {"status": "deleting extra files"} + remote_paths = set(map(str, entries)) + zarr_basepath = Path(download_path) + dirs = deque([zarr_basepath]) + empty_dirs = deque() + while dirs: + d = dirs.popleft() + is_empty = True + for p in list(d.iterdir()): + if ( + p.is_file() + and p.relative_to(zarr_basepath).as_posix() not in remote_paths + ): + try: + p.unlink() + except OSError: + is_empty = False + elif p.is_dir(): + dirs.append(p) + else: + is_empty = False + if is_empty and d != zarr_basepath: + empty_dirs.append(d) + while empty_dirs: + d = empty_dirs.popleft() + try: + d.rmdir() + except OSError: + pass + else: + if d.parent != zarr_basepath and not any(d.parent.iterdir()): + empty_dirs.append(d.parent) + yield {"status": "done"} diff --git a/dandi/files.py b/dandi/files.py index 475f1813a..443f7df88 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -896,6 +896,8 @@ def find_dandi_files( if p.is_dir(): if p.is_symlink(): lgr.warning("%s: Ignoring unsupported symbolic link to directory", p) + elif p == Path(dandiset_path): + path_queue.extend(p.iterdir()) elif any(p.iterdir()): try: df = dandi_file(p, dandiset_path) diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index f57693bbf..d77f98905 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -11,11 +11,12 @@ import zarr from .skip import mark +from .test_helpers import assert_dirtrees_eq from ..consts import DRAFT, dandiset_metadata_file from ..dandiarchive import DandisetURL from ..download import ProgressCombiner, download, download_generator from ..upload import upload -from ..utils import assert_dirtrees_eq, list_paths +from ..utils import list_paths # both urls point to 000027 (lean test dataset), and both draft and "released" @@ -280,7 +281,6 @@ def test_download_zarr(tmp_path, zarr_dandiset): ) -@pytest.mark.xfail(reason="Not implemented yet", strict=True) def test_download_different_zarr(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() @@ -294,6 +294,29 @@ def test_download_different_zarr(tmp_path, zarr_dandiset): ) +def test_download_different_zarr_delete_dir(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(dspath / "sample.zarr", np.eye(5)) + assert not any(p.is_dir() for p in (dspath / "sample.zarr").iterdir()) + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + ) + dd = tmp_path / "download" / dandiset_id + dd.mkdir(parents=True, exist_ok=True) + zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + assert any(p.is_dir() for p in (dd / "sample.zarr").iterdir()) + download(d.version_api_url, tmp_path / "download", existing="overwrite-different") + assert_dirtrees_eq(dspath / "sample.zarr", dd / "sample.zarr") + + def test_download_zarr_to_nonzarr_path(tmp_path, zarr_dandiset): dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() diff --git a/dandi/tests/test_helpers.py b/dandi/tests/test_helpers.py new file mode 100644 index 000000000..0e5729c06 --- /dev/null +++ b/dandi/tests/test_helpers.py @@ -0,0 +1,23 @@ +from operator import attrgetter +from pathlib import Path + + +# This needs to be in a file named "test_*.py" so that pytest performs its +# assertion rewriting on it. +def assert_dirtrees_eq(tree1: Path, tree2: Path) -> None: + """Assert that the file trees at the given paths are equal""" + assert sorted(map(attrgetter("name"), tree1.iterdir())) == sorted( + map(attrgetter("name"), tree2.iterdir()) + ) + for p1 in tree1.iterdir(): + p2 = tree2 / p1.name + assert p1.is_dir() == p2.is_dir() + if p1.is_dir(): + assert_dirtrees_eq(p1, p2) + # TODO: Considering using the identify library to test for binary-ness. + # (We can't use mimetypes, as .json maps to application/json instead of + # text/json.) + elif p1.suffix in {".txt", ".py", ".json"}: + assert p1.read_text() == p2.read_text() + else: + assert p1.read_bytes() == p2.read_bytes() diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 55a98f0a2..afc9eb9ba 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -7,6 +7,7 @@ import pytest import zarr +from .test_helpers import assert_dirtrees_eq from ..consts import DRAFT, ZARR_MIME_TYPE, dandiset_metadata_file from ..dandiapi import RemoteBlobAsset, RemoteZarrAsset from ..download import download @@ -14,7 +15,7 @@ from ..files import LocalFileAsset from ..pynwb_utils import make_nwb_file from ..upload import upload -from ..utils import assert_dirtrees_eq, list_paths +from ..utils import list_paths def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tmp_path): diff --git a/dandi/utils.py b/dandi/utils.py index b318c519c..9f25469ef 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -10,7 +10,6 @@ import io import itertools from mimetypes import guess_type -from operator import attrgetter import os import os.path as op from pathlib import Path @@ -728,22 +727,3 @@ def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]: else: return yield xs - - -def assert_dirtrees_eq(tree1: Path, tree2: Path) -> None: - """Assert that the file trees at the given paths are equal""" - assert sorted(map(attrgetter("name"), tree1.iterdir())) == sorted( - map(attrgetter("name"), tree2.iterdir()) - ) - for p1 in tree1.iterdir(): - p2 = tree2 / p1.name - assert p1.is_dir() == p2.is_dir() - if p1.is_dir(): - assert_dirtrees_eq(p1, p2) - # TODO: Considering using the identify library to test for binary-ness. - # (We can't use mimetypes, as .json maps to application/json instead of - # text/json.) - elif p1.suffix in {".txt", ".py", ".json"}: - assert p1.read_text() == p2.read_text() - else: - assert p1.read_bytes() == p2.read_bytes() From 8072d5d4e94285868a675a151a27232346bdf090 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 10:23:31 -0500 Subject: [PATCH 45/56] Checksum Zarrs after downloading --- dandi/download.py | 31 +++++++++++++++++++++++++++++-- dandi/support/digests.py | 20 ++++++++++++++++---- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index ae56755fc..61cffed60 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1,6 +1,7 @@ from collections import Counter, deque from dataclasses import dataclass, field from enum import Enum +from functools import partial import hashlib import json import os @@ -11,7 +12,7 @@ import sys from threading import Lock import time -from typing import Dict, Iterator, Optional, Tuple +from typing import Any, Callable, Dict, Iterator, Optional, Tuple from dandischema.models import DigestType import humanize @@ -25,7 +26,7 @@ from .dandiset import Dandiset from .exceptions import NotFoundError from .files import DandisetMetadataFile, find_dandi_files -from .support.digests import get_digest +from .support.digests import get_digest, get_zarr_checksum from .support.pyout import naturalsize from .utils import ( abbrev_prompt, @@ -454,6 +455,7 @@ def _download_file( mtime=None, existing="error", digests=None, + digest_callback: Optional[Callable[[str, str], Any]] = None, ): """ Common logic for downloading a single file. @@ -646,6 +648,8 @@ def _download_file( if downloaded_digest and not resuming: downloaded_digest = downloaded_digest.hexdigest() # we care only about hex + if digest_callback is not None: + digest_callback(algo, downloaded_digest) if digest != downloaded_digest: msg = f"{algo}: downloaded {downloaded_digest} != {digest}" yield {"checksum": "differs", "status": "error", "message": msg} @@ -761,6 +765,12 @@ def _download_zarr( ) -> Iterator[dict]: download_gens = {} entries = list(asset.iterfiles()) + digests = {} + + def digest_callback(path: str, algoname: str, d: str) -> None: + if algoname == "md5": + digests[path] = d + for entry in entries: etag = entry.get_etag() assert etag.algorithm is DigestType.md5 @@ -774,6 +784,7 @@ def _download_zarr( existing=existing, digests={"md5": etag.value}, lock=lock, + digest_callback=partial(digest_callback, str(entry)), ) pc = ProgressCombiner(zarr_size=asset.size, file_qty=len(download_gens)) @@ -827,6 +838,22 @@ def _download_zarr( if d.parent != zarr_basepath and not any(d.parent.iterdir()): empty_dirs.append(d.parent) + if "skipped" not in final_out["message"]: + zarr_checksum = asset.get_etag().value + local_checksum = get_zarr_checksum(zarr_basepath, known=digests) + if zarr_checksum != local_checksum: + msg = f"Zarr checksum: downloaded {local_checksum} != {zarr_checksum}" + yield {"checksum": "differs", "status": "error", "message": msg} + lgr.debug("%s is different: %s.", zarr_basepath, msg) + return + else: + yield {"checksum": "ok"} + lgr.debug( + "Verified that %s has correct Zarr checksum %s", + zarr_basepath, + zarr_checksum, + ) + yield {"status": "done"} diff --git a/dandi/support/digests.py b/dandi/support/digests.py index e643a82af..504ad755d 100644 --- a/dandi/support/digests.py +++ b/dandi/support/digests.py @@ -12,7 +12,7 @@ import hashlib import logging from pathlib import Path -from typing import Optional +from typing import Dict, Optional from dandischema.digests.dandietag import DandiETag from dandischema.digests.zarr import get_checksum @@ -92,15 +92,27 @@ def get_dandietag(filepath) -> DandiETag: return DandiETag.from_file(filepath) -def get_zarr_checksum(dirpath: Path, basepath: Optional[Path] = None) -> str: +def get_zarr_checksum( + dirpath: Path, + basepath: Optional[Path] = None, + known: Optional[Dict[str, str]] = None, +) -> str: if basepath is None: basepath = dirpath dirs = {} files = {} + if known is None: + known = {} for p in dirpath.iterdir(): path = p.relative_to(basepath).as_posix() if not p.is_dir(): - files[path] = get_digest(p, "md5") + try: + files[path] = known[path] + except KeyError: + files[path] = get_digest(p, "md5") elif any(p.iterdir()): - dirs[path] = get_zarr_checksum(p, basepath) + try: + dirs[path] = known[path] + except KeyError: + dirs[path] = get_zarr_checksum(p, basepath) return get_checksum(files, dirs) From f85b39240cf27f09c837ea17040cde15211a4355 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 10:34:07 -0500 Subject: [PATCH 46/56] Make the BaseRemoteAsset download methods error if called on a Zarr --- dandi/dandiapi.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 8d0ed3cf6..3601eed9b 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1269,8 +1269,13 @@ def get_download_file_iter( """ Returns a function that when called (optionally with an offset into the asset to start downloading at) returns a generator of chunks of the - asset + asset. + + :raises ValueError: if the asset is not backed by a blob """ + if not self.is_blob(): + raise ValueError("Only blob assets can be downloaded directly") + url = self.base_download_url def downloader(start_at: int = 0) -> Iterator[bytes]: @@ -1295,6 +1300,8 @@ def download( """ Download the asset to ``filepath``. Blocks until the download is complete. + + :raises ValueError: if the asset is not backed by a blob """ downloader = self.get_download_file_iter(chunk_size=chunk_size) with open(filepath, "wb") as fp: From 99ab41ade9787d898634bd381436f962cdadb0d9 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 10:34:42 -0500 Subject: [PATCH 47/56] Add versionadded:: directives --- dandi/dandiapi.py | 36 +++++++++++++++++++++++++++++++----- dandi/files.py | 2 ++ dandi/misctypes.py | 6 +++++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 3601eed9b..acb67e07f 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1209,6 +1209,8 @@ def get_digest( def get_etag(self) -> Digest: """ + .. versionadded:: 0.35.0 + Retrieves the DANDI etag digest of the appropriate type for the asset: a dandi-etag digest for blob resources or a dandi-zarr-checksum for Zarr resources @@ -1309,11 +1311,19 @@ def download( fp.write(chunk) def is_blob(self) -> bool: - """Returns true if the asset's actual data is a blob resource""" + """ + .. versionadded:: 0.35.0 + + Returns true if the asset's actual data is a blob resource + """ return self.get_raw_metadata().get("encodingFormat") != ZARR_MIME_TYPE def is_zarr(self) -> bool: - """Returns true if the asset's actual data is a Zarr resource""" + """ + .. versionadded:: 0.35.0 + + Returns true if the asset's actual data is a Zarr resource + """ return self.get_raw_metadata().get("encodingFormat") == ZARR_MIME_TYPE @@ -1417,7 +1427,11 @@ def delete(self) -> None: class RemoteBlobAsset(RemoteAsset): - """A `RemoteAsset` whose actual data is a blob resource""" + """ + .. versionadded:: 0.35.0 + + A `RemoteAsset` whose actual data is a blob resource + """ #: The ID of the underlying blob resource blob: str @@ -1446,7 +1460,11 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: class RemoteZarrAsset(RemoteAsset): - """A `RemoteAsset` whose actual data is a Zarr resource""" + """ + .. versionadded:: 0.35.0 + + A `RemoteAsset` whose actual data is a Zarr resource + """ #: The ID of the underlying Zarr resource zarr: str @@ -1501,7 +1519,11 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: class ZarrListing(BaseModel): - """Information about a directory within a `RemoteZarrAsset`""" + """ + .. versionadded:: 0.35.0 + + Information about a directory within a `RemoteZarrAsset` + """ #: API URLs for the listings of the directory's subdirectories directories: List[AnyHttpUrl] @@ -1527,6 +1549,8 @@ def filenames(self) -> List[str]: @dataclass class ZarrEntryStat: """ + .. versionadded:: 0.35.0 + Combined size & timestamp information for a file in a `RemoteZarrAsset` """ @@ -1539,6 +1563,8 @@ class ZarrEntryStat: @dataclass class RemoteZarrEntry(BasePath): """ + .. versionadded:: 0.35.0 + A file or directory within a `RemoteZarrAsset`. Implements `~dandi.misctypes.BasePath`. """ diff --git a/dandi/files.py b/dandi/files.py index 443f7df88..b40f3f437 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,4 +1,6 @@ """ +.. versionadded:: 0.35.0 + This module defines functionality for working with local files & directories (as opposed to remote resources on a DANDI Archive server) that are of interest to DANDI. The classes for such files & directories all inherit from diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 980b47083..f9acaeef6 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -1,4 +1,8 @@ -"""Miscellaneous public classes""" +""" +.. versionadded:: 0.35.0 + +Miscellaneous public classes +""" from __future__ import annotations From 483238312bac77f7f48ade8db7c5b9b19c4ff2ec Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 10:50:09 -0500 Subject: [PATCH 48/56] Make `dandi validate` support Zarrs --- dandi/cli/cmd_validate.py | 2 +- dandi/download.py | 4 +- dandi/files.py | 38 +++++---- dandi/tests/test_files.py | 10 ++- dandi/tests/test_validate.py | 33 -------- dandi/validate.py | 151 +++-------------------------------- 6 files changed, 41 insertions(+), 197 deletions(-) delete mode 100644 dandi/tests/test_validate.py diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 21ba094bb..586079eb4 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -43,7 +43,7 @@ def validate(paths, schema=None, devel_debug=False, allow_any_path=False): all_files_errors = {} nfiles = 0 for path, errors in validate_( - paths, + *paths, schema_version=schema, devel_debug=devel_debug, allow_any_path=allow_any_path, diff --git a/dandi/download.py b/dandi/download.py index 61cffed60..883aaa196 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -138,7 +138,9 @@ def download( f"Unexpected URL type {type(parsed_url).__name__}" ) to_delete = [] - for df in find_dandi_files(download_dir, allow_all=True): + for df in find_dandi_files( + download_dir, dandiset_path=download_dir, allow_all=True + ): if isinstance(df, DandisetMetadataFile): continue a_path = op.normpath(op.join(prefix, df.path)) diff --git a/dandi/files.py b/dandi/files.py index b40f3f437..0afb326b8 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -47,7 +47,6 @@ from .pynwb_utils import validate as pynwb_validate from .support.digests import get_dandietag, get_digest, get_zarr_checksum from .utils import chunked, ensure_datetime, pluralize, yaml_load -from .validate import _check_required_fields lgr = get_logger() @@ -862,8 +861,8 @@ def find_dandi_files( :param dandiset_path: The path to the root of the Dandiset in which the paths are located. All paths in ``paths`` must be equal to or subpaths of - ``dandiset_path``. Can only be omitted when ``paths`` is a single - directory, in which case ``dandiset_path`` is set to that directory. + ``dandiset_path``. If `None`, then the Dandiset path for each asset + found is implicitly set to the parent directory. :param allow_all: If true, unrecognized assets and the Dandiset's :file:`dandiset.yaml` file are returned as `GenericAsset` and `DandisetMetadataFile` @@ -874,22 +873,16 @@ def find_dandi_files( (unless ``allow_all`` is true). """ - if dandiset_path is None: - if len(paths) == 1 and os.path.isdir(paths[0]): - dandiset_path = paths[0] - else: - raise ValueError( - "dandiset_path must be set when not traversing a single directory" - ) path_queue = deque() for p in paths: p = Path(p) - try: - p.relative_to(dandiset_path) - except ValueError: - raise ValueError( - "Path {str(p)!r} is not inside Dandiset path {str(dandiset_path)!r}" - ) + if dandiset_path is not None: + try: + p.relative_to(dandiset_path) + except ValueError: + raise ValueError( + "Path {str(p)!r} is not inside Dandiset path {str(dandiset_path)!r}" + ) path_queue.append(p) while path_queue: p = path_queue.popleft() @@ -898,7 +891,7 @@ def find_dandi_files( if p.is_dir(): if p.is_symlink(): lgr.warning("%s: Ignoring unsupported symbolic link to directory", p) - elif p == Path(dandiset_path): + elif dandiset_path is not None and p == Path(dandiset_path): path_queue.extend(p.iterdir()) elif any(p.iterdir()): try: @@ -1026,3 +1019,14 @@ def _upload_zarr_file( with path.open("rb") as fp: storage_session.put(upload_url, data=fp, json_resp=False) return path.stat().st_size + + +def _check_required_fields(d, required): + errors = [] + for f in required: + v = d.get(f, None) + if not v or (isinstance(v, str) and not (v.strip())): + errors += [f"Required field {f!r} has no value"] + if v in ("REQUIRED", "PLACEHOLDER"): + errors += [f"Required field {f!r} has value {v!r}"] + return errors diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index 5efd1b092..6f22f933b 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -40,7 +40,9 @@ def test_find_dandi_files(tmp_path: Path) -> None: (tmp_path / ".ignored.dir").mkdir() (tmp_path / ".ignored.dir" / "ignored.nwb").touch() - files = sorted(find_dandi_files(tmp_path), key=attrgetter("filepath")) + files = sorted( + find_dandi_files(tmp_path, dandiset_path=tmp_path), key=attrgetter("filepath") + ) assert files == [ ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), @@ -53,7 +55,8 @@ def test_find_dandi_files(tmp_path: Path) -> None: ] files = sorted( - find_dandi_files(tmp_path, allow_all=True), key=attrgetter("filepath") + find_dandi_files(tmp_path, dandiset_path=tmp_path, allow_all=True), + key=attrgetter("filepath"), ) assert files == [ GenericAsset(filepath=tmp_path / "bar.txt", path="bar.txt"), @@ -74,7 +77,8 @@ def test_find_dandi_files(tmp_path: Path) -> None: ] files = sorted( - find_dandi_files(tmp_path, include_metadata=True), key=attrgetter("filepath") + find_dandi_files(tmp_path, dandiset_path=tmp_path, include_metadata=True), + key=attrgetter("filepath"), ) assert files == [ DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), diff --git a/dandi/tests/test_validate.py b/dandi/tests/test_validate.py deleted file mode 100644 index 66afdeb07..000000000 --- a/dandi/tests/test_validate.py +++ /dev/null @@ -1,33 +0,0 @@ -from dandischema.models import get_schema_version - -from ..validate import validate_file - - -def test_validate_simple1(simple1_nwb): - # this file should be ok - errors = validate_file(simple1_nwb, schema_version=get_schema_version()) - assert not errors - - -def test_validate_simple2(simple2_nwb): - # this file should be ok - errors = validate_file(simple2_nwb) - assert not errors - - -def test_validate_simple2_new(simple2_nwb): - # this file should be ok - errors = validate_file(simple2_nwb, schema_version=get_schema_version()) - assert not errors - - -def test_validate_bogus(tmp_path): - path = tmp_path / "wannabe.nwb" - path.write_text("not really nwb") - # intended to produce use-case for https://github.com/dandi/dandi-cli/issues/93 - # but it would be tricky, so it is more of a smoke test that - # we do not crash - errors = validate_file(str(path)) - # ATM we would get 2 errors -- since could not be open in two places, - # but that would be too rigid to test. Let's just see that we have expected errors - assert any(e.startswith("Failed to read metadata") for e in errors) diff --git a/dandi/validate.py b/dandi/validate.py index 4bd76dcd5..8f49c6ddb 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -1,27 +1,13 @@ -import os.path as op - -from . import get_logger -from .consts import dandiset_metadata_file -from .metadata import get_metadata -from .misctypes import DUMMY_DIGEST -from .pynwb_utils import validate as pynwb_validate -from .pynwb_utils import validate_cache -from .utils import find_dandi_files, find_files, yaml_load - -lgr = get_logger() - -# TODO -- should come from schema. This is just a simplistic example for now -_required_dandiset_metadata_fields = ["identifier", "name", "description"] -_required_nwb_metadata_fields = ["subject_id"] +from .files import find_dandi_files # TODO: provide our own "errors" records, which would also include warnings etc -def validate(paths, schema_version=None, devel_debug=False, allow_any_path=False): +def validate(*paths, schema_version=None, devel_debug=False, allow_any_path=False): """Validate content Parameters ---------- - paths: str or list of paths + paths: *str Could be individual (.nwb) files or a single dandiset path. Yields @@ -29,129 +15,10 @@ def validate(paths, schema_version=None, devel_debug=False, allow_any_path=False path, errors errors for a path """ - filepaths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths) - for path in filepaths: - errors = validate_file( - path, schema_version=schema_version, devel_debug=devel_debug - ) - yield path, errors - - -def validate_file(filepath, schema_version=None, devel_debug=False): - if op.basename(filepath) == dandiset_metadata_file: - return validate_dandiset_yaml( - filepath, schema_version=None, devel_debug=devel_debug + for df in find_dandi_files(*paths, dandiset_path=None, allow_all=allow_any_path): + yield ( + df.filepath, + df.get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ), ) - else: - return pynwb_validate(filepath, devel_debug=devel_debug) + validate_asset_file( - filepath, schema_version=schema_version, devel_debug=devel_debug - ) - - -@validate_cache.memoize_path -def validate_dandiset_yaml(filepath, schema_version=None, devel_debug=False): - """Validate dandiset.yaml""" - with open(filepath) as f: - meta = yaml_load(f, typ="safe") - if schema_version is None: - schema_version = meta.get("schemaVersion") - if schema_version is None: - return _check_required_fields(meta, _required_dandiset_metadata_fields) - else: - from dandischema.models import Dandiset as DandisetMeta - from dandischema.models import get_schema_version - from pydantic import ValidationError - - current_version = get_schema_version() - if schema_version != current_version: - raise ValueError( - f"Unsupported schema version: {schema_version}; expected {current_version}" - ) - try: - DandisetMeta(**meta) - except ValidationError as e: - if devel_debug: - raise - lgr.warning( - "Validation error for %s: %s", filepath, e, extra={"validating": True} - ) - return [str(e)] - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Unexpected validation error for %s: %s", - filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to initialize Dandiset meta: {e}"] - return [] - - -@validate_cache.memoize_path -def validate_asset_file(filepath, schema_version=None, devel_debug=False): - """Provide validation of asset file regarding requirements we impose""" - if schema_version is not None: - from dandischema.models import BareAsset, get_schema_version - from pydantic import ValidationError - - from .metadata import get_asset_metadata - - current_version = get_schema_version() - if schema_version != current_version: - raise ValueError( - f"Unsupported schema version: {schema_version}; expected {current_version}" - ) - try: - asset = get_asset_metadata( - filepath, - relpath="dummy", - digest=DUMMY_DIGEST, - allow_any_path=True, - ) - BareAsset(**asset.dict()) - except ValidationError as e: - if devel_debug: - raise - lgr.warning( - "Validation error for %s: %s", filepath, e, extra={"validating": True} - ) - return [str(e)] - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Unexpected validation error for %s: %s", - filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to read metadata: {e}"] - return [] - else: - # make sure that we have some basic metadata fields we require - try: - meta = get_metadata(filepath) - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Failed to read metadata in %s: %s", - filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to read metadata: {e}"] - return _check_required_fields(meta, _required_nwb_metadata_fields) - - -def _check_required_fields(d, required): - errors = [] - for f in required: - v = d.get(f, None) - if not v or (isinstance(v, str) and not (v.strip())): - errors += [f"Required field {f!r} has no value"] - if v in ("REQUIRED", "PLACEHOLDER"): - errors += [f"Required field {f!r} has value {v!r}"] - return errors From 3c6008629a6b3a5a784b8630a1a0d7fdaec58951 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 10:54:30 -0500 Subject: [PATCH 49/56] Update `dandi download` and `dandi validate` docs --- dandi/cli/cmd_upload.py | 6 +++--- dandi/cli/cmd_validate.py | 2 +- docs/source/cmdline/upload.rst | 9 +++++---- docs/source/cmdline/validate.rst | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index ad200c8a7..ada1f850c 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -84,9 +84,9 @@ def upload( Local Dandiset should pass validation. For that, the assets should first be organized using the `dandi organize` command. - By default all .nwb files in the Dandiset (excluding directories starting - with a period) will be considered for the upload. You can point to - specific files you would like to validate and have uploaded. + By default all .nwb, .zarr, and .ngff assets in the Dandiset (ignoring + directories starting with a period) will be considered for the upload. You + can point to specific files you would like to validate and have uploaded. """ from ..upload import upload diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 586079eb4..73027f0b6 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -18,7 +18,7 @@ @devel_debug_option() @map_to_click_exceptions def validate(paths, schema=None, devel_debug=False, allow_any_path=False): - """Validate files for NWB (and DANDI) compliance. + """Validate files for NWB and DANDI compliance. Exits with non-0 exit code if any file is not compliant. """ diff --git a/docs/source/cmdline/upload.rst b/docs/source/cmdline/upload.rst index d3bf14bdc..5367aa2c2 100644 --- a/docs/source/cmdline/upload.rst +++ b/docs/source/cmdline/upload.rst @@ -13,9 +13,10 @@ a :file:`dandiset.yaml` file must exist in the local :option:`--dandiset-path`. Local Dandisets should pass validation. For that, the assets should first be organized using the :ref:`dandi_organize` command. -By default, all :file:`*.nwb` files in the Dandiset (excluding directories -starting with a period) will be considered for the upload. You can point to -specific files you would like to validate and have uploaded. +By default, all :file:`*.nwb`, :file:`*.zarr`, and :file:`*.ngff` assets in the +Dandiset (ignoring directories starting with a period) will be considered for +the upload. You can point to specific files you would like to validate and +have uploaded. Options ------- @@ -66,7 +67,7 @@ set to a nonempty value. .. option:: --allow-any-path - Upload all file types, not just :file:`*.nwb`'s + Upload all file types, not just NWBs and Zarrs .. option:: --devel-debug diff --git a/docs/source/cmdline/validate.rst b/docs/source/cmdline/validate.rst index d35cd7d32..bc03476b3 100644 --- a/docs/source/cmdline/validate.rst +++ b/docs/source/cmdline/validate.rst @@ -5,7 +5,7 @@ dandi [] validate [ ...] -Validate files for NWB (and DANDI) compliance. +Validate files for NWB and DANDI compliance. Exits with non-zero exit code if any file is not compliant. @@ -19,7 +19,7 @@ set to a nonempty value. .. option:: --allow-any-path - Validate all file types, not just :file:`*.nwb`'s + Validate all file types, not just NWBs and Zarrs .. option:: --devel-debug From c12f72b306098519d11ffdeab25e3a1e635addd1 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Thu, 20 Jan 2022 11:05:35 -0500 Subject: [PATCH 50/56] `dandi download --sync` with Zarrs --- dandi/download.py | 5 ++++- dandi/tests/test_download.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/dandi/download.py b/dandi/download.py index 883aaa196..7382907b8 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -161,7 +161,10 @@ def download( print(p) elif opt == "yes": for p in to_delete: - os.unlink(p) + if p.is_dir(): + rmtree(p) + else: + p.unlink() break else: break diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index d77f98905..c3f2bf05a 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -229,6 +229,21 @@ def test_download_sync_list(capsys, mocker, text_dandiset, tmp_path): assert capsys.readouterr().out.splitlines()[-1] == str(dspath / "file.txt") +def test_download_sync_zarr(mocker, zarr_dandiset, tmp_path): + zarr_dandiset.dandiset.get_asset_by_path("sample.zarr").delete() + dspath = tmp_path / zarr_dandiset.dandiset_id + os.rename(zarr_dandiset.dspath, dspath) + confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes") + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + existing="overwrite", + sync=True, + ) + confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") + assert not (dspath / "sample.zarr").exists() + + @responses.activate def test_download_no_blobDateModified(text_dandiset, tmp_path): # Regression test for #806 From 0da8d932cdb297f51cea0740415d5880b7cdec0a Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 21 Jan 2022 14:23:44 -0500 Subject: [PATCH 51/56] =?UTF-8?q?`get=5Fdigest()`=20=E2=86=92=20`get=5Fraw?= =?UTF-8?q?=5Fdigest()`;=20`get=5Fetag()`=20=E2=86=92=20`get=5Fdigest()`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dandi/dandiapi.py | 32 +++++++++++++++++++++++--------- dandi/download.py | 4 ++-- dandi/files.py | 22 +++++++++++----------- dandi/tests/test_dandiapi.py | 9 +++++---- dandi/upload.py | 2 +- 5 files changed, 42 insertions(+), 27 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index acb67e07f..73b58f418 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1191,15 +1191,27 @@ def get_raw_metadata(self) -> Dict[str, Any]: else: raise - def get_digest( - self, digest_type: Union[str, models.DigestType] = models.DigestType.dandi_etag + def get_raw_digest( + self, + digest_type: Union[str, models.DigestType, None] = models.DigestType.dandi_etag, ) -> str: """ Retrieves the value of the given type of digest from the asset's metadata. Raises `NotFoundError` if there is no entry for the given digest type. + + If no digest type is specified, the same type as used by `get_digest()` + is returned. + + .. versionchanged:: 0.35.0 + Renamed from ``get_digest()`` to ``get_raw_digest()`` """ - if isinstance(digest_type, models.DigestType): + if digest_type is None: + if self.is_zarr(): + digest_type = models.DigestType.dandi_zarr_checksum.value + else: + digest_type = models.DigestType.dandi_etag.value + elif isinstance(digest_type, models.DigestType): digest_type = digest_type.value metadata = self.get_raw_metadata() try: @@ -1207,9 +1219,11 @@ def get_digest( except KeyError: raise NotFoundError(f"No {digest_type} digest found in metadata") - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """ .. versionadded:: 0.35.0 + Replaces the previous version of ``get_digest()``, now renamed to + `get_raw_digest()` Retrieves the DANDI etag digest of the appropriate type for the asset: a dandi-etag digest for blob resources or a dandi-zarr-checksum for @@ -1219,7 +1233,7 @@ def get_etag(self) -> Digest: algorithm = models.DigestType.dandi_zarr_checksum else: algorithm = models.DigestType.dandi_etag - return Digest(algorithm=algorithm, value=self.get_digest(algorithm)) + return Digest(algorithm=algorithm, value=self.get_raw_digest(algorithm)) def get_content_url( self, @@ -1650,11 +1664,11 @@ def iterdir(self) -> Iterator["RemoteZarrEntry"]: for name in listing.filenames: yield self._get_subpath(name, isdir=False) - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """ - Retrieve the etag digest for the entry. If the entry is a directory, - the algorithm will be the Dandi Zarr checksum algorithm; if it is a - file, it will be MD5. + Retrieve the DANDI etag digest for the entry. If the entry is a + directory, the algorithm will be the Dandi Zarr checksum algorithm; if + it is a file, it will be MD5. :raises NotFoundError: if the path does not exist in the Zarr asset """ diff --git a/dandi/download.py b/dandi/download.py index 7382907b8..9a5f3908e 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -777,7 +777,7 @@ def digest_callback(path: str, algoname: str, d: str) -> None: digests[path] = d for entry in entries: - etag = entry.get_etag() + etag = entry.get_digest() assert etag.algorithm is DigestType.md5 stat = entry.stat() download_gens[str(entry)] = _download_file( @@ -844,7 +844,7 @@ def digest_callback(path: str, algoname: str, d: str) -> None: empty_dirs.append(d.parent) if "skipped" not in final_out["message"]: - zarr_checksum = asset.get_etag().value + zarr_checksum = asset.get_digest().value local_checksum = get_zarr_checksum(zarr_basepath, known=digests) if zarr_checksum != local_checksum: msg = f"Zarr checksum: downloaded {local_checksum} != {zarr_checksum}" diff --git a/dandi/files.py b/dandi/files.py index 0afb326b8..7fffbfb92 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -162,10 +162,10 @@ class LocalAsset(DandiFile): path: str @abstractmethod - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """ - Calculate an etag digest for the asset using the appropriate algorithm - for its type + Calculate a DANDI etag digest for the asset using the appropriate + algorithm for its type """ ... @@ -286,7 +286,7 @@ class LocalFileAsset(LocalAsset): an asset of a Dandiset """ - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """Calculate a dandi-etag digest for the asset""" value = get_digest(self.filepath, digest="dandi-etag") return Digest.dandi_etag(value) @@ -593,11 +593,11 @@ def iterdir(self) -> Iterator[LocalZarrEntry]: continue yield self._get_subpath(p.name) - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """ - Calculate the etag digest for the entry. If the entry is a directory, - the algorithm will be the Dandi Zarr checksum algorithm; if it is a - file, it will be MD5. + Calculate the DANDI etag digest for the entry. If the entry is a + directory, the algorithm will be the Dandi Zarr checksum algorithm; if + it is a file, it will be MD5. """ if self.is_dir(): return Digest.dandi_zarr( @@ -669,7 +669,7 @@ def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: files.extend(st.files) else: size += p.size - file_md5s[str(p)] = p.get_etag().value + file_md5s[str(p)] = p.get_digest().value files.append(p) return ZarrStat( size=size, @@ -679,7 +679,7 @@ def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: return dirstat(self.filetree) - def get_etag(self) -> Digest: + def get_digest(self) -> Digest: """Calculate a dandi-zarr-checksum digest for the asset""" return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) @@ -791,7 +791,7 @@ def iter_upload( chunked(stat.files, ZARR_UPLOAD_BATCH_SIZE), start=1 ): upload_body = [ - {"path": str(p), "etag": p.get_etag().value} for p in filebatch + {"path": str(p), "etag": p.get_digest().value} for p in filebatch ] lgr.debug( "%s: Uploading Zarr file batch #%d (%s)", diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index f7ffce2fb..1f66a4d5d 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -539,18 +539,19 @@ def test_set_dandiset_metadata(text_dandiset): [ (DigestType.dandi_etag, r"[0-9a-f]{32}-\d{1,5}"), ("dandi:dandi-etag", r"[0-9a-f]{32}-\d{1,5}"), + (None, r"[0-9a-f]{32}-\d{1,5}"), ], ) -def test_get_digest(digest_type, digest_regex, text_dandiset): +def test_get_raw_digest(digest_type, digest_regex, text_dandiset): asset = text_dandiset.dandiset.get_asset_by_path("file.txt") - d = asset.get_digest(digest_type) + d = asset.get_raw_digest(digest_type) assert re.fullmatch(digest_regex, d) -def test_get_digest_nonexistent(text_dandiset): +def test_get_raw_digest_nonexistent(text_dandiset): asset = text_dandiset.dandiset.get_asset_by_path("file.txt") with pytest.raises(NotFoundError): - asset.get_digest("md5") + asset.get_raw_digest("md5") def test_refresh(text_dandiset): diff --git a/dandi/upload.py b/dandi/upload.py index da12929a1..580146949 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -160,7 +160,7 @@ def process_path(dfile): # yield {"status": "digesting"} try: - file_etag = dfile.get_etag() + file_etag = dfile.get_digest() except Exception as exc: yield skip_file("failed to compute digest: %s" % str(exc)) return From c0a32efff54c34e2b79c2d551def3a396a2e1fb6 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 21 Jan 2022 14:45:04 -0500 Subject: [PATCH 52/56] Replace `is_blob()` and `is_zarr()` with an `asset_type` property & enum --- dandi/dandiapi.py | 63 ++++++++++++++++++++++---------------- dandi/download.py | 8 +++-- dandi/tests/test_files.py | 5 ++- dandi/tests/test_upload.py | 17 ++++------ 4 files changed, 50 insertions(+), 43 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 73b58f418..40fdc327b 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -48,6 +48,7 @@ from collections import deque from dataclasses import dataclass, field, replace from datetime import datetime +from enum import Enum import json import os.path from pathlib import Path, PurePosixPath @@ -99,6 +100,18 @@ T = TypeVar("T") +class AssetType(Enum): + """ + .. versionadded:: 0.35.0 + + An enum for the different kinds of resources that an asset's actual data + can be + """ + + BLOB = 1 + ZARR = 2 + + # Following class is loosely based on GirderClient, with authentication etc # being stripped. # TODO: add copyright/license info @@ -1207,7 +1220,7 @@ def get_raw_digest( Renamed from ``get_digest()`` to ``get_raw_digest()`` """ if digest_type is None: - if self.is_zarr(): + if self.asset_type is AssetType.ZARR: digest_type = models.DigestType.dandi_zarr_checksum.value else: digest_type = models.DigestType.dandi_etag.value @@ -1229,7 +1242,7 @@ def get_digest(self) -> Digest: a dandi-etag digest for blob resources or a dandi-zarr-checksum for Zarr resources """ - if self.is_zarr(): + if self.asset_type is AssetType.ZARR: algorithm = models.DigestType.dandi_zarr_checksum else: algorithm = models.DigestType.dandi_etag @@ -1289,7 +1302,7 @@ def get_download_file_iter( :raises ValueError: if the asset is not backed by a blob """ - if not self.is_blob(): + if self.asset_type is not AssetType.BLOB: raise ValueError("Only blob assets can be downloaded directly") url = self.base_download_url @@ -1324,21 +1337,17 @@ def download( for chunk in downloader(): fp.write(chunk) - def is_blob(self) -> bool: - """ - .. versionadded:: 0.35.0 - - Returns true if the asset's actual data is a blob resource - """ - return self.get_raw_metadata().get("encodingFormat") != ZARR_MIME_TYPE - - def is_zarr(self) -> bool: + @property + def asset_type(self) -> AssetType: """ .. versionadded:: 0.35.0 - Returns true if the asset's actual data is a Zarr resource + The type of the asset's underlying data """ - return self.get_raw_metadata().get("encodingFormat") == ZARR_MIME_TYPE + if self.get_raw_metadata().get("encodingFormat") == ZARR_MIME_TYPE: + return AssetType.ZARR + else: + return AssetType.BLOB class RemoteAsset(ABC, BaseRemoteAsset): @@ -1450,13 +1459,14 @@ class RemoteBlobAsset(RemoteAsset): #: The ID of the underlying blob resource blob: str - def is_blob(self) -> bool: - """Returns true if the asset's actual data is a blob resource""" - return True + @property + def asset_type(self) -> AssetType: + """ + .. versionadded:: 0.35.0 - def is_zarr(self) -> bool: - """Returns true if the asset's actual data is a Zarr resource""" - return False + The type of the asset's underlying data + """ + return AssetType.BLOB def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ @@ -1483,13 +1493,14 @@ class RemoteZarrAsset(RemoteAsset): #: The ID of the underlying Zarr resource zarr: str - def is_blob(self) -> bool: - """Returns true if the asset's actual data is a blob resource""" - return False + @property + def asset_type(self) -> AssetType: + """ + .. versionadded:: 0.35.0 - def is_zarr(self) -> bool: - """Returns true if the asset's actual data is a Zarr resource""" - return True + The type of the asset's underlying data + """ + return AssetType.ZARR def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ diff --git a/dandi/download.py b/dandi/download.py index 9a5f3908e..fd8c68996 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -21,7 +21,7 @@ from . import get_logger from .consts import RETRY_STATUSES, dandiset_metadata_file -from .dandiapi import RemoteZarrAsset +from .dandiapi import AssetType, RemoteZarrAsset from .dandiarchive import DandisetURL, MultiAssetURL, SingleAssetURL, parse_dandi_url from .dandiset import Dandiset from .exceptions import NotFoundError @@ -238,7 +238,7 @@ def download_generator( continue d = metadata.get("digest", {}) - if asset.is_blob(): + if asset.asset_type is AssetType.BLOB: if "dandi:dandi-etag" in d: digests = {"dandi-etag": d["dandi:dandi-etag"]} else: @@ -273,7 +273,9 @@ def download_generator( ) else: - assert asset.is_zarr(), f"Asset {asset.path} is neither blob nor Zarr" + assert ( + asset.asset_type is AssetType.ZARR + ), f"Asset {asset.path} is neither blob nor Zarr" if not isinstance(asset, RemoteZarrAsset): raise NotImplementedError( "Downloading a Zarr asset identified by a URL without" diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py index 6f22f933b..524f91351 100644 --- a/dandi/tests/test_files.py +++ b/dandi/tests/test_files.py @@ -7,7 +7,7 @@ from .. import get_logger from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file -from ..dandiapi import RemoteZarrAsset +from ..dandiapi import AssetType, RemoteZarrAsset from ..files import ( DandisetMetadataFile, GenericAsset, @@ -135,8 +135,7 @@ def test_upload_zarr(local_dandi_api, tmp_path): d = local_dandi_api.client.create_dandiset("Zarr Dandiset", {}) asset = zf.upload(d, {"description": "A test Zarr"}) assert isinstance(asset, RemoteZarrAsset) - assert asset.is_zarr() - assert not asset.is_blob() + assert asset.asset_type is AssetType.ZARR assert asset.path == "example.zarr" md = asset.get_raw_metadata() assert md["encodingFormat"] == ZARR_MIME_TYPE diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index afc9eb9ba..325368951 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -9,7 +9,7 @@ from .test_helpers import assert_dirtrees_eq from ..consts import DRAFT, ZARR_MIME_TYPE, dandiset_metadata_file -from ..dandiapi import RemoteBlobAsset, RemoteZarrAsset +from ..dandiapi import AssetType, RemoteBlobAsset, RemoteZarrAsset from ..download import download from ..exceptions import NotFoundError from ..files import LocalFileAsset @@ -221,8 +221,7 @@ def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): ) (asset,) = d.get_assets() assert isinstance(asset, RemoteZarrAsset) - assert asset.is_zarr() - assert not asset.is_blob() + assert asset.asset_type is AssetType.ZARR assert asset.path == "sample.zarr" @@ -243,8 +242,7 @@ def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): zarr_dandiset.upload(allow_any_path=True) (asset,) = zarr_dandiset.dandiset.get_assets() assert isinstance(asset, RemoteBlobAsset) - assert asset.is_blob() - assert not asset.is_zarr() + assert asset.asset_type is AssetType.BLOB assert asset.path == "sample.zarr" assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" download(zarr_dandiset.dandiset.version_api_url, tmp_path) @@ -271,8 +269,7 @@ def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): (asset,) = d.get_assets() assert isinstance(asset, RemoteBlobAsset) - assert asset.is_blob() - assert not asset.is_zarr() + assert asset.asset_type is AssetType.BLOB assert asset.path == "sample.zarr" assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" @@ -288,8 +285,7 @@ def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): (asset,) = d.get_assets() assert isinstance(asset, RemoteZarrAsset) - assert asset.is_zarr() - assert not asset.is_blob() + assert asset.asset_type is AssetType.ZARR assert asset.path == "sample.zarr" assert asset.get_raw_metadata()["encodingFormat"] == ZARR_MIME_TYPE @@ -316,7 +312,6 @@ def test_upload_zarr_with_empty_dir(local_dandi_api, monkeypatch, tmp_path): ) (asset,) = d.get_assets() assert isinstance(asset, RemoteZarrAsset) - assert asset.is_zarr() - assert not asset.is_blob() + assert asset.asset_type is AssetType.ZARR assert asset.path == "sample.zarr" assert not (asset.filetree / "empty").exists() From a60d5d492050e3e35282743a2732633af70217f4 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 24 Jan 2022 08:25:51 -0500 Subject: [PATCH 53/56] Type-annotate a utility function --- dandi/files.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dandi/files.py b/dandi/files.py index 7fffbfb92..91f0125ab 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1021,11 +1021,11 @@ def _upload_zarr_file( return path.stat().st_size -def _check_required_fields(d, required): - errors = [] +def _check_required_fields(d: dict, required: List[str]) -> List[str]: + errors: List[str] = [] for f in required: v = d.get(f, None) - if not v or (isinstance(v, str) and not (v.strip())): + if not v or (isinstance(v, str) and not v.strip()): errors += [f"Required field {f!r} has no value"] if v in ("REQUIRED", "PLACEHOLDER"): errors += [f"Required field {f!r} has value {v!r}"] From 0caa08f7ed03031f0a198b08bc805a5b9c5940c9 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Mon, 24 Jan 2022 14:31:59 -0500 Subject: [PATCH 54/56] Update version in which Zarr support will be released --- dandi/dandiapi.py | 26 +++++++++++++------------- dandi/files.py | 2 +- dandi/misctypes.py | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 40fdc327b..b12298d1b 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -102,7 +102,7 @@ class AssetType(Enum): """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 An enum for the different kinds of resources that an asset's actual data can be @@ -1047,7 +1047,7 @@ def upload_raw_asset( this version of the Dandiset and return the resulting asset. Blocks until the upload is complete. - .. deprecated:: 0.35.0 + .. deprecated:: 0.36.0 Use the ``upload()`` method of `~dandi.files.LocalAsset` instances instead @@ -1079,7 +1079,7 @@ def iter_upload_raw_asset( this version of the Dandiset, returning a generator of status `dict`\\s. - .. deprecated:: 0.35.0 + .. deprecated:: 0.36.0 Use the ``iter_upload()`` method of `~dandi.files.LocalAsset` instances instead @@ -1216,7 +1216,7 @@ def get_raw_digest( If no digest type is specified, the same type as used by `get_digest()` is returned. - .. versionchanged:: 0.35.0 + .. versionchanged:: 0.36.0 Renamed from ``get_digest()`` to ``get_raw_digest()`` """ if digest_type is None: @@ -1234,7 +1234,7 @@ def get_raw_digest( def get_digest(self) -> Digest: """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 Replaces the previous version of ``get_digest()``, now renamed to `get_raw_digest()` @@ -1340,7 +1340,7 @@ def download( @property def asset_type(self) -> AssetType: """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 The type of the asset's underlying data """ @@ -1451,7 +1451,7 @@ def delete(self) -> None: class RemoteBlobAsset(RemoteAsset): """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 A `RemoteAsset` whose actual data is a blob resource """ @@ -1462,7 +1462,7 @@ class RemoteBlobAsset(RemoteAsset): @property def asset_type(self) -> AssetType: """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 The type of the asset's underlying data """ @@ -1485,7 +1485,7 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: class RemoteZarrAsset(RemoteAsset): """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 A `RemoteAsset` whose actual data is a Zarr resource """ @@ -1496,7 +1496,7 @@ class RemoteZarrAsset(RemoteAsset): @property def asset_type(self) -> AssetType: """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 The type of the asset's underlying data """ @@ -1545,7 +1545,7 @@ def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: class ZarrListing(BaseModel): """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 Information about a directory within a `RemoteZarrAsset` """ @@ -1574,7 +1574,7 @@ def filenames(self) -> List[str]: @dataclass class ZarrEntryStat: """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 Combined size & timestamp information for a file in a `RemoteZarrAsset` """ @@ -1588,7 +1588,7 @@ class ZarrEntryStat: @dataclass class RemoteZarrEntry(BasePath): """ - .. versionadded:: 0.35.0 + .. versionadded:: 0.36.0 A file or directory within a `RemoteZarrAsset`. Implements `~dandi.misctypes.BasePath`. diff --git a/dandi/files.py b/dandi/files.py index 91f0125ab..56bedf224 100644 --- a/dandi/files.py +++ b/dandi/files.py @@ -1,5 +1,5 @@ """ -.. versionadded:: 0.35.0 +.. versionadded:: 0.36.0 This module defines functionality for working with local files & directories (as opposed to remote resources on a DANDI Archive server) that are of interest diff --git a/dandi/misctypes.py b/dandi/misctypes.py index f9acaeef6..03f9d040d 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -1,5 +1,5 @@ """ -.. versionadded:: 0.35.0 +.. versionadded:: 0.36.0 Miscellaneous public classes """ From b3a01de296070114cca7d6c6aae0436b94a336a3 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 25 Jan 2022 12:33:36 -0500 Subject: [PATCH 55/56] `BaseRemoteAsset.digest_type` --- dandi/dandiapi.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index b12298d1b..47d49dd0e 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1220,10 +1220,7 @@ def get_raw_digest( Renamed from ``get_digest()`` to ``get_raw_digest()`` """ if digest_type is None: - if self.asset_type is AssetType.ZARR: - digest_type = models.DigestType.dandi_zarr_checksum.value - else: - digest_type = models.DigestType.dandi_etag.value + digest_type = self.digest_type.value elif isinstance(digest_type, models.DigestType): digest_type = digest_type.value metadata = self.get_raw_metadata() @@ -1242,10 +1239,7 @@ def get_digest(self) -> Digest: a dandi-etag digest for blob resources or a dandi-zarr-checksum for Zarr resources """ - if self.asset_type is AssetType.ZARR: - algorithm = models.DigestType.dandi_zarr_checksum - else: - algorithm = models.DigestType.dandi_etag + algorithm = self.digest_type return Digest(algorithm=algorithm, value=self.get_raw_digest(algorithm)) def get_content_url( @@ -1349,6 +1343,20 @@ def asset_type(self) -> AssetType: else: return AssetType.BLOB + @property + def digest_type(self) -> models.DigestType: + """ + .. versionadded:: 0.36.0 + + The primary digest algorithm used by Dandi Archive for the asset, + determined based on its underlying data: dandi-etag for blob resources, + dandi-zarr-checksum for Zarr resources + """ + if self.asset_type is AssetType.ZARR: + return models.DigestType.dandi_zarr_checksum + else: + return models.DigestType.dandi_etag + class RemoteAsset(ABC, BaseRemoteAsset): """ From 2c6279e3bb1c5f1a2ed83bec08c70e8d83fb4ae3 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Tue, 25 Jan 2022 12:37:42 -0500 Subject: [PATCH 56/56] Add more information to an error message --- dandi/dandiapi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 47d49dd0e..2b2cf44ea 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -1297,7 +1297,10 @@ def get_download_file_iter( :raises ValueError: if the asset is not backed by a blob """ if self.asset_type is not AssetType.BLOB: - raise ValueError("Only blob assets can be downloaded directly") + raise ValueError( + f"Cannot download asset {self} directly: asset is of type" + f" {self.asset_type.name}, not BLOB" + ) url = self.base_download_url