diff --git a/dandi/cli/base.py b/dandi/cli/base.py index e2a2a4238..bf2b890d5 100644 --- a/dandi/cli/base.py +++ b/dandi/cli/base.py @@ -11,6 +11,25 @@ # Aux common functionality +class IntColonInt(click.ParamType): + name = "int:int" + + def convert(self, value, param, ctx): + if isinstance(value, str): + v1, colon, v2 = value.partition(":") + try: + v1 = int(v1) + v2 = int(v2) if colon else None + except ValueError: + self.fail("Value must be of the form `N[:M]`", param, ctx) + return (v1, v2) + else: + return value + + def get_metavar(self, param): + return "N[:M]" + + # ???: could make them always available but hidden # via hidden=True. def devel_option(*args, **kwargs): diff --git a/dandi/cli/cmd_download.py b/dandi/cli/cmd_download.py index 6355fcf35..39b1a0756 100644 --- a/dandi/cli/cmd_download.py +++ b/dandi/cli/cmd_download.py @@ -2,7 +2,7 @@ import click -from .base import instance_option, map_to_click_exceptions +from .base import IntColonInt, instance_option, map_to_click_exceptions from ..consts import known_instances, known_instances_rev from ..dandiarchive import _dandi_url_parser, parse_dandi_url @@ -70,8 +70,9 @@ def get_metavar(self, param): @click.option( "-J", "--jobs", - help="Number of parallel download jobs.", - default=6, # TODO: come up with smart auto-scaling etc + type=IntColonInt(), + help="Number of parallel download jobs and, optionally number of subjobs per Zarr asset", + default="6", # TODO: come up with smart auto-scaling etc show_default=True, ) @click.option( @@ -141,7 +142,8 @@ def download( output_dir, existing=existing, format=format, - jobs=jobs, + jobs=jobs[0], + jobs_per_zarr=jobs[1], get_metadata="dandiset.yaml" in download_types, get_assets="assets" in download_types, sync=sync, diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py index a485077c7..bd14f30ec 100644 --- a/dandi/cli/cmd_ls.py +++ b/dandi/cli/cmd_ls.py @@ -6,6 +6,7 @@ from .base import devel_option, lgr, map_to_click_exceptions from ..dandiarchive import 
DandisetURL, _dandi_url_parser, parse_dandi_url +from ..misctypes import Digest from ..utils import is_url # TODO: all the recursion options etc @@ -354,8 +355,7 @@ def fn(): rec = nwb2asset( path, schema_version=schema, - digest=digest, - digest_type="dandi_etag", + digest=Digest.dandi_etag(digest), ).json_dict() else: rec = get_metadata(path) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1139d01eb..ada1f850c 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -1,6 +1,7 @@ import click from .base import ( + IntColonInt, devel_debug_option, devel_option, instance_option, @@ -8,25 +9,6 @@ ) -class IntColonInt(click.ParamType): - name = "int:int" - - def convert(self, value, param, ctx): - if isinstance(value, str): - v1, colon, v2 = value.partition(":") - try: - v1 = int(v1) - v2 = int(v2) if colon else None - except ValueError: - self.fail("Value must be of the form `N[:M]`", param, ctx) - return (v1, v2) - else: - return value - - def get_metavar(self, param): - return "N[:M]" - - @click.command() # @dandiset_path_option( # help="Top directory (local) of the dandiset. Files will be uploaded with " @@ -102,9 +84,9 @@ def upload( Local Dandiset should pass validation. For that, the assets should first be organized using the `dandi organize` command. - By default all .nwb files in the Dandiset (excluding directories starting - with a period) will be considered for the upload. You can point to - specific files you would like to validate and have uploaded. + By default all .nwb, .zarr, and .ngff assets in the Dandiset (ignoring + directories starting with a period) will be considered for the upload. You + can point to specific files you would like to validate and have uploaded. 
""" from ..upload import upload diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 21ba094bb..73027f0b6 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -18,7 +18,7 @@ @devel_debug_option() @map_to_click_exceptions def validate(paths, schema=None, devel_debug=False, allow_any_path=False): - """Validate files for NWB (and DANDI) compliance. + """Validate files for NWB and DANDI compliance. Exits with non-0 exit code if any file is not compliant. """ @@ -43,7 +43,7 @@ def validate(paths, schema=None, devel_debug=False, allow_any_path=False): all_files_errors = {} nfiles = 0 for path, errors in validate_( - paths, + *paths, schema_version=schema, devel_debug=devel_debug, allow_any_path=allow_any_path, diff --git a/dandi/cli/tests/test_download.py b/dandi/cli/tests/test_download.py index a128880c5..1fc5757b6 100644 --- a/dandi/cli/tests/test_download.py +++ b/dandi/cli/tests/test_download.py @@ -18,6 +18,7 @@ def test_download_defaults(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -34,6 +35,7 @@ def test_download_all_types(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -50,6 +52,7 @@ def test_download_metadata_only(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=False, sync=False, @@ -66,6 +69,7 @@ def test_download_assets_only(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=False, get_assets=True, sync=False, @@ -94,6 +98,7 @@ def test_download_gui_instance_in_dandiset(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -113,6 +118,7 @@ def test_download_api_instance_in_dandiset(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -136,6 
+142,7 @@ def test_download_url_instance_match(mocker): existing="error", format="pyout", jobs=6, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, diff --git a/dandi/consts.py b/dandi/consts.py index c0237733e..13642b3ef 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -142,3 +142,12 @@ class EmbargoStatus(Enum): #: HTTP response status codes that should always be retried (until we run out #: of retries) RETRY_STATUSES = (500, 502, 503, 504) + +#: Maximum allowed depth of a Zarr directory tree +MAX_ZARR_DEPTH = 5 + +#: MIME type assigned to & used to identify Zarr assets +ZARR_MIME_TYPE = "application/x-zarr" + +#: Maximum number of Zarr directory entries to upload at once +ZARR_UPLOAD_BATCH_SIZE = 255 diff --git a/dandi/dandiapi.py b/dandi/dandiapi.py index 8869f94aa..2b2cf44ea 100644 --- a/dandi/dandiapi.py +++ b/dandi/dandiapi.py @@ -44,23 +44,25 @@ } """ -from concurrent.futures import ThreadPoolExecutor, as_completed +from abc import ABC, abstractmethod +from collections import deque +from dataclasses import dataclass, field, replace from datetime import datetime +from enum import Enum import json import os.path -from pathlib import Path +from pathlib import Path, PurePosixPath import re -from threading import Lock from time import sleep, time from types import TracebackType from typing import ( Any, - BinaryIO, Callable, ClassVar, Dict, FrozenSet, Iterator, + List, Optional, Sequence, Type, @@ -68,13 +70,12 @@ Union, cast, ) -from urllib.parse import urlparse, urlunparse -from xml.etree.ElementTree import fromstring +from urllib.parse import unquote, urlparse, urlunparse import click from dandischema import models -from dandischema.digests.dandietag import DandiETag -from pydantic import BaseModel, Field, PrivateAttr +import dateutil.parser +from pydantic import AnyHttpUrl, BaseModel, Field, PrivateAttr import requests import tenacity @@ -83,6 +84,7 @@ DRAFT, MAX_CHUNK_SIZE, RETRY_STATUSES, + ZARR_MIME_TYPE, DandiInstance, 
EmbargoStatus, known_instances, @@ -90,6 +92,7 @@ ) from .exceptions import NotFoundError, SchemaVersionError from .keyring import keyring_lookup +from .misctypes import BasePath, Digest from .utils import USER_AGENT, check_dandi_version, ensure_datetime, is_interactive lgr = get_logger() @@ -97,6 +100,18 @@ T = TypeVar("T") +class AssetType(Enum): + """ + .. versionadded:: 0.36.0 + + An enum for the different kinds of resources that an asset's actual data + can be + """ + + BLOB = 1 + ZARR = 2 + + # Following class is loosely based on GirderClient, with authentication etc # being stripped. # TODO: add copyright/license info @@ -1032,6 +1047,10 @@ def upload_raw_asset( this version of the Dandiset and return the resulting asset. Blocks until the upload is complete. + .. deprecated:: 0.36.0 + Use the ``upload()`` method of `~dandi.files.LocalAsset` instances + instead + :param filepath: the path to the local file to upload :type filepath: str or PathLike :param dict asset_metadata: @@ -1042,12 +1061,11 @@ def upload_raw_asset( :param RemoteAsset replace_asset: If set, replace the given asset, which must have the same path as the new asset """ - for status in self.iter_upload_raw_asset( - filepath, asset_metadata, jobs=jobs, replace_asset=replace_asset - ): - if status["status"] == "done": - return status["asset"] - raise RuntimeError("iter_upload_raw_asset() finished without returning 'done'") + from .files import dandi_file + + return dandi_file(filepath).upload( + self, metadata=asset_metadata, jobs=jobs, replacing=replace_asset + ) def iter_upload_raw_asset( self, @@ -1061,6 +1079,10 @@ def iter_upload_raw_asset( this version of the Dandiset, returning a generator of status `dict`\\s. + .. 
deprecated:: 0.36.0 + Use the ``iter_upload()`` method of `~dandi.files.LocalAsset` + instances instead + :param filepath: the path to the local file to upload :type filepath: str or PathLike :param dict asset_metadata: @@ -1077,130 +1099,11 @@ def iter_upload_raw_asset( ``"done"`` and an ``"asset"`` key containing the resulting `RemoteAsset`. """ - from .support.digests import get_dandietag - - asset_path = asset_metadata["path"] - yield {"status": "calculating etag"} - etagger = get_dandietag(filepath) - filetag = etagger.as_str() - lgr.debug("Calculated dandi-etag of %s for %s", filetag, filepath) - digest = asset_metadata.get("digest", {}) - if "dandi:dandi-etag" in digest: - if digest["dandi:dandi-etag"] != filetag: - raise RuntimeError( - f"{filepath}: File etag changed; was originally" - f" {digest['dandi:dandi-etag']} but is now {filetag}" - ) - yield {"status": "initiating upload"} - lgr.debug("%s: Beginning upload", asset_path) - total_size = os.path.getsize(filepath) - try: - resp = self.client.post( - "/uploads/initialize/", - json={ - "contentSize": total_size, - "digest": { - "algorithm": "dandi:dandi-etag", - "value": filetag, - }, - "dandiset": self.identifier, - }, - ) - except requests.HTTPError as e: - if e.response.status_code == 409: - lgr.debug("%s: Blob already exists on server", asset_path) - blob_id = e.response.headers["Location"] - else: - raise - else: - upload_id = resp["upload_id"] - parts = resp["parts"] - if len(parts) != etagger.part_qty: - raise RuntimeError( - f"Server and client disagree on number of parts for upload;" - f" server says {len(parts)}, client says {etagger.part_qty}" - ) - parts_out = [] - bytes_uploaded = 0 - lgr.debug("Uploading %s in %d parts", filepath, len(parts)) - with RESTFullAPIClient("http://nil.nil") as storage: - with open(filepath, "rb") as fp: - with ThreadPoolExecutor(max_workers=jobs or 5) as executor: - lock = Lock() - futures = [ - executor.submit( - _upload_part, - storage_session=storage, - 
fp=fp, - lock=lock, - etagger=etagger, - asset_path=asset_path, - part=part, - ) - for part in parts - ] - for fut in as_completed(futures): - out_part = fut.result() - bytes_uploaded += out_part["size"] - yield { - "status": "uploading", - "upload": 100 * bytes_uploaded / total_size, - "current": bytes_uploaded, - } - parts_out.append(out_part) - lgr.debug("%s: Completing upload", asset_path) - resp = self.client.post( - f"/uploads/{upload_id}/complete/", - json={"parts": parts_out}, - ) - lgr.debug( - "%s: Announcing completion to %s", - asset_path, - resp["complete_url"], - ) - r = storage.post( - resp["complete_url"], data=resp["body"], json_resp=False - ) - lgr.debug( - "%s: Upload completed. Response content: %s", - asset_path, - r.content, - ) - rxml = fromstring(r.text) - m = re.match(r"\{.+?\}", rxml.tag) - ns = m.group(0) if m else "" - final_etag = rxml.findtext(f"{ns}ETag") - if final_etag is not None: - final_etag = final_etag.strip('"') - if final_etag != filetag: - raise RuntimeError( - "Server and client disagree on final ETag of uploaded file;" - f" server says {final_etag}, client says {filetag}" - ) - # else: Error? Warning? 
- resp = self.client.post(f"/uploads/{upload_id}/validate/") - blob_id = resp["blob_id"] - lgr.debug("%s: Assigning asset blob to dandiset & version", asset_path) - yield {"status": "producing asset"} - if replace_asset is not None: - lgr.debug("%s: Replacing pre-existing asset") - a = RemoteAsset.from_data( - self, - self.client.put( - replace_asset.api_path, - json={"metadata": asset_metadata, "blob_id": blob_id}, - ), - ) - else: - a = RemoteAsset.from_data( - self, - self.client.post( - f"{self.version_api_path}assets/", - json={"metadata": asset_metadata, "blob_id": blob_id}, - ), - ) - lgr.info("%s: Asset successfully uploaded", asset_path) - yield {"status": "done", "asset": a} + from .files import dandi_file + + return dandi_file(filepath).iter_upload( + self, metadata=asset_metadata, jobs=jobs, replacing=replace_asset + ) class BaseRemoteAsset(APIBase): @@ -1301,15 +1204,24 @@ def get_raw_metadata(self) -> Dict[str, Any]: else: raise - def get_digest( - self, digest_type: Union[str, models.DigestType] = models.DigestType.dandi_etag + def get_raw_digest( + self, + digest_type: Union[str, models.DigestType, None] = models.DigestType.dandi_etag, ) -> str: """ Retrieves the value of the given type of digest from the asset's metadata. Raises `NotFoundError` if there is no entry for the given digest type. + + If no digest type is specified, the same type as used by `get_digest()` + is returned. + + .. versionchanged:: 0.36.0 + Renamed from ``get_digest()`` to ``get_raw_digest()`` """ - if isinstance(digest_type, models.DigestType): + if digest_type is None: + digest_type = self.digest_type.value + elif isinstance(digest_type, models.DigestType): digest_type = digest_type.value metadata = self.get_raw_metadata() try: @@ -1317,6 +1229,19 @@ def get_digest( except KeyError: raise NotFoundError(f"No {digest_type} digest found in metadata") + def get_digest(self) -> Digest: + """ + .. 
versionadded:: 0.36.0 + Replaces the previous version of ``get_digest()``, now renamed to + `get_raw_digest()` + + Retrieves the DANDI etag digest of the appropriate type for the asset: + a dandi-etag digest for blob resources or a dandi-zarr-checksum for + Zarr resources + """ + algorithm = self.digest_type + return Digest(algorithm=algorithm, value=self.get_raw_digest(algorithm)) + def get_content_url( self, regex: str = r".*", @@ -1367,8 +1292,16 @@ def get_download_file_iter( """ Returns a function that when called (optionally with an offset into the asset to start downloading at) returns a generator of chunks of the - asset + asset. + + :raises ValueError: if the asset is not backed by a blob """ + if self.asset_type is not AssetType.BLOB: + raise ValueError( + f"Cannot download asset {self} directly: asset is of type" + f" {self.asset_type.name}, not BLOB" + ) + url = self.base_download_url def downloader(start_at: int = 0) -> Iterator[bytes]: @@ -1393,14 +1326,42 @@ def download( """ Download the asset to ``filepath``. Blocks until the download is complete. + + :raises ValueError: if the asset is not backed by a blob """ downloader = self.get_download_file_iter(chunk_size=chunk_size) with open(filepath, "wb") as fp: for chunk in downloader(): fp.write(chunk) + @property + def asset_type(self) -> AssetType: + """ + .. versionadded:: 0.36.0 -class RemoteAsset(BaseRemoteAsset): + The type of the asset's underlying data + """ + if self.get_raw_metadata().get("encodingFormat") == ZARR_MIME_TYPE: + return AssetType.ZARR + else: + return AssetType.BLOB + + @property + def digest_type(self) -> models.DigestType: + """ + .. 
versionadded:: 0.36.0 + + The primary digest algorithm used by Dandi Archive for the asset, + determined based on its underlying data: dandi-etag for blob resources, + dandi-zarr-checksum for Zarr resources + """ + if self.asset_type is AssetType.ZARR: + return models.DigestType.dandi_zarr_checksum + else: + return models.DigestType.dandi_etag + + +class RemoteAsset(ABC, BaseRemoteAsset): """ Subclass of `BaseRemoteAsset` that includes information about the Dandiset to which the asset belongs. @@ -1424,7 +1385,7 @@ class RemoteAsset(BaseRemoteAsset): @classmethod def from_data( - self, + cls, dandiset: RemoteDandiset, data: Dict[str, Any], metadata: Optional[Dict[str, Any]] = None, @@ -1437,7 +1398,17 @@ def from_data( This is a low-level method that non-developers would normally only use when acquiring data using means outside of this library. """ - return RemoteAsset( + if data.get("blob") is not None: + klass = RemoteBlobAsset + if data.pop("zarr", None) is not None: + raise ValueError("Asset data contains both `blob` and `zarr`'") + elif data.get("zarr") is not None: + klass = RemoteZarrAsset + if data.pop("blob", None) is not None: + raise ValueError("Asset data contains both `blob` and `zarr`'") + else: + raise ValueError("Asset data contains neither `blob` nor `zarr`") + return klass( client=dandiset.client, dandiset_id=dandiset.identifier, version_id=dandiset.version_id, @@ -1476,21 +1447,45 @@ def set_metadata(self, metadata: models.Asset) -> None: """ return self.set_raw_metadata(metadata.json_dict()) + @abstractmethod def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: """ Set the metadata for the asset on the server to the given value and update the `RemoteAsset` in place. """ - try: - etag = metadata["digest"]["dandi:dandi-etag"] - except KeyError: - raise ValueError("dandi-etag digest not set in new asset metadata") - r = self.client.post( - "/blobs/digest/", - json={"algorithm": "dandi:dandi-etag", "value": etag}, - ) + ... 
+ + def delete(self) -> None: + """Delete the asset""" + self.client.delete(self.api_path) + + +class RemoteBlobAsset(RemoteAsset): + """ + .. versionadded:: 0.36.0 + + A `RemoteAsset` whose actual data is a blob resource + """ + + #: The ID of the underlying blob resource + blob: str + + @property + def asset_type(self) -> AssetType: + """ + .. versionadded:: 0.36.0 + + The type of the asset's underlying data + """ + return AssetType.BLOB + + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: + """ + Set the metadata for the asset on the server to the given value and + update the `RemoteBlobAsset` in place. + """ data = self.client.put( - self.api_path, json={"metadata": metadata, "blob_id": r["blob_id"]} + self.api_path, json={"metadata": metadata, "blob_id": self.blob} ) self.identifier = data["asset_id"] self.path = data["path"] @@ -1498,63 +1493,317 @@ def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: self.modified = ensure_datetime(data["modified"]) self._metadata = data["metadata"] - def delete(self) -> None: - """Delete the asset""" - self.client.delete(self.api_path) +class RemoteZarrAsset(RemoteAsset): + """ + .. versionadded:: 0.36.0 + + A `RemoteAsset` whose actual data is a Zarr resource + """ + + #: The ID of the underlying Zarr resource + zarr: str + + @property + def asset_type(self) -> AssetType: + """ + .. 
versionadded:: 0.36.0 + + The type of the asset's underlying data + """ + return AssetType.ZARR -def _upload_part( - storage_session: RESTFullAPIClient, - fp: BinaryIO, - lock: Lock, - etagger: DandiETag, - asset_path: str, - part: dict, -) -> dict: - etag_part = etagger.get_part(part["part_number"]) - if part["size"] != etag_part.size: - raise RuntimeError( - f"Server and client disagree on size of upload part" - f" {part['part_number']}; server says {part['size']}," - f" client says {etag_part.size}" + def set_raw_metadata(self, metadata: Dict[str, Any]) -> None: + """ + Set the metadata for the asset on the server to the given value and + update the `RemoteZarrAsset` in place. + """ + data = self.client.put( + self.api_path, json={"metadata": metadata, "zarr_id": self.zarr} ) - with lock: - fp.seek(etag_part.offset) - chunk = fp.read(part["size"]) - if len(chunk) != part["size"]: - raise RuntimeError( - f"End of file {fp.name} reached unexpectedly early:" - f" read {len(chunk)} bytes of out of an expected {part['size']}" + self.identifier = data["asset_id"] + self.path = data["path"] + self.size = int(data["size"]) + self.modified = ensure_datetime(data["modified"]) + self._metadata = data["metadata"] + + @property + def filetree(self) -> "RemoteZarrEntry": + """ + The `RemoteZarrEntry` for the root of the hierarchy of files within the + Zarr + """ + return RemoteZarrEntry( + client=self.client, zarr_id=self.zarr, parts=(), _known_dir=True ) - lgr.debug( - "%s: Uploading part %d/%d (%d bytes)", - asset_path, - part["part_number"], - etagger.part_qty, - part["size"], - ) - r = storage_session.put( - part["upload_url"], - data=chunk, - json_resp=False, - retry_statuses=[500], - ) - server_etag = r.headers["ETag"].strip('"') - lgr.debug( - "%s: Part upload finished ETag=%s Content-Length=%s", - asset_path, - server_etag, - r.headers.get("Content-Length"), - ) - client_etag = etagger.get_part_etag(etag_part) - if server_etag != client_etag: - raise RuntimeError( - 
f"Server and client disagree on ETag of upload part" - f" {part['part_number']}; server says" - f" {server_etag}, client says {client_etag}" + + def iterfiles(self, include_dirs: bool = False) -> Iterator["RemoteZarrEntry"]: + """ + Returns a generator of all `RemoteZarrEntry`\\s within the Zarr. By + default, only instances for files are produced, unless ``include_dirs`` + is true. + """ + dirs = deque([self.filetree]) + while dirs: + for p in dirs.popleft().iterdir(): + if p.is_dir(): + dirs.append(p) + if include_dirs: + yield p + else: + yield p + + +class ZarrListing(BaseModel): + """ + .. versionadded:: 0.36.0 + + Information about a directory within a `RemoteZarrAsset` + """ + + #: API URLs for the listings of the directory's subdirectories + directories: List[AnyHttpUrl] + #: API URLs for downloading the files in the directory + files: List[AnyHttpUrl] + #: The checksums (MD5 or Dandi Zarr checksum, as appropriate) for the + #: directory's entries, as a mapping from basenames to checksums + checksums: Dict[str, str] + #: The Dandi Zarr checksum for the directory + checksum: str + + @property + def dirnames(self) -> List[str]: + """The basenames of the directory URLs in `directories`""" + return [PurePosixPath(unquote(url.path)).name for url in self.directories] + + @property + def filenames(self) -> List[str]: + """The basenames of the file URLs in `files`""" + return [PurePosixPath(unquote(url.path)).name for url in self.files] + + +@dataclass +class ZarrEntryStat: + """ + .. versionadded:: 0.36.0 + + Combined size & timestamp information for a file in a `RemoteZarrAsset` + """ + + #: The size of the file + size: int + #: The time at which the file was last modified + modified: datetime + + +@dataclass +class RemoteZarrEntry(BasePath): + """ + .. versionadded:: 0.36.0 + + A file or directory within a `RemoteZarrAsset`. Implements + `~dandi.misctypes.BasePath`. 
+ """ + + #: The `DandiAPIClient` instance used for API requests + client: DandiAPIClient + #: The ID of the Zarr backing the asset + zarr_id: str + _known_dir: Optional[bool] = field(default=None, compare=False, repr=False) + + def _get_subpath( + self, name: str, isdir: Optional[bool] = None + ) -> "RemoteZarrEntry": + if not name or "/" in name: + raise ValueError(f"Invalid path component: {name!r}") + elif name == ".": + return self + elif name == "..": + return self.parent + else: + return replace(self, parts=self.parts + (name,), _known_dir=isdir) + + @property + def parent(self) -> "RemoteZarrEntry": + if self.is_root(): + return self + else: + return replace( + self, + parts=self.parts[:-1], + _known_dir=True if self._known_dir is not None else None, + ) + + def _isdir(self) -> bool: + if self._known_dir is not None: + return self._known_dir + elif self.is_root(): + try: + self.client.get(f"/zarr/{self.zarr_id}/") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError(f"No such Zarr: {self.zarr_id!r}") + else: + raise + return True + else: + listing = self.parent.get_listing() + if self.name in listing.dirnames: + return True + elif self.name in listing.filenames: + return False + else: + raise NotFoundError( + f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" + ) + + def exists(self) -> bool: + try: + self._isdir() + except NotFoundError: + return False + else: + return True + + def is_file(self) -> bool: + try: + return not self._isdir() + except NotFoundError: + return False + + def is_dir(self) -> bool: + try: + return self._isdir() + except NotFoundError: + return False + + def iterdir(self) -> Iterator["RemoteZarrEntry"]: + listing = self.get_listing() + for name in listing.dirnames: + if name == "." 
or name == "..": + continue + yield self._get_subpath(name, isdir=True) + for name in listing.filenames: + yield self._get_subpath(name, isdir=False) + + def get_digest(self) -> Digest: + """ + Retrieve the digest for the entry. If the entry is a + directory, the algorithm will be the Dandi Zarr checksum algorithm; if + it is a file, it will be MD5. + + :raises NotFoundError: if the path does not exist in the Zarr asset + """ + if self.is_root(): + algorithm = models.DigestType.dandi_zarr_checksum + value = self.get_listing().checksum + else: + listing = self.parent.get_listing() + if self.name in listing.dirnames: + algorithm = models.DigestType.dandi_zarr_checksum + elif self.name in listing.filenames: + algorithm = models.DigestType.md5 + else: + raise NotFoundError( + f"No such entry {str(self)!r} in Zarr {self.zarr_id!r}" + ) + value = listing.checksums[self.name] + return Digest(algorithm=algorithm, value=value) + + @property + def size(self) -> int: + """ + The size of the entry, which must be a file + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ + return self.stat().size + + @property + def modified(self) -> datetime: + """ + The time at which the entry (which must be a file) was last modified + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ + return self.stat().modified + + def stat(self) -> ZarrEntryStat: + """ + Return combined size & timestamp information for the entry, which must + be a file + + :raises NotFoundError: if the path does not exist in the Zarr asset + :raises ValueError: if the path is a directory + """ + if not self.is_file(): + raise ValueError("Cannot stat directories in Zarrs") + try: + r = self.client.request( + "HEAD", + f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}", + json_resp=False, + allow_redirects=True, + ) + except requests.HTTPError as e: + if
e.response.status_code == 404: + raise NotFoundError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist" + ) + else: + raise + return ZarrEntryStat( + size=int(r.headers["Content-Length"]), + modified=dateutil.parser.parse(r.headers["Last-Modified"]), ) - return { - "part_number": part["part_number"], - "size": part["size"], - "etag": server_etag, - } + + def get_listing(self) -> ZarrListing: + """ + Return the `ZarrListing` for the entry, which must be a directory + + :raises NotFoundError: if the path does not exist in the Zarr asset + """ + path = "".join(p + "/" for p in self.parts) + try: + r = self.client.get(f"/zarr/{self.zarr_id}.zarr/{path}") + except requests.HTTPError as e: + if e.response.status_code == 404: + raise NotFoundError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist" + ) + else: + raise + return ZarrListing.parse_obj(r) + + def get_download_file_iter( + self, chunk_size: int = MAX_CHUNK_SIZE + ) -> Callable[..., Iterator[bytes]]: + """ + Returns a function that when called (optionally with an offset into the + file to start downloading at) returns a generator of chunks of the file + """ + if not self.is_file(): + raise RuntimeError( + f"{str(self)!r} in Zarr {self.zarr_id!r} does not exist or" + " is not a file" + ) + + url = self.client.get_url(f"/zarr/{self.zarr_id}.zarr/{'/'.join(self.parts)}") + + def downloader(start_at: int = 0) -> Iterator[bytes]: + lgr.debug("Starting download from %s", url) + headers = None + if start_at > 0: + headers = {"Range": f"bytes={start_at}-"} + result = self.client.session.get(url, stream=True, headers=headers) + # TODO: apparently we might need retries here as well etc + # if result.status_code not in (200, 201): + result.raise_for_status() + for chunk in result.iter_content(chunk_size=chunk_size): + if chunk: # could be some "keep alive"? 
+ yield chunk + lgr.info("File %s in Zarr %s successfully downloaded", self, self.zarr_id) + + return downloader diff --git a/dandi/download.py b/dandi/download.py index f3ac67053..fd8c68996 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1,3 +1,7 @@ +from collections import Counter, deque +from dataclasses import dataclass, field +from enum import Enum +from functools import partial import hashlib import json import os @@ -6,22 +10,27 @@ import random from shutil import rmtree import sys +from threading import Lock import time +from typing import Any, Callable, Dict, Iterator, Optional, Tuple +from dandischema.models import DigestType import humanize +from interleave import FINISH_CURRENT, interleave import requests from . import get_logger from .consts import RETRY_STATUSES, dandiset_metadata_file +from .dandiapi import AssetType, RemoteZarrAsset from .dandiarchive import DandisetURL, MultiAssetURL, SingleAssetURL, parse_dandi_url from .dandiset import Dandiset from .exceptions import NotFoundError -from .support.digests import get_digest +from .files import DandisetMetadataFile, find_dandi_files +from .support.digests import get_digest, get_zarr_checksum from .support.pyout import naturalsize from .utils import ( abbrev_prompt, ensure_datetime, - find_files, flattened, is_same_time, on_windows, @@ -40,6 +49,7 @@ def download( format="pyout", existing="error", jobs=1, + jobs_per_zarr=None, get_metadata=True, get_assets=True, sync=False, @@ -92,6 +102,7 @@ def download( existing=existing, get_metadata=get_metadata, get_assets=get_assets, + jobs_per_zarr=jobs_per_zarr, **kw, ) @@ -127,14 +138,16 @@ def download( f"Unexpected URL type {type(parsed_url).__name__}" ) to_delete = [] - for p in find_files(".*", download_dir, exclude_datalad=True): - if p == op.join(output_path, dandiset_metadata_file): + for df in find_dandi_files( + download_dir, dandiset_path=download_dir, allow_all=True + ): + if isinstance(df, DandisetMetadataFile): continue - a_path = 
op.normpath(op.join(prefix, op.relpath(p, download_dir))) + a_path = op.normpath(op.join(prefix, df.path)) if on_windows: a_path = a_path.replace("\\", "/") if a_path not in asset_paths: - to_delete.append(p) + to_delete.append(df.filepath) if to_delete: while True: opt = abbrev_prompt( @@ -148,7 +161,10 @@ def download( print(p) elif opt == "yes": for p in to_delete: - os.unlink(p) + if p.is_dir(): + rmtree(p) + else: + p.unlink() break else: break @@ -163,6 +179,7 @@ def download_generator( existing="error", get_metadata=True, get_assets=True, + jobs_per_zarr=None, ): """A generator for downloads of files, folders, or entire dandiset from DANDI (as identified by URL) @@ -196,6 +213,7 @@ def download_generator( if not get_assets: return + lock = Lock() for asset in assets: path = asset.path.lstrip("/") # make into relative path path = op.normpath(path) @@ -219,41 +237,58 @@ def download_generator( yield {"path": path, "status": "error", "message": str(e)} continue d = metadata.get("digest", {}) - if "dandi:dandi-etag" in d: - digests = {"dandi-etag": d["dandi:dandi-etag"]} - else: - raise RuntimeError( - f"dandi-etag not available for asset. Known digests: {d}" - ) - try: - digests["sha256"] = d["dandi:sha2-256"] - except KeyError: - pass - downloader = asset.get_download_file_iter() + if asset.asset_type is AssetType.BLOB: + if "dandi:dandi-etag" in d: + digests = {"dandi-etag": d["dandi:dandi-etag"]} + else: + raise RuntimeError( + f"dandi-etag not available for asset. 
Known digests: {d}" + ) + try: + digests["sha256"] = d["dandi:sha2-256"] + except KeyError: + pass + try: + mtime = ensure_datetime(metadata["blobDateModified"]) + except KeyError: + mtime = None + if mtime is None: + lgr.warning( + "Asset %s is missing blobDateModified metadata field", + asset.path, + ) + mtime = asset.modified + _download_generator = _download_file( + asset.get_download_file_iter(), + download_path, + toplevel_path=output_path, + # size and modified generally should be there but better to + # redownload than to crash + size=asset.size, + mtime=mtime, + existing=existing, + digests=digests, + lock=lock, + ) - try: - mtime = metadata["blobDateModified"] - except KeyError: - mtime = None - if mtime is None: - lgr.warning( - "Asset %s is missing blobDateModified metadata field", - asset.path, + else: + assert ( + asset.asset_type is AssetType.ZARR + ), f"Asset {asset.path} is neither blob nor Zarr" + if not isinstance(asset, RemoteZarrAsset): + raise NotImplementedError( + "Downloading a Zarr asset identified by a URL without" + " Dandiset details is not yet implemented" + ) + _download_generator = _download_zarr( + asset, + download_path, + toplevel_path=output_path, + existing=existing, + jobs=jobs_per_zarr, + lock=lock, ) - mtime = asset.modified - - _download_generator = _download_file( - downloader, - download_path, - toplevel_path=output_path, - # size and modified generally should be there but better to redownload - # than to crash - size=asset.size, - mtime=mtime, - existing=existing, - digests=digests, - ) if yield_generator_for_fields: yield {"path": path, yield_generator_for_fields: _download_generator} @@ -422,19 +457,28 @@ def _download_file( downloader, path, toplevel_path, + lock, size=None, mtime=None, existing="error", digests=None, + digest_callback: Optional[Callable[[str, str], Any]] = None, ): - """Common logic for downloading a single file + """ + Common logic for downloading a single file. 
- Generator downloader: + Yields progress records that take the following forms:: - TODO: describe expected records it should yield - - progress - - error - - completion + {"status": "skipped", "message": ""} + {"size": } + {"status": "downloading"} + {"done": [, "done%": ]} + {"status": "error", "message": ""} + {"checksum": "differs", "status": "error", "message": ""} + {"checksum": "ok"} + {"checksum": "-"} # No digests were provided + {"status": "setting mtime"} + {"status": "done"} Parameters ---------- @@ -480,7 +524,17 @@ def _download_file( "%s is in git-annex, and hash does not match hash on server; redownloading", path, ) - elif get_digest(path, "dandi-etag") == digests["dandi-etag"]: + elif ( + "dandi-etag" in digests + and get_digest(path, "dandi-etag") == digests["dandi-etag"] + ): + yield _skip_file("already exists") + return + elif ( + "dandi-etag" not in digests + and "md5" in digests + and get_digest(path, "md5") == digests["md5"] + ): yield _skip_file("already exists") return else: @@ -512,8 +566,15 @@ def _download_file( if size is not None: yield {"size": size} - destdir = op.dirname(path) - os.makedirs(destdir, exist_ok=True) + destdir = Path(op.dirname(path)) + with lock: + for p in (destdir, *destdir.parents): + if p.is_file(): + p.unlink() + break + elif p.is_dir(): + break + destdir.mkdir(parents=True, exist_ok=True) yield {"status": "downloading"} @@ -594,6 +655,8 @@ def _download_file( if downloaded_digest and not resuming: downloaded_digest = downloaded_digest.hexdigest() # we care only about hex + if digest_callback is not None: + digest_callback(algo, downloaded_digest) if digest != downloaded_digest: msg = f"{algo}: downloaded {downloaded_digest} != {digest}" yield {"checksum": "differs", "status": "error", "message": msg} @@ -681,7 +744,11 @@ def __exit__(self, exc_type, exc_value, traceback): self.fp.close() try: if exc_type is None: - self.writefile.replace(self.filepath) + try: + self.writefile.replace(self.filepath) + except 
def _download_zarr(
    asset: RemoteZarrAsset,
    download_path: str,
    toplevel_path: str,
    existing: str,
    lock: Lock,
    jobs: Optional[int] = None,
) -> Iterator[dict]:
    """
    Download every entry of a Zarr asset beneath ``download_path``.

    One `_download_file()` generator is created per entry returned by
    ``asset.iterfiles()``; the generators are run concurrently through
    ``interleave`` and their records are merged by a `ProgressCombiner` into a
    single stream of progress dicts, which this generator yields.

    Once the combined status becomes ``"done"``, local files under
    ``download_path`` that are not among the remote entries are deleted,
    directories left empty by that are removed, and (unless the combined
    message mentions skipped files) the local Zarr checksum is compared with
    ``asset.get_digest()``.  If the combined stream ends without ever
    reporting ``"done"`` (e.g. because a file errored), the generator returns
    without cleaning up or verifying anything.

    :param asset: the remote Zarr asset to download
    :param download_path: local directory in which to place the Zarr's files
    :param toplevel_path: forwarded unchanged to `_download_file()`
    :param existing: forwarded unchanged to `_download_file()`
    :param lock: forwarded unchanged to every `_download_file()` call
    :param jobs: number of worker threads for ``interleave``; 4 when falsy
    """
    download_gens = {}
    entries = list(asset.iterfiles())
    # MD5 digests computed while downloading, keyed by entry path; passed to
    # get_zarr_checksum() as ``known`` when verifying the whole Zarr.
    digests = {}

    def digest_callback(path: str, algoname: str, d: str) -> None:
        if algoname == "md5":
            digests[path] = d

    for entry in entries:
        etag = entry.get_digest()
        assert etag.algorithm is DigestType.md5
        stat = entry.stat()
        download_gens[str(entry)] = _download_file(
            entry.get_download_file_iter(),
            op.join(download_path, op.normpath(str(entry))),
            toplevel_path=toplevel_path,
            size=stat.size,
            mtime=stat.modified,
            existing=existing,
            digests={"md5": etag.value},
            lock=lock,
            digest_callback=partial(digest_callback, str(entry)),
        )

    pc = ProgressCombiner(zarr_size=asset.size, file_qty=len(download_gens))
    final_out: Optional[dict] = None
    with interleave(
        [pairing(p, gen) for p, gen in download_gens.items()],
        onerror=FINISH_CURRENT,
        max_workers=jobs or 4,
    ) as it:
        for path, status in it:
            for out in pc.feed(path, status):
                # Hold back the combined "done" record: cleanup and checksum
                # verification still have to happen before "done" is emitted.
                if out.get("status") == "done":
                    final_out = out
                else:
                    yield out
            if final_out is not None:
                break
        else:
            # The stream ended without a combined "done"; nothing to finalize.
            return

    yield {"status": "deleting extra files"}
    remote_paths = set(map(str, entries))
    zarr_basepath = Path(download_path)
    dirs = deque([zarr_basepath])
    empty_dirs = deque()
    while dirs:
        d = dirs.popleft()
        is_empty = True
        for p in list(d.iterdir()):
            if (
                p.is_file()
                and p.relative_to(zarr_basepath).as_posix() not in remote_paths
            ):
                try:
                    p.unlink()
                except OSError:
                    is_empty = False
            elif p.is_dir():
                # Subdirectories do not mark ``d`` as non-empty here; if one
                # survives, the rmdir() attempt below fails and is ignored.
                dirs.append(p)
            else:
                is_empty = False
        if is_empty and d != zarr_basepath:
            empty_dirs.append(d)
    while empty_dirs:
        d = empty_dirs.popleft()
        try:
            d.rmdir()
        except OSError:
            pass
        else:
            # Removing ``d`` may have emptied its parent as well.
            if d.parent != zarr_basepath and not any(d.parent.iterdir()):
                empty_dirs.append(d.parent)

    if "skipped" not in final_out["message"]:
        zarr_checksum = asset.get_digest().value
        local_checksum = get_zarr_checksum(zarr_basepath, known=digests)
        if zarr_checksum != local_checksum:
            msg = f"Zarr checksum: downloaded {local_checksum} != {zarr_checksum}"
            yield {"checksum": "differs", "status": "error", "message": msg}
            lgr.debug("%s is different: %s.", zarr_basepath, msg)
            return
        else:
            yield {"checksum": "ok"}
            lgr.debug(
                "Verified that %s has correct Zarr checksum %s",
                zarr_basepath,
                zarr_checksum,
            )

    yield {"status": "done"}


def pairing(p: str, gen: Iterator[dict]) -> Iterator[Tuple[str, dict]]:
    """Yield ``(p, d)`` for every item ``d`` produced by ``gen``."""
    for d in gen:
        yield (p, d)


#: Per-file download state tracked by `ProgressCombiner`
DLState = Enum("DLState", "STARTING DOWNLOADING SKIPPED ERROR CHECKSUM_ERROR DONE")


@dataclass
class DownloadProgress:
    """Progress of the download of a single file within a Zarr"""

    state: DLState = DLState.STARTING
    #: Most recent ``"done"`` value reported for the file
    downloaded: int = 0
    #: Size reported for the file, or `None` if no size record was seen yet
    size: Optional[int] = None


@dataclass
class ProgressCombiner:
    """
    Merge the progress records of the individual files of a Zarr into one
    stream of progress records describing the Zarr as a whole.
    """

    #: Size reported for the whole Zarr in the single ``{"size": ...}`` record
    zarr_size: int
    #: Number of files whose records will be fed in
    file_qty: int
    files: Dict[str, DownloadProgress] = field(default_factory=dict)
    #: Total size of all files that were not skipped and did not error out
    #: during download
    maxsize: int = 0
    #: Last combined status emitted, used to emit a status only when it changes
    prev_status: str = ""
    yielded_size: bool = False

    @property
    def message(self) -> str:
        """Summary such as ``"3 done, 1 errored, 2 skipped"`` (empty parts omitted)"""
        done = 0
        errored = 0
        skipped = 0
        for s in self.files.values():
            if s.state is DLState.DONE:
                done += 1
            elif s.state in (DLState.ERROR, DLState.CHECKSUM_ERROR):
                errored += 1
            elif s.state is DLState.SKIPPED:
                skipped += 1
        parts = []
        if done:
            parts.append(f"{done} done")
        if errored:
            parts.append(f"{errored} errored")
        if skipped:
            parts.append(f"{skipped} skipped")
        return ", ".join(parts)

    def get_done(self) -> dict:
        """
        Return a ``{"done": ..., "done%": ...}`` record summing the bytes
        downloaded for all files that are downloading, done, or failed their
        checksum.  The percentage is relative to `maxsize` and is reported as
        0.0 when `maxsize` is zero.
        """
        total_downloaded = sum(
            s.downloaded
            for s in self.files.values()
            if s.state in (DLState.DOWNLOADING, DLState.CHECKSUM_ERROR, DLState.DONE)
        )
        # maxsize is 0 when only zero-byte files have reported a size or when
        # every sized file has errored out; avoid dividing by zero then.
        percent = total_downloaded / self.maxsize * 100 if self.maxsize else 0.0
        return {
            "done": total_downloaded,
            "done%": percent,
        }

    def set_status(self, statusdict: dict) -> None:
        """
        Compute the combined status and, if it differs from the previously
        emitted one, store it under ``statusdict["status"]``.
        """
        state_qtys = Counter(s.state for s in self.files.values())
        total = len(self.files)
        if (
            total == self.file_qty
            and state_qtys[DLState.STARTING] == state_qtys[DLState.DOWNLOADING] == 0
        ):
            # All files have finished
            if state_qtys[DLState.ERROR] or state_qtys[DLState.CHECKSUM_ERROR]:
                new_status = "error"
            elif state_qtys[DLState.DONE]:
                new_status = "done"
            else:
                new_status = "skipped"
        elif total - state_qtys[DLState.STARTING] - state_qtys[DLState.SKIPPED] > 0:
            new_status = "downloading"
        else:
            new_status = ""
        if new_status != self.prev_status:
            statusdict["status"] = new_status
            self.prev_status = new_status

    def feed(self, path: str, status: dict) -> Iterator[dict]:
        """
        Consume one progress record ``status`` for the file ``path`` and yield
        zero or more combined progress records.  Unrecognized records are
        logged with a warning and otherwise ignored.
        """
        keys = list(status.keys())
        self.files.setdefault(path, DownloadProgress())
        if status.get("status") == "skipped":
            self.files[path].state = DLState.SKIPPED
            out = {"message": self.message}
            self.set_status(out)
            yield out
        elif keys == ["size"]:
            if not self.yielded_size:
                yield {"size": self.zarr_size}
                self.yielded_size = True
            self.files[path].size = status["size"]
            self.maxsize += status["size"]
            if any(s.state is DLState.DOWNLOADING for s in self.files.values()):
                yield self.get_done()
        elif status == {"status": "downloading"}:
            self.files[path].state = DLState.DOWNLOADING
            out = {}
            self.set_status(out)
            if out:
                yield out
        elif "done" in status:
            self.files[path].downloaded = status["done"]
            yield self.get_done()
        elif status.get("status") == "error":
            checksum_failed = "checksum" in status
            self.files[path].state = (
                DLState.CHECKSUM_ERROR if checksum_failed else DLState.ERROR
            )
            out = {"message": self.message}
            self.set_status(out)
            yield out
            if not checksum_failed:
                # A file that errored mid-download no longer counts towards
                # the total against which the percentage is computed.
                sz = self.files[path].size
                if sz is not None:
                    self.maxsize -= sz
                yield self.get_done()
        elif keys == ["checksum"]:
            pass
        elif status == {"status": "setting mtime"}:
            pass
        elif status == {"status": "done"}:
            self.files[path].state = DLState.DONE
            out = {"message": self.message}
            self.set_status(out)
            yield out
        else:
            lgr.warning(
                "Unexpected download status dict for %r received: %r", path, status
            )
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections import deque +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, replace +from datetime import datetime +import os +from pathlib import Path +import re +from threading import Lock +from typing import Any, BinaryIO, Dict, Generic, Iterator, List, Optional, Union +from xml.etree.ElementTree import fromstring + +from dandischema.digests.dandietag import DandiETag +from dandischema.digests.zarr import get_checksum +from dandischema.models import BareAsset, CommonModel +from dandischema.models import Dandiset as DandisetMeta +from dandischema.models import DigestType, get_schema_version +from pydantic import ValidationError +import requests +import zarr + +from . import get_logger +from .consts import ( + MAX_ZARR_DEPTH, + ZARR_MIME_TYPE, + ZARR_UPLOAD_BATCH_SIZE, + EmbargoStatus, + dandiset_metadata_file, +) +from .dandiapi import RemoteAsset, RemoteDandiset, RESTFullAPIClient +from .exceptions import UnknownAssetError +from .metadata import get_default_metadata, get_metadata, nwb2asset +from .misctypes import DUMMY_DIGEST, BasePath, Digest, P +from .pynwb_utils import validate as pynwb_validate +from .support.digests import get_dandietag, get_digest, get_zarr_checksum +from .utils import chunked, ensure_datetime, pluralize, yaml_load + +lgr = get_logger() + +# TODO -- should come from schema. 
# This is just a simplistic example for now
_required_dandiset_metadata_fields = ["identifier", "name", "description"]
_required_nwb_metadata_fields = ["subject_id"]


@dataclass
class DandiFile(ABC):
    """
    Common abstract base for the local files & directories that DANDI cares
    about
    """

    #: Location of the file or directory on disk
    filepath: Path

    @property
    def size(self) -> int:
        """Size of the file, as reported by the filesystem"""
        return self.filepath.stat().st_size

    @property
    def modified(self) -> datetime:
        """Time of the file's last modification"""
        # TODO: Should this be overridden for LocalDirectoryAsset?
        mtime = self.filepath.stat().st_mtime
        return ensure_datetime(mtime)

    @abstractmethod
    def get_metadata(
        self,
        digest: Optional[Digest] = None,
        ignore_errors: bool = True,
    ) -> CommonModel:
        """Produce the Dandi metadata describing the file"""
        ...

    @abstractmethod
    def get_validation_errors(
        self,
        schema_version: Optional[str] = None,
        devel_debug: bool = False,
    ) -> List[str]:
        """
        Try to validate the file, returning a list of the errors that were
        found
        """
        ...
class DandisetMetadataFile(DandiFile):
    """A :file:`dandiset.yaml` file on disk"""

    def get_metadata(
        self,
        digest: Optional[Digest] = None,
        ignore_errors: bool = True,
    ) -> DandisetMeta:
        """Load the file and return its contents as unvalidated Dandiset metadata"""
        with open(self.filepath) as fp:
            contents = yaml_load(fp, typ="safe")
        return DandisetMeta.unvalidated(**contents)

    # TODO: @validate_cache.memoize_path
    def get_validation_errors(
        self,
        schema_version: Optional[str] = None,
        devel_debug: bool = False,
    ) -> List[str]:
        """
        Validate the Dandiset metadata in the file.

        When no schema version is given and the file does not declare one,
        only the presence of the required fields is checked.  Otherwise the
        version must equal the current schema version (`ValueError` if not)
        and the metadata is validated against the `DandisetMeta` model.  With
        ``devel_debug`` set, validation exceptions propagate instead of being
        turned into error strings.
        """
        with open(self.filepath) as fp:
            meta = yaml_load(fp, typ="safe")
        if schema_version is None:
            schema_version = meta.get("schemaVersion")
        if schema_version is None:
            return _check_required_fields(meta, _required_dandiset_metadata_fields)
        current_version = get_schema_version()
        if schema_version != current_version:
            raise ValueError(
                f"Unsupported schema version: {schema_version}; expected {current_version}"
            )
        log_extra = {"validating": True}
        try:
            DandisetMeta(**meta)
        except ValidationError as exc:
            if devel_debug:
                raise
            lgr.warning(
                "Validation error for %s: %s", self.filepath, exc, extra=log_extra
            )
            return [str(exc)]
        except Exception as exc:
            if devel_debug:
                raise
            lgr.warning(
                "Unexpected validation error for %s: %s",
                self.filepath,
                exc,
                extra=log_extra,
            )
            return [f"Failed to initialize Dandiset meta: {exc}"]
        return []


@dataclass
class LocalAsset(DandiFile):
    """
    A local file or directory that can become an asset of a Dandiset by being
    uploaded to a DANDI Archive
    """

    #: The forward-slash-separated path to the asset within its local Dandiset
    #: (i.e., relative to the Dandiset's root)
    path: str

    @abstractmethod
    def get_digest(self) -> Digest:
        """
        Compute the DANDI etag digest of the asset with whichever algorithm
        suits its type
        """
        ...

    @abstractmethod
    def get_metadata(
        self,
        digest: Optional[Digest] = None,
        ignore_errors: bool = True,
    ) -> BareAsset:
        """Produce the Dandi metadata describing the asset"""
        ...

    # TODO: @validate_cache.memoize_path
    def get_validation_errors(
        self,
        schema_version: Optional[str] = None,
        devel_debug: bool = False,
    ) -> List[str]:
        """
        Validate the asset's metadata against the `BareAsset` model.

        Nothing is checked (and an empty list is returned) unless a schema
        version is given; a given version must equal the current schema
        version, else `ValueError` is raised.  With ``devel_debug`` set,
        validation exceptions propagate instead of being turned into error
        strings.
        """
        if schema_version is None:
            # TODO: Do something else?
            return []
        current_version = get_schema_version()
        if schema_version != current_version:
            raise ValueError(
                f"Unsupported schema version: {schema_version}; expected {current_version}"
            )
        log_extra = {"validating": True}
        try:
            asset = self.get_metadata(digest=DUMMY_DIGEST)
            BareAsset(**asset.dict())
        except ValidationError as exc:
            if devel_debug:
                raise
            lgr.warning(
                "Validation error for %s: %s", self.filepath, exc, extra=log_extra
            )
            return [str(exc)]
        except Exception as exc:
            if devel_debug:
                raise
            lgr.warning(
                "Unexpected validation error for %s: %s",
                self.filepath,
                exc,
                extra=log_extra,
            )
            return [f"Failed to read metadata: {exc}"]
        return []

    def upload(
        self,
        dandiset: RemoteDandiset,
        metadata: Dict[str, Any],
        jobs: Optional[int] = None,
        replacing: Optional[RemoteAsset] = None,
    ) -> RemoteAsset:
        """
        Upload the file as an asset with the given metadata to the given
        Dandiset and return the resulting asset.  Blocks until the upload is
        complete.

        :param RemoteDandiset dandiset:
            the Dandiset to which the file will be uploaded
        :param dict metadata:
            Metadata for the uploaded asset.  The "path" field will be set to
            the value of the instance's ``path`` attribute if no such field is
            already present.
        :param int jobs: Number of threads to use for uploading; defaults to 5
        :param RemoteAsset replacing:
            If set, replace the given asset, which must have the same path as
            the new asset
        :rtype: RemoteAsset
        """
        statuses = self.iter_upload(dandiset, metadata, jobs=jobs, replacing=replacing)
        # Stop consuming at the first "done" record; it carries the asset.
        finished = next((st for st in statuses if st["status"] == "done"), None)
        if finished is None:
            raise AssertionError("iter_upload() finished without returning 'done'")
        return finished["asset"]

    @abstractmethod
    def iter_upload(
        self,
        dandiset: RemoteDandiset,
        metadata: Dict[str, Any],
        jobs: Optional[int] = None,
        replacing: Optional[RemoteAsset] = None,
    ) -> Iterator[dict]:
        """
        Upload the asset with the given metadata to the given Dandiset,
        returning a generator of status `dict`\\s.

        :param RemoteDandiset dandiset:
            the Dandiset to which the asset will be uploaded
        :param dict metadata:
            Metadata for the uploaded asset.  The "path" field will be set to
            the value of the instance's ``path`` attribute if no such field is
            already present.
        :param int jobs: Number of threads to use for uploading; defaults to 5
        :param RemoteAsset replacing:
            If set, replace the given asset, which must have the same path as
            the new asset
        :returns:
            A generator of `dict`\\s containing at least a ``"status"`` key.
            Upon successful upload, the last `dict` will have a status of
            ``"done"`` and an ``"asset"`` key containing the resulting
            `RemoteAsset`.
        """
        ...
class LocalFileAsset(LocalAsset):
    """
    Representation of a regular file that can be uploaded to a DANDI Archive as
    an asset of a Dandiset
    """

    def get_digest(self) -> Digest:
        """Calculate a dandi-etag digest for the asset"""
        value = get_digest(self.filepath, digest="dandi-etag")
        return Digest.dandi_etag(value)

    def iter_upload(
        self,
        dandiset: RemoteDandiset,
        metadata: Dict[str, Any],
        jobs: Optional[int] = None,
        replacing: Optional[RemoteAsset] = None,
    ) -> Iterator[dict]:
        """
        Upload the file as an asset with the given metadata to the given
        Dandiset, returning a generator of status `dict`\\s.

        :param RemoteDandiset dandiset:
            the Dandiset to which the file will be uploaded
        :param dict metadata:
            Metadata for the uploaded asset.  The "path" field will be set to
            the value of the instance's ``path`` attribute if no such field is
            already present.
        :param int jobs: Number of threads to use for uploading; defaults to 5
        :param RemoteAsset replacing:
            If set, replace the given asset, which must have the same path as
            the new asset
        :returns:
            A generator of `dict`\\s containing at least a ``"status"`` key.
            Upon successful upload, the last `dict` will have a status of
            ``"done"`` and an ``"asset"`` key containing the resulting
            `RemoteAsset`.
        :raises RuntimeError:
            if the file's etag no longer matches the one recorded in
            ``metadata``, or if server and client disagree on the number of
            parts or on the final ETag
        """
        asset_path = metadata.setdefault("path", self.path)
        client = dandiset.client
        yield {"status": "calculating etag"}
        etagger = get_dandietag(self.filepath)
        filetag = etagger.as_str()
        lgr.debug("Calculated dandi-etag of %s for %s", filetag, self.filepath)
        digest = metadata.get("digest", {})
        if "dandi:dandi-etag" in digest:
            if digest["dandi:dandi-etag"] != filetag:
                raise RuntimeError(
                    f"{self.filepath}: File etag changed; was originally"
                    f" {digest['dandi:dandi-etag']} but is now {filetag}"
                )
        yield {"status": "initiating upload"}
        lgr.debug("%s: Beginning upload", asset_path)
        total_size = self.size
        try:
            resp = client.post(
                "/uploads/initialize/",
                json={
                    "contentSize": total_size,
                    "digest": {
                        "algorithm": "dandi:dandi-etag",
                        "value": filetag,
                    },
                    "dandiset": dandiset.identifier,
                },
            )
        except requests.HTTPError as e:
            if e.response.status_code == 409:
                # The server already holds a blob with this etag; reuse it
                # instead of uploading the content again.
                lgr.debug("%s: Blob already exists on server", asset_path)
                blob_id = e.response.headers["Location"]
            else:
                raise
        else:
            upload_id = resp["upload_id"]
            parts = resp["parts"]
            if len(parts) != etagger.part_qty:
                raise RuntimeError(
                    f"Server and client disagree on number of parts for upload;"
                    f" server says {len(parts)}, client says {etagger.part_qty}"
                )
            parts_out = []
            bytes_uploaded = 0
            lgr.debug("Uploading %s in %d parts", self.filepath, len(parts))
            with RESTFullAPIClient("http://nil.nil") as storage:
                with self.filepath.open("rb") as fp:
                    with ThreadPoolExecutor(max_workers=jobs or 5) as executor:
                        # One lock is handed to every part-upload task along
                        # with the single shared file handle.
                        lock = Lock()
                        futures = [
                            executor.submit(
                                _upload_blob_part,
                                storage_session=storage,
                                fp=fp,
                                lock=lock,
                                etagger=etagger,
                                asset_path=asset_path,
                                part=part,
                            )
                            for part in parts
                        ]
                        for fut in as_completed(futures):
                            out_part = fut.result()
                            bytes_uploaded += out_part["size"]
                            yield {
                                "status": "uploading",
                                "upload": 100 * bytes_uploaded / total_size,
                                "current": bytes_uploaded,
                            }
                            parts_out.append(out_part)
                lgr.debug("%s: Completing upload", asset_path)
                resp = client.post(
                    f"/uploads/{upload_id}/complete/",
                    json={"parts": parts_out},
                )
                lgr.debug(
                    "%s: Announcing completion to %s",
                    asset_path,
                    resp["complete_url"],
                )
                r = storage.post(
                    resp["complete_url"], data=resp["body"], json_resp=False
                )
                lgr.debug(
                    "%s: Upload completed. Response content: %s",
                    asset_path,
                    r.content,
                )
                rxml = fromstring(r.text)
                # The root tag may carry an XML namespace ("{ns}Tag"); reuse
                # it when looking up the ETag element.
                m = re.match(r"\{.+?\}", rxml.tag)
                ns = m.group(0) if m else ""
                final_etag = rxml.findtext(f"{ns}ETag")
                if final_etag is not None:
                    final_etag = final_etag.strip('"')
                    if final_etag != filetag:
                        raise RuntimeError(
                            "Server and client disagree on final ETag of uploaded file;"
                            f" server says {final_etag}, client says {filetag}"
                        )
                # else: Error? Warning?
                resp = client.post(f"/uploads/{upload_id}/validate/")
                blob_id = resp["blob_id"]
        lgr.debug("%s: Assigning asset blob to dandiset & version", asset_path)
        yield {"status": "producing asset"}
        if replacing is not None:
            # The asset path must be passed explicitly; without an argument
            # logging emits the "%s" placeholder verbatim.
            lgr.debug("%s: Replacing pre-existing asset", asset_path)
            r = client.put(
                replacing.api_path,
                json={"metadata": metadata, "blob_id": blob_id},
            )
        else:
            r = client.post(
                f"{dandiset.version_api_path}assets/",
                json={"metadata": metadata, "blob_id": blob_id},
            )
        a = RemoteAsset.from_data(dandiset, r)
        lgr.info("%s: Asset successfully uploaded", asset_path)
        yield {"status": "done", "asset": a}


class NWBAsset(LocalFileAsset):
    """Representation of a local NWB file"""

    EXTENSIONS = [".nwb"]

    def get_metadata(
        self,
        digest: Optional[Digest] = None,
        ignore_errors: bool = True,
    ) -> BareAsset:
        """
        Return asset metadata extracted from the NWB file.  If extraction
        fails, the failure is logged and either default metadata is returned
        (``ignore_errors`` true) or the exception is re-raised.
        """
        try:
            metadata = nwb2asset(self.filepath, digest=digest)
        except Exception as e:
            lgr.warning(
                "Failed to extract NWB metadata from %s: %s: %s",
                self.filepath,
                type(e).__name__,
                str(e),
            )
            if ignore_errors:
                metadata = get_default_metadata(self.filepath, digest=digest)
            else:
                raise
        metadata.path = self.path
        return metadata

    # TODO: @validate_cache.memoize_path
    def get_validation_errors(
        self,
        schema_version: Optional[str] = None,
        devel_debug: bool = False,
    ) -> List[str]:
        """
        Return the pynwb validation errors for the file followed by either the
        schema validation errors (when ``schema_version`` is given) or the
        results of a check for the required NWB metadata fields.
        """
        errors = pynwb_validate(self.filepath, devel_debug=devel_debug)
        if schema_version is not None:
            errors.extend(
                super().get_validation_errors(
                    schema_version=schema_version, devel_debug=devel_debug
                )
            )
        else:
            # make sure that we have some basic metadata fields we require
            try:
                meta = get_metadata(self.filepath)
            except Exception as e:
                if devel_debug:
                    raise
                lgr.warning(
                    "Failed to read metadata in %s: %s",
                    self.filepath,
                    e,
                    extra={"validating": True},
                )
                errors.append(f"Failed to read metadata: {e}")
            else:
                errors.extend(
                    _check_required_fields(meta, _required_nwb_metadata_fields)
                )
        return errors


class GenericAsset(LocalFileAsset):
    """
    Representation of a generic regular file, one that is not of any known type
    """

    EXTENSIONS = []

    def get_metadata(
        self,
        digest: Optional[Digest] = None,
        ignore_errors: bool = True,
    ) -> BareAsset:
        """Return the default metadata for the file with its ``path`` filled in"""
        metadata = get_default_metadata(self.filepath, digest=digest)
        metadata.path = self.path
        return metadata
+ + def iterfiles(self, include_dirs: bool = False) -> Iterator[P]: + """Yield all files within the directory""" + dirs = deque([self.filetree]) + while dirs: + for p in dirs.popleft().iterdir(): + if p.is_dir(): + dirs.append(p) + if include_dirs: + yield p + else: + yield p + + @property + def size(self) -> int: + """The total size of the files in the directory""" + return sum(p.size for p in self.iterfiles()) + + +@dataclass +class LocalZarrEntry(BasePath): + """A file or directory within a `ZarrAsset`""" + + #: The path to the actual file or directory on disk + filepath: Path + #: The path to the root of the Zarr file tree + zarr_basepath: Path + + def _get_subpath(self, name: str) -> LocalZarrEntry: + if not name or "/" in name: + raise ValueError(f"Invalid path component: {name!r}") + elif name == ".": + return self + elif name == "..": + return self.parent + else: + return replace( + self, filepath=self.filepath / name, parts=self.parts + (name,) + ) + + @property + def parent(self) -> LocalZarrEntry: + if self.is_root(): + return self + else: + return replace(self, filepath=self.filepath.parent, parts=self.parts[:-1]) + + def exists(self) -> bool: + return self.filepath.exists() + + def is_file(self) -> bool: + return self.filepath.is_file() + + def is_dir(self) -> bool: + return self.filepath.is_dir() + + def iterdir(self) -> Iterator[LocalZarrEntry]: + for p in self.filepath.iterdir(): + if p.is_dir() and not any(p.iterdir()): + # Ignore empty directories + continue + yield self._get_subpath(p.name) + + def get_digest(self) -> Digest: + """ + Calculate the DANDI etag digest for the entry. If the entry is a + directory, the algorithm will be the Dandi Zarr checksum algorithm; if + it is a file, it will be MD5. 
+ """ + if self.is_dir(): + return Digest.dandi_zarr( + get_zarr_checksum(self.filepath, basepath=self.zarr_basepath) + ) + else: + return Digest( + algorithm=DigestType.md5, value=get_digest(self.filepath, "md5") + ) + + @property + def size(self) -> int: + """ + The size of the entry. For a directory, this is the total size of all + entries within it. + """ + if self.is_dir(): + return sum(p.size for p in self.iterdir()) + else: + return os.path.getsize(self.filepath) + + @property + def modified(self) -> datetime: + """The time at which the entry was last modified""" + # TODO: Should this be overridden for directories? + return ensure_datetime(self.filepath.stat().st_mtime) + + +@dataclass +class ZarrStat: + """Details about a Zarr asset""" + + #: The total size of the asset + size: int + #: The Dandi Zarr checksum of the asset + digest: Digest + #: A list of all files in the asset in unspecified order + files: List[LocalZarrEntry] + + +class ZarrAsset(LocalDirectoryAsset[LocalZarrEntry]): + """Representation of a local Zarr directory""" + + EXTENSIONS = [".ngff", ".zarr"] + + @property + def filetree(self) -> LocalZarrEntry: + """ + The `LocalZarrEntry` for the root of the hierarchy of files within the + Zarr asset + """ + return LocalZarrEntry( + filepath=self.filepath, zarr_basepath=self.filepath, parts=() + ) + + def stat(self) -> ZarrStat: + """Return various details about the Zarr asset""" + + def dirstat(dirpath: LocalZarrEntry) -> ZarrStat: + size = 0 + dir_md5s = {} + file_md5s = {} + files = [] + for p in dirpath.iterdir(): + if p.is_dir(): + st = dirstat(p) + size += st.size + dir_md5s[str(p)] = st.digest.value + files.extend(st.files) + else: + size += p.size + file_md5s[str(p)] = p.get_digest().value + files.append(p) + return ZarrStat( + size=size, + digest=Digest.dandi_zarr(get_checksum(file_md5s, dir_md5s)), + files=files, + ) + + return dirstat(self.filetree) + + def get_digest(self) -> Digest: + """Calculate a dandi-zarr-checksum digest for the 
asset""" + return Digest.dandi_zarr(get_zarr_checksum(self.filepath)) + + def get_metadata( + self, + digest: Optional[Digest] = None, + ignore_errors: bool = True, + ) -> BareAsset: + metadata = get_default_metadata(self.filepath, digest=digest) + metadata.encodingFormat = ZARR_MIME_TYPE + metadata.path = self.path + return metadata + + def get_validation_errors( + self, + schema_version: Optional[str] = None, + devel_debug: bool = False, + ) -> List[str]: + try: + data = zarr.open(self.filepath) + except Exception as e: + if devel_debug: + raise + lgr.warning( + "Error opening %s: %s: %s", + self.filepath, + type(e).__name__, + e, + extra={"validating": True}, + ) + return [str(e)] + if isinstance(data, zarr.Group) and not data: + msg = "Zarr group is empty" + if devel_debug: + raise ValueError(msg) + lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True}) + return [msg] + try: + next(self.filepath.glob(f"*{os.sep}" + os.sep.join(["*"] * MAX_ZARR_DEPTH))) + except StopIteration: + pass + else: + msg = f"Zarr directory tree more than {MAX_ZARR_DEPTH} directories deep" + if devel_debug: + raise ValueError(msg) + lgr.warning("%s: %s", self.filepath, msg, extra={"validating": True}) + return [msg] + # TODO: Should this be appended to the above errors? + return super().get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ) + + def iter_upload( + self, + dandiset: RemoteDandiset, + metadata: Dict[str, Any], + jobs: Optional[int] = None, + replacing: Optional[RemoteAsset] = None, + ) -> Iterator[dict]: + """ + Upload the Zarr directory as an asset with the given metadata to the + given Dandiset, returning a generator of status `dict`\\s. + + :param RemoteDandiset dandiset: + the Dandiset to which the Zarr will be uploaded + :param dict metadata: + Metadata for the uploaded asset. The "path" field will be set to + the value of the instance's ``path`` attribute if no such field is + already present. 
+ :param int jobs: Number of threads to use for uploading; defaults to 5 + :param RemoteAsset replacing: + If set, replace the given asset, which must have the same path as + the new asset + :returns: + A generator of `dict`\\s containing at least a ``"status"`` key. + Upon successful upload, the last `dict` will have a status of + ``"done"`` and an ``"asset"`` key containing the resulting + `RemoteAsset`. + """ + # So that older clients don't get away with doing the wrong thing once + # Zarr upload to embargoed Dandisets is implemented in the API: + if dandiset.embargo_status is EmbargoStatus.EMBARGOED: + raise NotImplementedError( + "Uploading Zarr assets to embargoed Dandisets is currently not implemented" + ) + asset_path = metadata.setdefault("path", self.path) + client = dandiset.client + yield {"status": "calculating etag"} + stat = self.stat() + filetag = stat.digest.value + lgr.debug("Calculated dandi-zarr-checksum of %s for %s", filetag, self.filepath) + digest = metadata.get("digest", {}) + if "dandi:dandi-zarr-checksum" in digest: + if digest["dandi:dandi-zarr-checksum"] != filetag: + raise RuntimeError( + f"{self.filepath}: Zarr etag changed; was originally" + f" {digest['dandi:dandi-zarr-checksum']} but is now {filetag}" + ) + yield {"status": "initiating upload"} + lgr.debug("%s: Beginning upload", asset_path) + bytes_uploaded = 0 + r = client.post("/zarr/", json={"name": self.filepath.name}) + zarr_id = r["zarr_id"] + with RESTFullAPIClient( + "http://nil.nil", + headers={"X-Amz-ACL": "bucket-owner-full-control"}, + ) as storage: + for i, filebatch in enumerate( + chunked(stat.files, ZARR_UPLOAD_BATCH_SIZE), start=1 + ): + upload_body = [ + {"path": str(p), "etag": p.get_digest().value} for p in filebatch + ] + lgr.debug( + "%s: Uploading Zarr file batch #%d (%s)", + asset_path, + i, + pluralize(len(filebatch), "file"), + ) + r = client.post(f"/zarr/{zarr_id}/upload/", json=upload_body) + with ThreadPoolExecutor(max_workers=jobs or 5) as executor: + 
def find_dandi_files(
    *paths: Union[str, Path],
    dandiset_path: Optional[Union[str, Path]] = None,
    allow_all: bool = False,
    include_metadata: bool = False,
) -> Iterator["DandiFile"]:
    """
    Yield all DANDI files at or under the paths in ``paths`` (which may be
    either files or directories).  Files & directories whose names start with a
    period are ignored.  Directories are only included in the return value if
    they are of a type represented by a `LocalDirectoryAsset` subclass, in
    which case they are not recursed into.

    This is a generator, so nothing (including the validation of ``paths``
    against ``dandiset_path``) happens until iteration begins.

    :param paths: the files and/or directories to search
    :param dandiset_path:
        The path to the root of the Dandiset in which the paths are located.
        All paths in ``paths`` must be equal to or subpaths of
        ``dandiset_path``.  If `None`, then the Dandiset path for each asset
        found is implicitly set to the parent directory.
    :param allow_all:
        If true, unrecognized assets and the Dandiset's :file:`dandiset.yaml`
        file are returned as `GenericAsset` and `DandisetMetadataFile`
        instances, respectively.  If false, they are not returned at all.
    :param include_metadata:
        If true, the Dandiset's :file:`dandiset.yaml` file is returned as a
        `DandisetMetadataFile` instance.  If false, it is not returned at all
        (unless ``allow_all`` is true).
    :raises ValueError:
        if ``dandiset_path`` is given and one of ``paths`` is not located
        inside it
    """

    path_queue = deque()
    for p in paths:
        p = Path(p)
        if dandiset_path is not None:
            try:
                p.relative_to(dandiset_path)
            except ValueError:
                # This has to be an f-string; otherwise the user is shown the
                # literal placeholders instead of the offending paths.
                raise ValueError(
                    f"Path {str(p)!r} is not inside Dandiset path"
                    f" {str(dandiset_path)!r}"
                )
        path_queue.append(p)
    # Breadth-first traversal: directories that are not assets themselves have
    # their children appended to the queue.
    while path_queue:
        p = path_queue.popleft()
        if p.name.startswith("."):
            continue
        if p.is_dir():
            if p.is_symlink():
                lgr.warning("%s: Ignoring unsupported symbolic link to directory", p)
            elif dandiset_path is not None and p == Path(dandiset_path):
                # The Dandiset root can never be an asset; always descend.
                path_queue.extend(p.iterdir())
            elif any(p.iterdir()):
                # Non-empty directory: either it is a directory asset or we
                # need to look inside it.  Empty directories are skipped.
                try:
                    df = dandi_file(p, dandiset_path)
                except UnknownAssetError:
                    path_queue.extend(p.iterdir())
                else:
                    yield df
        else:
            df = dandi_file(p, dandiset_path)
            if isinstance(df, GenericAsset) and not allow_all:
                pass
            elif isinstance(df, DandisetMetadataFile) and not (
                allow_all or include_metadata
            ):
                pass
            else:
                yield df
+ + If ``filepath`` is a directory, it must be of a type represented by a + `LocalDirectoryAsset` subclass; otherwise, an `UnknownAssetError` exception + will be raised. + + A regular file named :file:`dandiset.yaml` will only be represented by a + `DandisetMetadataFile` instance if it is at the root of the Dandiset. + + A regular file that is not of a known type will be represented by a + `GenericAsset` instance. + """ + filepath = Path(filepath) + if dandiset_path is not None: + path = filepath.relative_to(dandiset_path).as_posix() + if path == ".": + raise ValueError("Dandi file path cannot equal Dandiset path") + else: + path = filepath.name + if filepath.is_dir(): + if not any(filepath.iterdir()): + raise UnknownAssetError("Empty directories cannot be assets") + for dirclass in LocalDirectoryAsset.__subclasses__(): + if filepath.suffix in dirclass.EXTENSIONS: + return dirclass(filepath=filepath, path=path) + raise UnknownAssetError( + f"Directory has unrecognized suffix {filepath.suffix!r}" + ) + elif path == dandiset_metadata_file: + return DandisetMetadataFile(filepath=filepath) + else: + for fileclass in LocalFileAsset.__subclasses__(): + if filepath.suffix in fileclass.EXTENSIONS: + return fileclass(filepath=filepath, path=path) + return GenericAsset(filepath=filepath, path=path) + + +def _upload_blob_part( + storage_session: RESTFullAPIClient, + fp: BinaryIO, + lock: Lock, + etagger: DandiETag, + asset_path: str, + part: dict, +) -> dict: + etag_part = etagger.get_part(part["part_number"]) + if part["size"] != etag_part.size: + raise RuntimeError( + f"Server and client disagree on size of upload part" + f" {part['part_number']}; server says {part['size']}," + f" client says {etag_part.size}" + ) + with lock: + fp.seek(etag_part.offset) + chunk = fp.read(part["size"]) + if len(chunk) != part["size"]: + raise RuntimeError( + f"End of file {fp.name} reached unexpectedly early:" + f" read {len(chunk)} bytes of out of an expected {part['size']}" + ) + 
lgr.debug( + "%s: Uploading part %d/%d (%d bytes)", + asset_path, + part["part_number"], + etagger.part_qty, + part["size"], + ) + r = storage_session.put( + part["upload_url"], + data=chunk, + json_resp=False, + retry_statuses=[500], + ) + server_etag = r.headers["ETag"].strip('"') + lgr.debug( + "%s: Part upload finished ETag=%s Content-Length=%s", + asset_path, + server_etag, + r.headers.get("Content-Length"), + ) + client_etag = etagger.get_part_etag(etag_part) + if server_etag != client_etag: + raise RuntimeError( + f"Server and client disagree on ETag of upload part" + f" {part['part_number']}; server says" + f" {server_etag}, client says {client_etag}" + ) + return { + "part_number": part["part_number"], + "size": part["size"], + "etag": server_etag, + } + + +def _upload_zarr_file( + storage_session: RESTFullAPIClient, path: Path, upload_url: str +) -> int: + with path.open("rb") as fp: + storage_session.put(upload_url, data=fp, json_resp=False) + return path.stat().st_size + + +def _check_required_fields(d: dict, required: List[str]) -> List[str]: + errors: List[str] = [] + for f in required: + v = d.get(f, None) + if not v or (isinstance(v, str) and not v.strip()): + errors += [f"Required field {f!r} has no value"] + if v in ("REQUIRED", "PLACEHOLDER"): + errors += [f"Required field {f!r} has value {v!r}"] + return errors diff --git a/dandi/metadata.py b/dandi/metadata.py index dfae376b8..b7bc72bc6 100644 --- a/dandi/metadata.py +++ b/dandi/metadata.py @@ -3,7 +3,7 @@ import os import os.path as op import re -import typing as ty +from typing import Optional, Tuple from uuid import uuid4 from xml.dom.minidom import parseString @@ -13,6 +13,7 @@ from . 
import __version__, get_logger from .dandiset import Dandiset +from .misctypes import Digest from .pynwb_utils import ( _get_pynwb_metadata, get_neurodata_types, @@ -388,7 +389,7 @@ def extract_sex(metadata): stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential(exp_base=1.25, multiplier=1.25), ) -def parse_purlobourl(url: str, lookup: ty.Optional[ty.Tuple[str, ...]] = None): +def parse_purlobourl(url: str, lookup: Optional[Tuple[str, ...]] = None): """Parse an Ontobee URL to return properties of a Class node :param url: Ontobee URL @@ -786,12 +787,12 @@ def process_ndtypes(asset, nd_types): def get_asset_metadata( - filepath, relpath, digest=None, digest_type=None, allow_any_path=True + filepath, relpath, digest: Optional[Digest] = None, allow_any_path=True ) -> models.BareAsset: metadata = None if op.splitext(filepath)[1] == ".nwb": try: - metadata = nwb2asset(filepath, digest=digest, digest_type=digest_type) + metadata = nwb2asset(filepath, digest=digest) except Exception as e: lgr.warning( "Failed to extract NWB metadata from %s: %s: %s", @@ -802,15 +803,13 @@ def get_asset_metadata( if not allow_any_path: raise if metadata is None: - metadata = get_default_metadata( - filepath, digest=digest, digest_type=digest_type - ) + metadata = get_default_metadata(filepath, digest=digest) metadata.path = str(relpath) return metadata def nwb2asset( - nwb_path, digest=None, digest_type=None, schema_version=None + nwb_path, digest: Optional[Digest] = None, schema_version=None ) -> models.BareAsset: if schema_version is not None: current_version = models.get_schema_version() @@ -821,8 +820,8 @@ def nwb2asset( start_time = datetime.now().astimezone() metadata = get_metadata(nwb_path) if digest is not None: - metadata["digest"] = digest - metadata["digest_type"] = digest_type + metadata["digest"] = digest.value + metadata["digest_type"] = digest.algorithm.name metadata["contentSize"] = op.getsize(nwb_path) metadata["encodingFormat"] = "application/x-nwb" 
@dataclass
class Digest:
    """A digest value paired with the algorithm that produced it"""

    #: The algorithm with which `value` was computed
    algorithm: DigestType

    #: The digest string
    value: str

    @classmethod
    def dandi_etag(cls, value: str) -> Digest:
        """
        Alternate constructor: wrap ``value`` in a `Digest` whose
        ``algorithm`` is ``DigestType.dandi_etag``
        """
        return cls(DigestType.dandi_etag, value)

    @classmethod
    def dandi_zarr(cls, value: str) -> Digest:
        """
        Alternate constructor: wrap ``value`` in a `Digest` whose
        ``algorithm`` is ``DigestType.dandi_zarr_checksum``
        """
        return cls(DigestType.dandi_zarr_checksum, value)

    def asdict(self) -> Dict[DigestType, str]:
        """
        Return a one-entry `dict` whose key is the digest algorithm and whose
        value is the digest string
        """
        mapping = {self.algorithm: self.value}
        return mapping
P = TypeVar("P", bound="BasePath")


@dataclass
class BasePath(ABC):
    """
    Abstract base class for path-like objects that are navigated with the
    ``/`` operator in the manner of `pathlib.Path`.  Unlike `pathlib.Path`,
    only strings may appear on the right-hand side of ``/``.  Every path is a
    forward-slash-separated relative path beneath a "root" path whose name is
    the empty string.
    """

    #: The path components of the object
    parts: Tuple[str, ...]

    def __str__(self) -> str:
        return "/".join(self.parts)

    @property
    def name(self) -> str:
        """
        The final component of the path.  For the root of a path hierarchy,
        this is the empty string.
        """
        if self.is_root():
            return ""
        assert self.parts
        return self.parts[-1]

    @abstractmethod
    def _get_subpath(self: P, name: str) -> P:
        """
        Return the direct child of this path called ``name``.  Implementations
        should return ``self`` for ``"."`` and ``self.parent`` for ``".."``,
        and should raise `ValueError` for an empty name or one containing a
        forward slash.
        """
        ...

    def __truediv__(self: P, path: str) -> P:
        current = self
        for component in self._split_path(path):
            current = current._get_subpath(component)
        return current

    def joinpath(self: P, *paths: str) -> P:
        """
        Apply the ``/`` operator successively with each name or relative path
        in ``paths`` and return the result
        """
        result = self
        for piece in paths:
            result /= piece
        return result

    @staticmethod
    def _split_path(path: str) -> Tuple[str, ...]:
        """Break a relative path into its non-empty components"""
        if path.startswith("/"):
            raise ValueError(f"Absolute paths not allowed: {path!r}")
        return tuple(filter(None, path.split("/")))

    def is_root(self) -> bool:
        """Tell whether this path is the root of its hierarchy"""
        return self.parts == ()

    @property
    def root_path(self: P) -> P:
        """The root of the hierarchy to which this path belongs"""
        node = self
        while not node.is_root():
            node = node.parent
        return node

    @property
    @abstractmethod
    def parent(self: P) -> P:
        """
        The path one level up from this one.  The root of a path hierarchy is
        its own parent.
        """
        ...

    @property
    def parents(self: P) -> Tuple[P, ...]:
        """
        A tuple of the path's ancestors, starting at the parent and going up to
        (and including) the root of the hierarchy.  The root itself has no
        ancestors.
        """
        ancestors: List[P] = []
        node = self
        while not node.is_root():
            node = node.parent
            ancestors.append(node)
        return tuple(ancestors)

    def with_name(self: P, name: str) -> P:
        """Shorthand for ``p.parent / name``"""
        return self.parent / name

    @property
    def suffix(self) -> str:
        """The last file extension of the basename, or ``""`` if none"""
        basename = self.name
        dot = basename.rfind(".")
        # A dot in first or last position does not introduce an extension
        return basename[dot:] if 0 < dot < len(basename) - 1 else ""

    @property
    def suffixes(self) -> List[str]:
        """All of the basename's file extensions, in order"""
        basename = self.name
        if basename.endswith("."):
            return []
        # Leading dots belong to the stem, not to an extension
        _, *extensions = basename.lstrip(".").split(".")
        return ["." + ext for ext in extensions]

    @property
    def stem(self) -> str:
        """The basename with its last file extension (if any) removed"""
        basename = self.name
        dot = basename.rfind(".")
        return basename[:dot] if 0 < dot < len(basename) - 1 else basename

    def with_stem(self: P, stem: str) -> P:
        """Return a sibling path whose stem is ``stem``"""
        return self.with_name(stem + self.suffix)

    def with_suffix(self: P, suffix: str) -> P:
        """
        Return a sibling path whose last file extension is replaced by
        ``suffix`` (or has ``suffix`` appended if there was no extension)
        """
        malformed = (
            "/" in suffix
            or suffix == "."
            or (suffix and not suffix.startswith("."))
        )
        if malformed:
            raise ValueError(f"Invalid suffix: {suffix!r}")
        basename = self.name
        if not basename:
            raise ValueError("Path has an empty name")
        old_suffix = self.suffix
        base = basename[: -len(old_suffix)] if old_suffix else basename
        return self.with_name(base + suffix)

    def match(self, pattern: str) -> bool:
        """
        Test whether the trailing components of the path match the given glob
        pattern, component by component
        """
        patparts = self._split_path(pattern)
        if not patparts:
            raise ValueError("Empty pattern")
        if len(patparts) > len(self.parts):
            return False
        # Compare from the rightmost component leftwards
        return all(
            fnmatchcase(part, pat)
            for part, pat in zip(reversed(self.parts), reversed(patparts))
        )

    @abstractmethod
    def exists(self) -> bool:
        """True iff the resource at the given path exists"""
        ...

    @abstractmethod
    def is_file(self) -> bool:
        """True if the resource at the given path exists and is a file"""
        ...

    @abstractmethod
    def is_dir(self) -> bool:
        """True if the resource at the given path exists and is a directory"""
        ...

    @abstractmethod
    def iterdir(self: P) -> Iterator[P]:
        """
        Return a generator of the paths directly beneath this one, which must
        be a directory
        """
        ...

    @property
    @abstractmethod
    def size(self) -> int:
        """The size of the resource at the path"""
        ...
def get_zarr_checksum(
    dirpath: Path,
    basepath: Optional[Path] = None,
    known: Optional[Dict[str, str]] = None,
) -> str:
    """
    Compute the Zarr checksum of the directory ``dirpath``.

    The checksum of a directory is derived, via `get_checksum()`, from the MD5
    digests of the files directly inside it and the (recursively computed)
    checksums of its non-empty subdirectories.  Empty subdirectories do not
    contribute.

    :param dirpath: the directory to compute the checksum of
    :param basepath:
        The directory relative to which entry paths are expressed when they
        are looked up in ``known`` and passed to `get_checksum()`; defaults to
        ``dirpath``.  Recursive calls keep the top-level value.
    :param known:
        Optional mapping from forward-slash-separated paths (relative to
        ``basepath``) to already-known digests.  Any file or directory listed
        here, at any depth, is not recomputed.
    :returns: the checksum as returned by `get_checksum()`
    """
    if basepath is None:
        basepath = dirpath
    if known is None:
        known = {}
    dirs: Dict[str, str] = {}
    files: Dict[str, str] = {}
    for p in dirpath.iterdir():
        path = p.relative_to(basepath).as_posix()
        if not p.is_dir():
            try:
                files[path] = known[path]
            except KeyError:
                files[path] = get_digest(p, "md5")
        elif any(p.iterdir()):
            try:
                dirs[path] = known[path]
            except KeyError:
                # ``known`` is keyed relative to ``basepath``, so it stays
                # valid further down the tree and must be passed along;
                # otherwise known digests of nested entries would be ignored
                # and needlessly recomputed.
                dirs[path] = get_zarr_checksum(p, basepath, known)
    return get_checksum(files, dirs)
import get_logger from ..cli.command import organize -from ..consts import dandiset_metadata_file, known_instances -from ..dandiapi import DandiAPIClient +from ..consts import DandiInstance, dandiset_metadata_file, known_instances +from ..dandiapi import DandiAPIClient, RemoteDandiset from ..pynwb_utils import make_nwb_file, metadata_nwb_file_fields from ..upload import upload @@ -241,23 +245,59 @@ def docker_compose_setup(): run(["docker-compose", "down", "-v"], cwd=str(LOCAL_DOCKER_DIR), check=True) +@dataclass +class DandiAPI: + api_key: str + client: DandiAPIClient + instance: DandiInstance + instance_id: str + + @property + def api_url(self) -> str: + return self.instance.api + + @pytest.fixture(scope="session") def local_dandi_api(docker_compose_setup): instance_id = "dandi-api-local-docker-tests" instance = known_instances[instance_id] api_key = docker_compose_setup["django_api_key"] with DandiAPIClient(api_url=instance.api, token=api_key) as client: - yield { - "api_key": api_key, - "client": client, - "instance": instance, - "instance_id": instance_id, - } + yield DandiAPI( + api_key=api_key, + client=client, + instance=instance, + instance_id=instance_id, + ) + + +@dataclass +class SampleDandiset: + api: DandiAPI + dspath: Path + dandiset: RemoteDandiset + dandiset_id: str + upload_kwargs: Dict[str, Any] = field(default_factory=dict) + + @property + def client(self) -> DandiAPIClient: + return self.api.client + + def upload(self, paths=None, **kwargs) -> None: + with pytest.MonkeyPatch().context() as m: + m.setenv("DANDI_API_KEY", self.api.api_key) + upload( + paths=paths or [], + dandiset_path=self.dspath, + dandi_instance=self.api.instance_id, + devel_debug=True, + **{**self.upload_kwargs, **kwargs}, + ) @pytest.fixture() def text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): - d = local_dandi_api["client"].create_dandiset( + d = local_dandi_api.client.create_dandiset( "Text Dandiset", { "schemaKey": "Dandiset", @@ -283,28 +323,48 @@ def 
text_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): (dspath / "subdir2").mkdir() (dspath / "subdir2" / "banana.txt").write_text("Banana\n") (dspath / "subdir2" / "coconut.txt").write_text("Coconut\n") + td = SampleDandiset( + api=local_dandi_api, + dspath=dspath, + dandiset=d, + dandiset_id=dandiset_id, + upload_kwargs={"allow_any_path": True}, + ) + td.upload() + return td - def upload_dandiset(paths=None, **kwargs): - with monkeypatch.context() as m: - m.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - upload( - paths=paths or [], - dandiset_path=dspath, - dandi_instance=local_dandi_api["instance_id"], - devel_debug=True, - allow_any_path=True, - validation="skip", - **kwargs, - ) - upload_dandiset() - return { - "client": local_dandi_api["client"], - "dspath": dspath, - "dandiset": d, - "dandiset_id": dandiset_id, - "reupload": upload_dandiset, - } +@pytest.fixture() +def zarr_dandiset(local_dandi_api, monkeypatch, tmp_path_factory): + d = local_dandi_api.client.create_dandiset( + "Zarr Dandiset", + { + "schemaKey": "Dandiset", + "name": "Zarr Dandiset", + "description": "A test Zarr Dandiset", + "contributor": [ + { + "schemaKey": "Person", + "name": "Wodder, John", + "roleName": ["dcite:Author", "dcite:ContactPerson"], + } + ], + "license": ["spdx:CC0-1.0"], + "manifestLocation": ["https://github.com/dandi/dandi-cli"], + }, + ) + dandiset_id = d.identifier + dspath = tmp_path_factory.mktemp("zarr_dandiset") + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + td = SampleDandiset( + api=local_dandi_api, + dspath=dspath, + dandiset=d, + dandiset_id=dandiset_id, + ) + td.upload() + return td @pytest.fixture() diff --git a/dandi/tests/test_dandiapi.py b/dandi/tests/test_dandiapi.py index dc3cddd94..1f66a4d5d 100644 --- a/dandi/tests/test_dandiapi.py +++ b/dandi/tests/test_dandiapi.py @@ -28,7 +28,7 @@ def test_upload(local_dandi_api, 
simple1_nwb, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset(name="Upload Test", metadata={}) assert d.version_id == DRAFT d.upload_raw_asset(simple1_nwb, {"path": "testing/simple1.nwb"}) @@ -41,7 +41,7 @@ def test_upload(local_dandi_api, simple1_nwb, tmp_path): def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset( "Test Dandiset", { @@ -67,13 +67,12 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir").mkdir() (upload_dir / "subdir" / "file.txt").write_text("This is test text.\n") monkeypatch.chdir(upload_dir) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) d.wait_until_valid() @@ -96,10 +95,9 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir" / "file.txt").write_text("This is different text.\n") upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) rmtree(download_dir / dandiset_id) download(dv.version_api_url, download_dir) @@ -109,10 +107,9 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): (upload_dir / "subdir" / "file2.txt").write_text("This is more text.\n") upload( paths=[], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) rmtree(download_dir / dandiset_id) @@ -147,7 +144,7 @@ def test_publish_and_manipulate(local_dandi_api, monkeypatch, tmp_path): def 
test_get_asset_metadata(local_dandi_api, simple1_nwb): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset(name="Include Metadata Test", metadata={}) d.upload_raw_asset(simple1_nwb, {"path": "testing/simple1.nwb", "foo": "bar"}) (asset,) = d.get_assets() @@ -158,7 +155,7 @@ def test_get_asset_metadata(local_dandi_api, simple1_nwb): def test_large_upload(local_dandi_api, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client asset_file = tmp_path / "asset.dat" meg = bytes(random.choices(range(256), k=1 << 20)) with asset_file.open("wb") as fp: @@ -169,9 +166,9 @@ def test_large_upload(local_dandi_api, tmp_path): def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatch): - good_key = local_dandi_api["api_key"] + good_key = local_dandi_api.api_key bad_key = "1234567890" - client_name = local_dandi_api["instance_id"] + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -186,7 +183,7 @@ def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatc monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -201,8 +198,8 @@ def test_authenticate_bad_key_good_key_input(local_dandi_api, mocker, monkeypatc def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): - good_key = local_dandi_api["api_key"] - client_name = local_dandi_api["instance_id"] + good_key = local_dandi_api.api_key + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -215,7 +212,7 @@ def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): 
monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -230,9 +227,9 @@ def test_authenticate_good_key_keyring(local_dandi_api, mocker, monkeypatch): def test_authenticate_bad_key_keyring_good_key_input( local_dandi_api, mocker, monkeypatch ): - good_key = local_dandi_api["api_key"] + good_key = local_dandi_api.api_key bad_key = "1234567890" - client_name = local_dandi_api["instance_id"] + client_name = local_dandi_api.instance_id app_id = f"dandi-api-{client_name}" backend_mock = mocker.Mock(spec=["set_password"]) @@ -247,7 +244,7 @@ def test_authenticate_bad_key_keyring_good_key_input( monkeypatch.delenv("DANDI_API_KEY", raising=False) - client = DandiAPIClient(local_dandi_api["instance"].api) + client = DandiAPIClient(local_dandi_api.api_url) assert "Authorization" not in client.session.headers client.dandi_authenticate() assert client.session.headers["Authorization"] == f"token {good_key}" @@ -305,13 +302,14 @@ def test_get_content_url_follow_one_redirects_strip_query(): def test_remote_asset_json_dict(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") assert asset.json_dict() == { "asset_id": anys.ANY_STR, "modified": anys.ANY_AWARE_DATETIME_STR, "created": anys.ANY_AWARE_DATETIME_STR, "path": anys.ANY_STR, "size": anys.ANY_INT, + "blob": anys.ANY_STR, } @@ -342,16 +340,14 @@ def test_check_schema_version_mismatch(): def test_get_dandisets(text_dandiset): - dandisets = list(text_dandiset["client"].get_dandisets()) - assert ( - sum(1 for d in dandisets if d.identifier == text_dandiset["dandiset_id"]) == 1 - ) + dandisets = list(text_dandiset.client.get_dandisets()) + assert sum(1 for d in dandisets if d.identifier 
== text_dandiset.dandiset_id) == 1 def test_get_dandiset_lazy(mocker, text_dandiset): - client = text_dandiset["client"] + client = text_dandiset.client get_spy = mocker.spy(client, "get") - dandiset = client.get_dandiset(text_dandiset["dandiset_id"], DRAFT, lazy=True) + dandiset = client.get_dandiset(text_dandiset.dandiset_id, DRAFT, lazy=True) get_spy.assert_not_called() assert dandiset.version_id == DRAFT get_spy.assert_not_called() @@ -369,9 +365,9 @@ def test_get_dandiset_lazy(mocker, text_dandiset): def test_get_dandiset_non_lazy(mocker, text_dandiset): - client = text_dandiset["client"] + client = text_dandiset.client get_spy = mocker.spy(client, "get") - dandiset = client.get_dandiset(text_dandiset["dandiset_id"], DRAFT, lazy=False) + dandiset = client.get_dandiset(text_dandiset.dandiset_id, DRAFT, lazy=False) get_spy.assert_called_once() get_spy.reset_mock() assert dandiset.version_id == DRAFT @@ -390,9 +386,7 @@ def test_get_dandiset_non_lazy(mocker, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_no_version_id(lazy, text_dandiset): - dandiset = text_dandiset["client"].get_dandiset( - text_dandiset["dandiset_id"], lazy=lazy - ) + dandiset = text_dandiset.client.get_dandiset(text_dandiset.dandiset_id, lazy=lazy) assert dandiset.version_id == DRAFT assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -409,10 +403,10 @@ def test_get_dandiset_no_version_id(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, v, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, v, lazy=lazy) assert dandiset.version_id == v assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -430,10 
+424,10 @@ def test_get_dandiset_published(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_no_version_id(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, lazy=lazy) assert dandiset.version_id == v assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -451,10 +445,10 @@ def test_get_dandiset_published_no_version_id(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_draft(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v = d.publish().version.identifier - dandiset = text_dandiset["client"].get_dandiset(d.identifier, DRAFT, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, DRAFT, lazy=lazy) assert dandiset.version_id == DRAFT assert isinstance(dandiset.created, datetime) assert isinstance(dandiset.created, datetime) @@ -472,17 +466,17 @@ def test_get_dandiset_published_draft(lazy, text_dandiset): @pytest.mark.parametrize("lazy", [True, False]) def test_get_dandiset_published_other_version(lazy, text_dandiset): - d = text_dandiset["dandiset"] + d = text_dandiset.dandiset d.wait_until_valid() v1 = d.publish().version.identifier - (text_dandiset["dspath"] / "file2.txt").write_text("This is more text.\n") - text_dandiset["reupload"]() + (text_dandiset.dspath / "file2.txt").write_text("This is more text.\n") + text_dandiset.upload() d.wait_until_valid() v2 = d.publish().version.identifier assert v1 != v2 - dandiset = text_dandiset["client"].get_dandiset(d.identifier, v1, lazy=lazy) + dandiset = text_dandiset.client.get_dandiset(d.identifier, v1, lazy=lazy) assert dandiset.version_id == v1 assert isinstance(dandiset.created, 
datetime) assert isinstance(dandiset.created, datetime) @@ -500,7 +494,7 @@ def test_get_dandiset_published_other_version(lazy, text_dandiset): def test_set_asset_metadata(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") md = asset.get_metadata() md.blobDateModified = datetime(2038, 1, 19, 3, 14, 7, tzinfo=timezone.utc) asset.set_metadata(md) @@ -508,7 +502,7 @@ def test_set_asset_metadata(text_dandiset): def test_remote_dandiset_json_dict(text_dandiset): - data = text_dandiset["dandiset"].json_dict() + data = text_dandiset.dandiset.json_dict() assert data == { "identifier": anys.AnyFullmatch(dandiset_identifier_regex), "created": anys.ANY_AWARE_DATETIME_STR, @@ -530,7 +524,7 @@ def test_remote_dandiset_json_dict(text_dandiset): def test_set_dandiset_metadata(text_dandiset): - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset md = dandiset.get_metadata() md.description = "A test Dandiset with altered metadata" dandiset.set_metadata(md) @@ -545,22 +539,23 @@ def test_set_dandiset_metadata(text_dandiset): [ (DigestType.dandi_etag, r"[0-9a-f]{32}-\d{1,5}"), ("dandi:dandi-etag", r"[0-9a-f]{32}-\d{1,5}"), + (None, r"[0-9a-f]{32}-\d{1,5}"), ], ) -def test_get_digest(digest_type, digest_regex, text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") - d = asset.get_digest(digest_type) +def test_get_raw_digest(digest_type, digest_regex, text_dandiset): + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") + d = asset.get_raw_digest(digest_type) assert re.fullmatch(digest_regex, d) -def test_get_digest_nonexistent(text_dandiset): - asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") +def test_get_raw_digest_nonexistent(text_dandiset): + asset = text_dandiset.dandiset.get_asset_by_path("file.txt") with pytest.raises(NotFoundError): - asset.get_digest("md5") + asset.get_raw_digest("md5") def 
test_refresh(text_dandiset): - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset mtime = dandiset.version.modified md = dandiset.get_metadata() md.description = "A test Dandiset with altered metadata" @@ -576,12 +571,12 @@ def test_refresh(text_dandiset): def test_get_asset_with_and_without_metadata(mocker, text_dandiset): - path_asset = text_dandiset["dandiset"].get_asset_by_path("file.txt") - id_asset = text_dandiset["dandiset"].get_asset(path_asset.identifier) + path_asset = text_dandiset.dandiset.get_asset_by_path("file.txt") + id_asset = text_dandiset.dandiset.get_asset(path_asset.identifier) assert path_asset == id_asset assert path_asset._metadata is None assert id_asset._metadata is not None - get_spy = mocker.spy(text_dandiset["client"], "get") + get_spy = mocker.spy(text_dandiset.client, "get") id_metadata = id_asset.get_raw_metadata() get_spy.assert_not_called() path_metadata = path_asset.get_raw_metadata() @@ -617,31 +612,31 @@ def test_retry_logging(caplog): def test_get_assets_order(text_dandiset): assert [ - asset.path for asset in text_dandiset["dandiset"].get_assets(order="path") + asset.path for asset in text_dandiset.dandiset.get_assets(order="path") ] == ["file.txt", "subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert [ - asset.path for asset in text_dandiset["dandiset"].get_assets(order="-path") + asset.path for asset in text_dandiset.dandiset.get_assets(order="-path") ] == ["subdir2/coconut.txt", "subdir2/banana.txt", "subdir1/apple.txt", "file.txt"] def test_get_assets_with_path_prefix(text_dandiset): assert sorted( asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix("subdir") + for asset in text_dandiset.dandiset.get_assets_with_path_prefix("subdir") ) == ["subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert sorted( asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix("subdir2") + for asset in 
text_dandiset.dandiset.get_assets_with_path_prefix("subdir2") ) == ["subdir2/banana.txt", "subdir2/coconut.txt"] assert [ asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix( + for asset in text_dandiset.dandiset.get_assets_with_path_prefix( "subdir", order="path" ) ] == ["subdir1/apple.txt", "subdir2/banana.txt", "subdir2/coconut.txt"] assert [ asset.path - for asset in text_dandiset["dandiset"].get_assets_with_path_prefix( + for asset in text_dandiset.dandiset.get_assets_with_path_prefix( "subdir", order="-path" ) ] == ["subdir2/coconut.txt", "subdir2/banana.txt", "subdir1/apple.txt"] diff --git a/dandi/tests/test_dandiarchive.py b/dandi/tests/test_dandiarchive.py index 46828d30b..d5194c86b 100644 --- a/dandi/tests/test_dandiarchive.py +++ b/dandi/tests/test_dandiarchive.py @@ -360,9 +360,9 @@ def test_parse_gui_new_redirect(): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset(local_dandi_api, version_suffix): - url = f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}" + url = f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client parsed_url.get_dandiset(client) # No error with pytest.raises(NotFoundError) as excinfo: parsed_url.get_dandiset(client, lazy=False) @@ -376,41 +376,38 @@ def test_get_nonexistent_dandiset(local_dandi_api, version_suffix): @pytest.mark.parametrize("version", ["draft", "0.999999.9999"]) def test_get_nonexistent_dandiset_asset_id(local_dandi_api, version): url = ( - f"{local_dandi_api['instance'].api}/dandisets/999999/versions/{version}" + f"{local_dandi_api.api_url}/dandisets/999999/versions/{version}" "/assets/00000000-0000-0000-0000-000000000000/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with 
pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_dandiset_nonexistent_asset_id(local_dandi_api, text_dandiset): +def test_get_dandiset_nonexistent_asset_id(text_dandiset): url = ( - f"{local_dandi_api['instance'].api}/dandisets/" - f"{text_dandiset['dandiset_id']}/versions/draft/assets/" + f"{text_dandiset.api.api_url}/dandisets/" + f"{text_dandiset.dandiset_id}/versions/draft/assets/" "00000000-0000-0000-0000-000000000000/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == ( "No such asset: '00000000-0000-0000-0000-000000000000' for" - f" DANDI-API-LOCAL-DOCKER-TESTS:{text_dandiset['dandiset_id']}/draft" + f" DANDI-API-LOCAL-DOCKER-TESTS:{text_dandiset.dandiset_id}/draft" ) def test_get_nonexistent_asset_id(local_dandi_api): - url = ( - f"{local_dandi_api['instance'].api}/assets/" - "00000000-0000-0000-0000-000000000000/" - ) + url = f"{local_dandi_api.api_url}/assets/00000000-0000-0000-0000-000000000000/" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) @@ -419,22 +416,22 @@ def test_get_nonexistent_asset_id(local_dandi_api): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset_asset_path(local_dandi_api, version_suffix): - url = f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}/does/not/exist" + url = f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}/does/not/exist" parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = 
local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_path(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_path(text_dandiset): url = ( - f"dandi://{local_dandi_api['instance_id']}/" - f"{text_dandiset['dandiset_id']}/does/not/exist" + f"dandi://{text_dandiset.api.instance_id}/" + f"{text_dandiset.dandiset_id}/does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) @@ -444,24 +441,24 @@ def test_get_nonexistent_asset_path(local_dandi_api, text_dandiset): @pytest.mark.parametrize("version_suffix", ["", "@draft", "@0.999999.9999"]) def test_get_nonexistent_dandiset_asset_folder(local_dandi_api, version_suffix): url = ( - f"dandi://{local_dandi_api['instance_id']}/999999{version_suffix}" + f"dandi://{local_dandi_api.instance_id}/999999{version_suffix}" "/does/not/exist/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_folder(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_folder(text_dandiset): url = ( - f"dandi://{local_dandi_api['instance_id']}/" - f"{text_dandiset['dandiset_id']}/does/not/exist/" + f"dandi://{text_dandiset.api.instance_id}/" + f"{text_dandiset.dandiset_id}/does/not/exist/" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] 
assert list(parsed_url.get_assets(client, strict=True)) == [] @@ -469,23 +466,23 @@ def test_get_nonexistent_asset_folder(local_dandi_api, text_dandiset): @pytest.mark.parametrize("version", ["draft", "0.999999.9999"]) def test_get_nonexistent_dandiset_asset_prefix(local_dandi_api, version): url = ( - f"{local_dandi_api['instance'].api}/dandisets/999999/versions/{version}" + f"{local_dandi_api.api_url}/dandisets/999999/versions/{version}" "/assets/?path=does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = local_dandi_api.client assert list(parsed_url.get_assets(client)) == [] with pytest.raises(NotFoundError) as excinfo: next(parsed_url.get_assets(client, strict=True)) assert str(excinfo.value) == "No such Dandiset: '999999'" -def test_get_nonexistent_asset_prefix(local_dandi_api, text_dandiset): +def test_get_nonexistent_asset_prefix(text_dandiset): url = ( - f"{local_dandi_api['instance'].api}/dandisets/" - f"{text_dandiset['dandiset_id']}/versions/draft/assets/?path=does/not/exist" + f"{text_dandiset.api.api_url}/dandisets/" + f"{text_dandiset.dandiset_id}/versions/draft/assets/?path=does/not/exist" ) parsed_url = parse_dandi_url(url) - client = local_dandi_api["client"] + client = text_dandiset.client assert list(parsed_url.get_assets(client)) == [] assert list(parsed_url.get_assets(client, strict=True)) == [] diff --git a/dandi/tests/test_delete.py b/dandi/tests/test_delete.py index e5fa7128b..087e56a12 100644 --- a/dandi/tests/test_delete.py +++ b/dandi/tests/test_delete.py @@ -54,13 +54,11 @@ ), ], ) -def test_delete_paths( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path, paths, remainder -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_paths(mocker, monkeypatch, text_dandiset, tmp_path, paths, remainder): + 
monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [p.format(instance=instance, dandiset_id=dandiset_id) for p in paths], @@ -69,20 +67,18 @@ def test_delete_paths( force=True, ) delete_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / f for f in ["dandiset.yaml"] + remainder ] @pytest.mark.parametrize("confirm", [True, False]) -def test_delete_path_confirm( - confirm, local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_path_confirm(confirm, mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") confirm_mock = mocker.patch("click.confirm", return_value=confirm) delete(["subdir2/coconut.txt"], dandi_instance=instance, devel_debug=True) @@ -95,10 +91,10 @@ def test_delete_path_confirm( delete_spy.assert_not_called() -def test_delete_path_pyout(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] +def test_delete_path_pyout(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id delete_spy = 
mocker.spy(RESTFullAPIClient, "delete") delete(["subdir2/coconut.txt"], dandi_instance=instance, force=True) delete_spy.assert_called() @@ -120,11 +116,11 @@ def test_delete_path_pyout(local_dandi_api, mocker, monkeypatch, text_dandiset): ], ], ) -def test_delete_dandiset(local_dandi_api, mocker, monkeypatch, text_dandiset, paths): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset(mocker, monkeypatch, text_dandiset, paths): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [p.format(instance=instance, dandiset_id=dandiset_id) for p in paths], @@ -134,17 +130,15 @@ def test_delete_dandiset(local_dandi_api, mocker, monkeypatch, text_dandiset, pa ) delete_spy.assert_called() with pytest.raises(NotFoundError): - local_dandi_api["client"].get_dandiset(dandiset_id, DRAFT, lazy=False) + text_dandiset.client.get_dandiset(dandiset_id, DRAFT, lazy=False) @pytest.mark.parametrize("confirm", [True, False]) -def test_delete_dandiset_confirm( - confirm, local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset_confirm(confirm, mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") confirm_mock = mocker.patch("click.confirm", return_value=confirm) delete( @@ -157,11 
+151,11 @@ def test_delete_dandiset_confirm( delete_spy.assert_not_called() -def test_delete_dandiset_mismatch(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_dandiset_mismatch(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id not_dandiset = str(int(dandiset_id) - 1).zfill(6) delete_spy = mocker.spy(RESTFullAPIClient, "delete") for paths in [ @@ -182,11 +176,11 @@ def test_delete_dandiset_mismatch(local_dandi_api, mocker, monkeypatch, text_dan delete_spy.assert_not_called() -def test_delete_instance_mismatch(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.chdir(text_dandiset["dspath"]) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_instance_mismatch(mocker, monkeypatch, text_dandiset): + monkeypatch.chdir(text_dandiset.dspath) + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") for paths in [ [ @@ -208,8 +202,8 @@ def test_delete_instance_mismatch(local_dandi_api, mocker, monkeypatch, text_dan def test_delete_nonexistent_dandiset(local_dandi_api, mocker, monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: 
delete( @@ -223,8 +217,8 @@ def test_delete_nonexistent_dandiset(local_dandi_api, mocker, monkeypatch): def test_delete_nonexistent_dandiset_skip_missing(local_dandi_api, mocker, monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [f"dandi://{instance}/999999/subdir1/apple.txt"], @@ -236,10 +230,10 @@ def test_delete_nonexistent_dandiset_skip_missing(local_dandi_api, mocker, monke delete_spy.assert_not_called() -def test_delete_nonexistent_asset(local_dandi_api, mocker, monkeypatch, text_dandiset): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_nonexistent_asset(mocker, monkeypatch, text_dandiset): + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: delete( @@ -259,11 +253,11 @@ def test_delete_nonexistent_asset(local_dandi_api, mocker, monkeypatch, text_dan def test_delete_nonexistent_asset_skip_missing( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path + mocker, monkeypatch, text_dandiset, tmp_path ): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [ @@ -276,7 +270,7 @@ def test_delete_nonexistent_asset_skip_missing( skip_missing=True, ) delete_spy.assert_called() - 
download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / "dandiset.yaml", tmp_path / dandiset_id / "subdir1" / "apple.txt", @@ -285,12 +279,10 @@ def test_delete_nonexistent_asset_skip_missing( ] -def test_delete_nonexistent_asset_folder( - local_dandi_api, mocker, monkeypatch, text_dandiset -): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] +def test_delete_nonexistent_asset_folder(mocker, monkeypatch, text_dandiset): + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotFoundError) as excinfo: delete( @@ -310,11 +302,11 @@ def test_delete_nonexistent_asset_folder( def test_delete_nonexistent_asset_folder_skip_missing( - local_dandi_api, mocker, monkeypatch, text_dandiset, tmp_path + mocker, monkeypatch, text_dandiset, tmp_path ): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] - dandiset_id = text_dandiset["dandiset_id"] + monkeypatch.setenv("DANDI_API_KEY", text_dandiset.api.api_key) + instance = text_dandiset.api.instance_id + dandiset_id = text_dandiset.dandiset_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") delete( [ @@ -327,7 +319,7 @@ def test_delete_nonexistent_asset_folder_skip_missing( skip_missing=True, ) delete_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert list_paths(tmp_path) == [ tmp_path / dandiset_id / "dandiset.yaml", tmp_path / dandiset_id / "file.txt", @@ -337,8 +329,8 @@ def test_delete_nonexistent_asset_folder_skip_missing( def test_delete_version(local_dandi_api, mocker, 
monkeypatch): - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - instance = local_dandi_api["instance_id"] + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + instance = local_dandi_api.instance_id delete_spy = mocker.spy(RESTFullAPIClient, "delete") with pytest.raises(NotImplementedError) as excinfo: delete( diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 7e76ebf14..c3f2bf05a 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -3,14 +3,19 @@ import os.path as op import re from shutil import rmtree +from typing import List, Tuple +import numpy as np import pytest import responses +import zarr from .skip import mark +from .test_helpers import assert_dirtrees_eq from ..consts import DRAFT, dandiset_metadata_file from ..dandiarchive import DandisetURL -from ..download import download, download_generator +from ..download import ProgressCombiner, download, download_generator +from ..upload import upload from ..utils import list_paths @@ -118,23 +123,23 @@ def test_download_000027_resume(tmp_path, resizer, version): def test_download_newest_version(text_dandiset, tmp_path): - dandiset = text_dandiset["dandiset"] - dandiset_id = text_dandiset["dandiset_id"] + dandiset = text_dandiset.dandiset + dandiset_id = text_dandiset.dandiset_id download(dandiset.api_url, tmp_path) assert (tmp_path / dandiset_id / "file.txt").read_text() == "This is test text.\n" dandiset.wait_until_valid() dandiset.publish() - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - text_dandiset["reupload"]() + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") + text_dandiset.upload() rmtree(tmp_path / dandiset_id) download(dandiset.api_url, tmp_path) assert (tmp_path / dandiset_id / "file.txt").read_text() == "This is test text.\n" -def test_download_folder(local_dandi_api, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] +def 
test_download_folder(text_dandiset, tmp_path): + dandiset_id = text_dandiset.dandiset_id download( - f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}/subdir2/", tmp_path + f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/", tmp_path ) assert list_paths(tmp_path, dirs=True) == [ tmp_path / "subdir2", @@ -145,10 +150,10 @@ def test_download_folder(local_dandi_api, text_dandiset, tmp_path): assert (tmp_path / "subdir2" / "coconut.txt").read_text() == "Coconut\n" -def test_download_item(local_dandi_api, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] +def test_download_item(text_dandiset, tmp_path): + dandiset_id = text_dandiset.dandiset_id download( - f"dandi://{local_dandi_api['instance_id']}/{dandiset_id}/subdir2/coconut.txt", + f"dandi://{text_dandiset.api.instance_id}/{dandiset_id}/subdir2/coconut.txt", tmp_path, ) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] @@ -156,29 +161,29 @@ def test_download_item(local_dandi_api, text_dandiset, tmp_path): def test_download_asset_id(text_dandiset, tmp_path): - asset = text_dandiset["dandiset"].get_asset_by_path("subdir2/coconut.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir2/coconut.txt") download(asset.download_url, tmp_path) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] assert (tmp_path / "coconut.txt").read_text() == "Coconut\n" def test_download_asset_id_only(text_dandiset, tmp_path): - asset = text_dandiset["dandiset"].get_asset_by_path("subdir2/coconut.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir2/coconut.txt") download(asset.base_download_url, tmp_path) assert list_paths(tmp_path, dirs=True) == [tmp_path / "coconut.txt"] assert (tmp_path / "coconut.txt").read_text() == "Coconut\n" @pytest.mark.parametrize("confirm", [True, False]) -def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path): - text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - dspath = 
tmp_path / text_dandiset["dandiset_id"] - os.rename(text_dandiset["dspath"], dspath) +def test_download_sync(confirm, mocker, text_dandiset, tmp_path): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + dspath = tmp_path / text_dandiset.dandiset_id + os.rename(text_dandiset.dspath, dspath) confirm_mock = mocker.patch( "dandi.download.abbrev_prompt", return_value="yes" if confirm else "no" ) download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}", + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, existing="overwrite", sync=True, @@ -190,28 +195,28 @@ def test_download_sync(confirm, local_dandi_api, mocker, text_dandiset, tmp_path assert (dspath / "file.txt").exists() -def test_download_sync_folder(local_dandi_api, mocker, text_dandiset): - text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - text_dandiset["dandiset"].get_asset_by_path("subdir2/banana.txt").delete() +def test_download_sync_folder(mocker, text_dandiset): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt").delete() confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes") download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}/subdir2/", - text_dandiset["dspath"], + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}/subdir2/", + text_dandiset.dspath, existing="overwrite", sync=True, ) confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") - assert (text_dandiset["dspath"] / "file.txt").exists() - assert not (text_dandiset["dspath"] / "subdir2" / "banana.txt").exists() + assert (text_dandiset.dspath / "file.txt").exists() + assert not (text_dandiset.dspath / "subdir2" / "banana.txt").exists() -def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_path): - 
text_dandiset["dandiset"].get_asset_by_path("file.txt").delete() - dspath = tmp_path / text_dandiset["dandiset_id"] - os.rename(text_dandiset["dspath"], dspath) +def test_download_sync_list(capsys, mocker, text_dandiset, tmp_path): + text_dandiset.dandiset.get_asset_by_path("file.txt").delete() + dspath = tmp_path / text_dandiset.dandiset_id + os.rename(text_dandiset.dspath, dspath) input_mock = mocker.patch("dandi.utils.input", side_effect=["list", "yes"]) download( - f"dandi://{local_dandi_api['instance_id']}/{text_dandiset['dandiset_id']}", + f"dandi://{text_dandiset.api.instance_id}/{text_dandiset.dandiset_id}", tmp_path, existing="overwrite", sync=True, @@ -224,11 +229,26 @@ def test_download_sync_list(capsys, local_dandi_api, mocker, text_dandiset, tmp_ assert capsys.readouterr().out.splitlines()[-1] == str(dspath / "file.txt") +def test_download_sync_zarr(mocker, zarr_dandiset, tmp_path): + zarr_dandiset.dandiset.get_asset_by_path("sample.zarr").delete() + dspath = tmp_path / zarr_dandiset.dandiset_id + os.rename(zarr_dandiset.dspath, dspath) + confirm_mock = mocker.patch("dandi.download.abbrev_prompt", return_value="yes") + download( + zarr_dandiset.dandiset.version_api_url, + tmp_path, + existing="overwrite", + sync=True, + ) + confirm_mock.assert_called_with("Delete 1 local asset?", "yes", "no", "list") + assert not (dspath / "sample.zarr").exists() + + @responses.activate def test_download_no_blobDateModified(text_dandiset, tmp_path): # Regression test for #806 responses.add_passthru(re.compile("^http")) - dandiset = text_dandiset["dandiset"] + dandiset = text_dandiset.dandiset asset = dandiset.get_asset_by_path("file.txt") metadata = asset.get_raw_metadata() del metadata["blobDateModified"] @@ -239,14 +259,14 @@ def test_download_no_blobDateModified(text_dandiset, tmp_path): @responses.activate def test_download_metadata404(text_dandiset, tmp_path): responses.add_passthru(re.compile("^http")) - asset = 
text_dandiset["dandiset"].get_asset_by_path("subdir1/apple.txt") + asset = text_dandiset.dandiset.get_asset_by_path("subdir1/apple.txt") responses.add(responses.GET, asset.api_url, status=404) statuses = list( download_generator( DandisetURL( - api_url=text_dandiset["client"].api_url, - dandiset_id=text_dandiset["dandiset"].identifier, - version_id=text_dandiset["dandiset"].version_id, + api_url=text_dandiset.client.api_url, + dandiset_id=text_dandiset.dandiset.identifier, + version_id=text_dandiset.dandiset.version_id, ), tmp_path, ) @@ -266,3 +286,391 @@ def test_download_metadata404(text_dandiset, tmp_path): tmp_path / "subdir2" / "banana.txt", tmp_path / "subdir2" / "coconut.txt", ] + + +def test_download_zarr(tmp_path, zarr_dandiset): + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +def test_download_different_zarr(tmp_path, zarr_dandiset): + dd = tmp_path / zarr_dandiset.dandiset_id + dd.mkdir() + zarr.save(dd / "sample.zarr", np.eye(5)) + download( + zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + ) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +def test_download_different_zarr_delete_dir(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(dspath / "sample.zarr", np.eye(5)) + assert not any(p.is_dir() for p in (dspath / "sample.zarr").iterdir()) + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + ) + dd = tmp_path / "download" / dandiset_id + 
dd.mkdir(parents=True, exist_ok=True) + zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + assert any(p.is_dir() for p in (dd / "sample.zarr").iterdir()) + download(d.version_api_url, tmp_path / "download", existing="overwrite-different") + assert_dirtrees_eq(dspath / "sample.zarr", dd / "sample.zarr") + + +def test_download_zarr_to_nonzarr_path(tmp_path, zarr_dandiset): + dd = tmp_path / zarr_dandiset.dandiset_id + dd.mkdir() + (dd / "sample.zarr").write_text("This is not a Zarr.\n") + download( + zarr_dandiset.dandiset.version_api_url, tmp_path, existing="overwrite-different" + ) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +def test_download_nonzarr_to_zarr_path(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + (dspath / "sample.zarr").write_text("This is not a Zarr.\n") + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + dd = tmp_path / "download" / dandiset_id + dd.mkdir(parents=True, exist_ok=True) + zarr.save(dd / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + download(d.version_api_url, tmp_path / "download", existing="overwrite-different") + assert (dd / "sample.zarr").is_file() + assert (dd / "sample.zarr").read_text() == "This is not a Zarr.\n" + + +@pytest.mark.parametrize( + "file_qty,inputs,expected", + [ + ( + 1, + [ + ("lonely.txt", {"size": 42}), + ("lonely.txt", {"status": "downloading"}), + ("lonely.txt", {"done": 0, "done%": 0.0}), + ("lonely.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("lonely.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("lonely.txt", {"done": 
42, "done%": 100.0}), + ("lonely.txt", {"checksum": "ok"}), + ("lonely.txt", {"status": "setting mtime"}), + ("lonely.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "done", "message": "1 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("apple.txt", {"status": "setting mtime"}), + ("banana.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"message": "1 done"}, + {"status": "done", "message": "2 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"done": 40, 
"done%": 40 / 42 * 100}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"status": "setting mtime"}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 40, "done%": 40 / 169 * 100}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 82, "done%": 82 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"message": "1 done"}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 169 / 169 * 100}, + {"status": "done", "message": "2 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ("banana.txt", {"size": 127}), + ("banana.txt", {"status": "downloading"}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting 
mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 42 / 42 * 100}, + {"message": "1 done"}, + {"done": 42, "done%": 42 / 169 * 100}, + {"done": 82, "done%": 82 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"status": "done", "message": "2 done"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"status": "error", "message": "Internet broke"}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"message": "1 errored"}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "error", "message": "1 done, 1 errored"}, + ], + ), + ( + 1, + [("lonely.txt", {"status": "skipped", "message": "already exists"})], + [{"status": "skipped", "message": "1 skipped"}], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"status": "skipped", "message": "already exists"}), + ("apple.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + 
("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("apple.txt", {"checksum": "ok"}), + ("apple.txt", {"status": "setting mtime"}), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"message": "1 skipped"}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 42 * 100}, + {"done": 40, "done%": 40 / 42 * 100}, + {"done": 42, "done%": 100.0}, + {"status": "done", "message": "1 done, 1 skipped"}, + ], + ), + ( + 2, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + ("banana.txt", {"status": "downloading"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("apple.txt", {"checksum": "ok"}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("apple.txt", {"status": "setting mtime"}), + ( + "banana.txt", + { + "checksum": "differs", + "status": "error", + "message": "Checksum differs", + }, + ), + ("apple.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"message": "1 errored"}, + {"status": "error", "message": "1 done, 1 errored"}, + ], + ), + ( + 3, + [ + ("apple.txt", {"size": 42}), + ("banana.txt", {"size": 127}), + ("apple.txt", {"status": "downloading"}), + 
("banana.txt", {"status": "downloading"}), + ("coconut", {"status": "skipped", "message": "already exists"}), + ("apple.txt", {"done": 0, "done%": 0.0}), + ("banana.txt", {"done": 0, "done%": 0.0}), + ("apple.txt", {"done": 20, "done%": 20 / 42 * 100}), + ("banana.txt", {"done": 40, "done%": 40 / 127 * 100}), + ("apple.txt", {"done": 40, "done%": 40 / 42 * 100}), + ("banana.txt", {"done": 80, "done%": 80 / 127 * 100}), + ("apple.txt", {"done": 42, "done%": 100.0}), + ( + "apple.txt", + { + "checksum": "differs", + "status": "error", + "message": "Checksum differs", + }, + ), + ("banana.txt", {"done": 120, "done%": 120 / 127 * 100}), + ("banana.txt", {"done": 127, "done%": 100.0}), + ("banana.txt", {"checksum": "ok"}), + ("banana.txt", {"status": "setting mtime"}), + ("banana.txt", {"status": "done"}), + ], + [ + {"size": 69105}, + {"status": "downloading"}, + {"message": "1 skipped"}, + {"done": 0, "done%": 0.0}, + {"done": 0, "done%": 0.0}, + {"done": 20, "done%": 20 / 169 * 100}, + {"done": 60, "done%": 60 / 169 * 100}, + {"done": 80, "done%": 80 / 169 * 100}, + {"done": 120, "done%": 120 / 169 * 100}, + {"done": 122, "done%": 122 / 169 * 100}, + {"message": "1 errored, 1 skipped"}, + {"done": 162, "done%": 162 / 169 * 100}, + {"done": 169, "done%": 100.0}, + {"status": "error", "message": "1 done, 1 errored, 1 skipped"}, + ], + ), + ], +) +def test_progress_combiner( + file_qty: int, inputs: List[Tuple[str, dict]], expected: List[dict] +) -> None: + pc = ProgressCombiner(zarr_size=69105, file_qty=file_qty) + outputs = [] + for path, status in inputs: + outputs.extend(pc.feed(path, status)) + assert outputs == expected diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py new file mode 100644 index 000000000..524f91351 --- /dev/null +++ b/dandi/tests/test_files.py @@ -0,0 +1,177 @@ +from operator import attrgetter +from pathlib import Path + +from dandischema.models import get_schema_version +import numpy as np +import zarr + +from .. 
import get_logger +from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file +from ..dandiapi import AssetType, RemoteZarrAsset +from ..files import ( + DandisetMetadataFile, + GenericAsset, + NWBAsset, + ZarrAsset, + dandi_file, + find_dandi_files, +) + +lgr = get_logger() + + +def test_find_dandi_files(tmp_path: Path) -> None: + (tmp_path / dandiset_metadata_file).touch() + (tmp_path / "sample01.zarr").mkdir() + (tmp_path / "sample01.zarr" / "inner.nwb").touch() + (tmp_path / "sample01.zarr" / "foo").touch() + (tmp_path / "sample02.nwb").touch() + (tmp_path / "foo").touch() + (tmp_path / "bar.txt").touch() + (tmp_path / "subdir").mkdir() + (tmp_path / "subdir" / "sample03.nwb").touch() + (tmp_path / "subdir" / "sample04.zarr").mkdir() + (tmp_path / "subdir" / "sample04.zarr" / "inner2.nwb").touch() + (tmp_path / "subdir" / "sample04.zarr" / "baz").touch() + (tmp_path / "subdir" / "gnusto").touch() + (tmp_path / "subdir" / "cleesh.txt").touch() + (tmp_path / "empty.zarr").mkdir() + (tmp_path / ".ignored").touch() + (tmp_path / ".ignored.dir").mkdir() + (tmp_path / ".ignored.dir" / "ignored.nwb").touch() + + files = sorted( + find_dandi_files(tmp_path, dandiset_path=tmp_path), key=attrgetter("filepath") + ) + assert files == [ + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + NWBAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + ZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] + + files = sorted( + find_dandi_files(tmp_path, dandiset_path=tmp_path, allow_all=True), + key=attrgetter("filepath"), + ) + assert files == [ + GenericAsset(filepath=tmp_path / "bar.txt", path="bar.txt"), + DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), + GenericAsset(filepath=tmp_path / "foo", path="foo"), + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + 
NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + GenericAsset( + filepath=tmp_path / "subdir" / "cleesh.txt", path="subdir/cleesh.txt" + ), + GenericAsset(filepath=tmp_path / "subdir" / "gnusto", path="subdir/gnusto"), + NWBAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + ZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] + + files = sorted( + find_dandi_files(tmp_path, dandiset_path=tmp_path, include_metadata=True), + key=attrgetter("filepath"), + ) + assert files == [ + DandisetMetadataFile(filepath=tmp_path / dandiset_metadata_file), + ZarrAsset(filepath=tmp_path / "sample01.zarr", path="sample01.zarr"), + NWBAsset(filepath=tmp_path / "sample02.nwb", path="sample02.nwb"), + NWBAsset( + filepath=tmp_path / "subdir" / "sample03.nwb", path="subdir/sample03.nwb" + ), + ZarrAsset( + filepath=tmp_path / "subdir" / "sample04.zarr", path="subdir/sample04.zarr" + ), + ] + + +def test_validate_simple1(simple1_nwb): + # this file should be ok + errors = dandi_file(simple1_nwb).get_validation_errors( + schema_version=get_schema_version() + ) + assert not errors + + +def test_validate_simple2(simple2_nwb): + # this file should be ok + errors = dandi_file(simple2_nwb).get_validation_errors() + assert not errors + + +def test_validate_simple2_new(simple2_nwb): + # this file should be ok + errors = dandi_file(simple2_nwb).get_validation_errors( + schema_version=get_schema_version() + ) + assert not errors + + +def test_validate_bogus(tmp_path): + path = tmp_path / "wannabe.nwb" + path.write_text("not really nwb") + # intended to produce use-case for https://github.com/dandi/dandi-cli/issues/93 + # but it would be tricky, so it is more of a smoke test that + # we do not crash + errors = dandi_file(path).get_validation_errors() + # ATM we would get 2 errors -- since could not be open in two places, + # but that would be too rigid to test. 
Let's just see that we have expected errors + assert any(e.startswith("Failed to read metadata") for e in errors) + + +def test_upload_zarr(local_dandi_api, tmp_path): + filepath = tmp_path / "example.zarr" + zarr.save(filepath, np.arange(1000), np.arange(1000, 0, -1)) + zf = dandi_file(filepath) + assert isinstance(zf, ZarrAsset) + d = local_dandi_api.client.create_dandiset("Zarr Dandiset", {}) + asset = zf.upload(d, {"description": "A test Zarr"}) + assert isinstance(asset, RemoteZarrAsset) + assert asset.asset_type is AssetType.ZARR + assert asset.path == "example.zarr" + md = asset.get_raw_metadata() + assert md["encodingFormat"] == ZARR_MIME_TYPE + assert md["description"] == "A test Zarr" + md["description"] = "A modified Zarr" + asset.set_raw_metadata(md) + md = asset.get_raw_metadata() + assert md["description"] == "A modified Zarr" + + for file_src in [zf, asset]: + lgr.debug("Traversing %s", type(file_src).__name__) + entries = sorted(file_src.iterfiles(include_dirs=True), key=attrgetter("parts")) + assert [str(e) for e in entries] == [ + ".zgroup", + "arr_0", + "arr_0/.zarray", + "arr_0/0", + "arr_1", + "arr_1/.zarray", + "arr_1/0", + ] + assert (file_src.filetree / ".zgroup").exists() + assert (file_src.filetree / ".zgroup").is_file() + assert not (file_src.filetree / ".zgroup").is_dir() + assert (file_src.filetree / "arr_0").exists() + assert not (file_src.filetree / "arr_0").is_file() + assert (file_src.filetree / "arr_0").is_dir() + assert not (file_src.filetree / "0").exists() + assert not (file_src.filetree / "0").is_file() + assert not (file_src.filetree / "0").is_dir() + assert not (file_src.filetree / "arr_0" / ".zgroup").exists() + assert not (file_src.filetree / "arr_0" / ".zgroup").is_file() + assert not (file_src.filetree / "arr_0" / ".zgroup").is_dir() + assert not (file_src.filetree / ".zgroup" / "0").exists() + assert not (file_src.filetree / ".zgroup" / "0").is_file() + assert not (file_src.filetree / ".zgroup" / "0").is_dir() + assert 
not (file_src.filetree / "arr_2" / "0").exists() + assert not (file_src.filetree / "arr_2" / "0").is_file() + assert not (file_src.filetree / "arr_2" / "0").is_dir() diff --git a/dandi/tests/test_helpers.py b/dandi/tests/test_helpers.py new file mode 100644 index 000000000..0e5729c06 --- /dev/null +++ b/dandi/tests/test_helpers.py @@ -0,0 +1,23 @@ +from operator import attrgetter +from pathlib import Path + + +# This needs to be in a file named "test_*.py" so that pytest performs its +# assertion rewriting on it. +def assert_dirtrees_eq(tree1: Path, tree2: Path) -> None: + """Assert that the file trees at the given paths are equal""" + assert sorted(map(attrgetter("name"), tree1.iterdir())) == sorted( + map(attrgetter("name"), tree2.iterdir()) + ) + for p1 in tree1.iterdir(): + p2 = tree2 / p1.name + assert p1.is_dir() == p2.is_dir() + if p1.is_dir(): + assert_dirtrees_eq(p1, p2) + # TODO: Considering using the identify library to test for binary-ness. + # (We can't use mimetypes, as .json maps to application/json instead of + # text/json.) 
+ elif p1.suffix in {".txt", ".py", ".json"}: + assert p1.read_text() == p2.read_text() + else: + assert p1.read_bytes() == p2.read_bytes() diff --git a/dandi/tests/test_keyring.py b/dandi/tests/test_keyring.py index 5d3400a33..50fea4912 100644 --- a/dandi/tests/test_keyring.py +++ b/dandi/tests/test_keyring.py @@ -25,11 +25,11 @@ def test_dandi_authenticate_no_env_var(local_dandi_api, monkeypatch, mocker): monkeypatch.delenv("DANDI_API_KEY", raising=False) monkeypatch.setenv("PYTHON_KEYRING_BACKEND", "keyring.backends.null.Keyring") inputmock = mocker.patch( - "dandi.dandiapi.input", return_value=local_dandi_api["api_key"] + "dandi.dandiapi.input", return_value=local_dandi_api.api_key ) - DandiAPIClient(local_dandi_api["instance"].api).dandi_authenticate() + DandiAPIClient(local_dandi_api.api_url).dandi_authenticate() inputmock.assert_called_once_with( - "Please provide API Key for {}: ".format(local_dandi_api["instance_id"]) + "Please provide API Key for {}: ".format(local_dandi_api.instance_id) ) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 2881c9de7..325368951 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -1,28 +1,33 @@ import os from pathlib import Path +from shutil import rmtree +import numpy as np import pynwb import pytest +import zarr -from ..consts import DRAFT, dandiset_metadata_file -from ..dandiapi import RemoteDandiset +from .test_helpers import assert_dirtrees_eq +from ..consts import DRAFT, ZARR_MIME_TYPE, dandiset_metadata_file +from ..dandiapi import AssetType, RemoteBlobAsset, RemoteZarrAsset from ..download import download from ..exceptions import NotFoundError +from ..files import LocalFileAsset from ..pynwb_utils import make_nwb_file from ..upload import upload from ..utils import list_paths def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tmp_path): - d = local_dandi_api["client"].create_dandiset("Test Dandiset", {}) + d = 
local_dandi_api.client.create_dandiset("Test Dandiset", {}) dandiset_id = d.identifier (nwb_file,) = organized_nwb_dir.glob(f"*{os.sep}*.nwb") (organized_nwb_dir / dandiset_metadata_file).write_text( f"identifier: '{dandiset_id}'\n" ) monkeypatch.chdir(organized_nwb_dir) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - upload(paths=[], dandi_instance=local_dandi_api["instance_id"], devel_debug=True) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + upload(paths=[], dandi_instance=local_dandi_api.instance_id, devel_debug=True) download(d.version_api_url, tmp_path) (nwb_file2,) = tmp_path.glob(f"{dandiset_id}{os.sep}*{os.sep}*.nwb") assert nwb_file.name == nwb_file2.name @@ -44,65 +49,65 @@ def test_new_upload_download(local_dandi_api, monkeypatch, organized_nwb_dir, tm Path(dandiset_metadata_file).write_text(yaml_dump(ds_metadata)) upload( paths=[dandiset_metadata_file], - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, upload_dandiset_metadata=True, ) - d = local_dandi_api["client"].get_dandiset(dandiset_id, DRAFT) + d = local_dandi_api.client.get_dandiset(dandiset_id, DRAFT) assert d.version.name == "shorty" def test_new_upload_extant_existing(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") with pytest.raises(FileExistsError): - text_dandiset["reupload"](existing="error") + text_dandiset.upload(existing="error") iter_upload_spy.assert_not_called() def test_new_upload_extant_skip(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing="skip") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing="skip") iter_upload_spy.assert_not_called() @pytest.mark.parametrize("existing", ["overwrite", "refresh"]) def 
test_new_upload_extant_eq_overwrite(existing, mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing=existing) + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing=existing) iter_upload_spy.assert_not_called() @pytest.mark.parametrize("existing", ["overwrite", "refresh"]) def test_new_upload_extant_neq_overwrite(existing, mocker, text_dandiset, tmp_path): - dandiset_id = text_dandiset["dandiset_id"] - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing=existing) + dandiset_id = text_dandiset.dandiset_id + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing=existing) iter_upload_spy.assert_called() - download(text_dandiset["dandiset"].version_api_url, tmp_path) + download(text_dandiset.dandiset.version_api_url, tmp_path) assert ( tmp_path / dandiset_id / "file.txt" ).read_text() == "This is different text.\n" def test_new_upload_extant_old_refresh(mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").write_text("This is different text.\n") - os.utime(text_dandiset["dspath"] / "file.txt", times=(0, 0)) - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing="refresh") + (text_dandiset.dspath / "file.txt").write_text("This is different text.\n") + os.utime(text_dandiset.dspath / "file.txt", times=(0, 0)) + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing="refresh") iter_upload_spy.assert_not_called() def test_new_upload_extant_force(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing="force") + iter_upload_spy = 
mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing="force") iter_upload_spy.assert_called() def test_new_upload_extant_bad_existing(mocker, text_dandiset): - iter_upload_spy = mocker.spy(RemoteDandiset, "iter_upload_raw_asset") - text_dandiset["reupload"](existing="foobar") + iter_upload_spy = mocker.spy(LocalFileAsset, "iter_upload") + text_dandiset.upload(existing="foobar") iter_upload_spy.assert_not_called() @@ -119,21 +124,20 @@ def test_new_upload_extant_bad_existing(mocker, text_dandiset): ], ) def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_path): - client = local_dandi_api["client"] + client = local_dandi_api.client d = client.create_dandiset("Small Dandiset", {}) dandiset_id = d.identifier dspath = tmp_path / "upload" dspath.mkdir() (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") (dspath / "file.txt").write_bytes(contents) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) upload( paths=[], dandiset_path=dspath, - dandi_instance=local_dandi_api["instance_id"], + dandi_instance=local_dandi_api.instance_id, devel_debug=True, allow_any_path=True, - validation="skip", ) download_dir = tmp_path / "download" download_dir.mkdir() @@ -147,34 +151,45 @@ def test_upload_download_small_file(contents, local_dandi_api, monkeypatch, tmp_ @pytest.mark.parametrize("confirm", [True, False]) def test_upload_sync(confirm, mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").unlink() + (text_dandiset.dspath / "file.txt").unlink() confirm_mock = mocker.patch("click.confirm", return_value=confirm) - text_dandiset["reupload"](sync=True) + text_dandiset.upload(sync=True) confirm_mock.assert_called_with("Delete 1 asset on server?") if confirm: with pytest.raises(NotFoundError): - text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") else: - 
text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") def test_upload_sync_folder(mocker, text_dandiset): - (text_dandiset["dspath"] / "file.txt").unlink() - (text_dandiset["dspath"] / "subdir2" / "banana.txt").unlink() + (text_dandiset.dspath / "file.txt").unlink() + (text_dandiset.dspath / "subdir2" / "banana.txt").unlink() confirm_mock = mocker.patch("click.confirm", return_value=True) - text_dandiset["reupload"](paths=[text_dandiset["dspath"] / "subdir2"], sync=True) + text_dandiset.upload(paths=[text_dandiset.dspath / "subdir2"], sync=True) confirm_mock.assert_called_with("Delete 1 asset on server?") - text_dandiset["dandiset"].get_asset_by_path("file.txt") + text_dandiset.dandiset.get_asset_by_path("file.txt") with pytest.raises(NotFoundError): - text_dandiset["dandiset"].get_asset_by_path("subdir2/banana.txt") + text_dandiset.dandiset.get_asset_by_path("subdir2/banana.txt") + + +def test_upload_sync_zarr(mocker, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + zarr.save(zarr_dandiset.dspath / "identity.zarr", np.eye(5)) + confirm_mock = mocker.patch("click.confirm", return_value=True) + zarr_dandiset.upload(sync=True) + confirm_mock.assert_called_with("Delete 1 asset on server?") + zarr_dandiset.dandiset.get_asset_by_path("identity.zarr") + with pytest.raises(NotFoundError): + zarr_dandiset.dandiset.get_asset_by_path("sample.zarr") def test_upload_invalid_metadata( local_dandi_api, monkeypatch, simple1_nwb_metadata, tmp_path ): monkeypatch.chdir(tmp_path) - monkeypatch.setenv("DANDI_API_KEY", local_dandi_api["api_key"]) - d = local_dandi_api["client"].create_dandiset("Broken Dandiset", {}) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Broken Dandiset", {}) nwb_file = "broken.nwb" make_nwb_file( nwb_file, @@ -187,6 +202,116 @@ def test_upload_invalid_metadata( **simple1_nwb_metadata, ) 
Path(dandiset_metadata_file).write_text(f"identifier: '{d.identifier}'\n") - upload(paths=[], dandi_instance=local_dandi_api["instance_id"], devel_debug=True) + upload(paths=[], dandi_instance=local_dandi_api.instance_id, devel_debug=True) with pytest.raises(NotFoundError): d.get_asset_by_path(nwb_file) + + +def test_upload_zarr(local_dandi_api, monkeypatch, tmp_path): + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + (tmp_path / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(tmp_path / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + upload( + paths=[], + dandiset_path=tmp_path, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + ) + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.asset_type is AssetType.ZARR + assert asset.path == "sample.zarr" + + +def test_upload_different_zarr(tmp_path, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + zarr.save(zarr_dandiset.dspath / "sample.zarr", np.eye(5)) + zarr_dandiset.upload() + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +def test_upload_nonzarr_to_zarr_path(tmp_path, zarr_dandiset): + rmtree(zarr_dandiset.dspath / "sample.zarr") + (zarr_dandiset.dspath / "sample.zarr").write_text("This is not a Zarr.\n") + zarr_dandiset.upload(allow_any_path=True) + (asset,) = zarr_dandiset.dandiset.get_assets() + assert isinstance(asset, RemoteBlobAsset) + assert asset.asset_type is AssetType.BLOB + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + assert ( + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + ).read_text() == "This is not a 
Zarr.\n" + + +def test_upload_zarr_to_nonzarr_path(local_dandi_api, monkeypatch, tmp_path): + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + dspath = tmp_path / "dandiset" + dspath.mkdir() + (dspath / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + (dspath / "sample.zarr").write_text("This is not a Zarr.\n") + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + + (asset,) = d.get_assets() + assert isinstance(asset, RemoteBlobAsset) + assert asset.asset_type is AssetType.BLOB + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == "application/octet-stream" + + (dspath / "sample.zarr").unlink() + zarr.save(dspath / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + upload( + paths=[], + dandiset_path=dspath, + dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + allow_any_path=True, + ) + + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.asset_type is AssetType.ZARR + assert asset.path == "sample.zarr" + assert asset.get_raw_metadata()["encodingFormat"] == ZARR_MIME_TYPE + + (tmp_path / "download").mkdir() + download(d.version_api_url, tmp_path / "download") + assert_dirtrees_eq( + dspath / "sample.zarr", + tmp_path / "download" / dandiset_id / "sample.zarr", + ) + + +def test_upload_zarr_with_empty_dir(local_dandi_api, monkeypatch, tmp_path): + d = local_dandi_api.client.create_dandiset("Test Dandiset", {}) + dandiset_id = d.identifier + (tmp_path / dandiset_metadata_file).write_text(f"identifier: '{dandiset_id}'\n") + zarr.save(tmp_path / "sample.zarr", np.arange(1000), np.arange(1000, 0, -1)) + (tmp_path / "sample.zarr" / "empty").mkdir() + monkeypatch.setenv("DANDI_API_KEY", local_dandi_api.api_key) + upload( + paths=[], + dandiset_path=tmp_path, + 
dandi_instance=local_dandi_api.instance_id, + devel_debug=True, + ) + (asset,) = d.get_assets() + assert isinstance(asset, RemoteZarrAsset) + assert asset.asset_type is AssetType.ZARR + assert asset.path == "sample.zarr" + assert not (asset.filetree / "empty").exists() diff --git a/dandi/tests/test_validate.py b/dandi/tests/test_validate.py deleted file mode 100644 index 66afdeb07..000000000 --- a/dandi/tests/test_validate.py +++ /dev/null @@ -1,33 +0,0 @@ -from dandischema.models import get_schema_version - -from ..validate import validate_file - - -def test_validate_simple1(simple1_nwb): - # this file should be ok - errors = validate_file(simple1_nwb, schema_version=get_schema_version()) - assert not errors - - -def test_validate_simple2(simple2_nwb): - # this file should be ok - errors = validate_file(simple2_nwb) - assert not errors - - -def test_validate_simple2_new(simple2_nwb): - # this file should be ok - errors = validate_file(simple2_nwb, schema_version=get_schema_version()) - assert not errors - - -def test_validate_bogus(tmp_path): - path = tmp_path / "wannabe.nwb" - path.write_text("not really nwb") - # intended to produce use-case for https://github.com/dandi/dandi-cli/issues/93 - # but it would be tricky, so it is more of a smoke test that - # we do not crash - errors = validate_file(str(path)) - # ATM we would get 2 errors -- since could not be open in two places, - # but that would be too rigid to test. Let's just see that we have expected errors - assert any(e.startswith("Failed to read metadata") for e in errors) diff --git a/dandi/upload.py b/dandi/upload.py index 6b6364eb3..580146949 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -1,6 +1,6 @@ from functools import reduce import os.path -from pathlib import Path, PurePosixPath +from pathlib import Path import re import time @@ -9,6 +9,7 @@ from . 
import lgr from .consts import DRAFT, dandiset_identifier_regex, dandiset_metadata_file from .exceptions import NotFoundError +from .files import DandisetMetadataFile, LocalAsset, find_dandi_files from .utils import ensure_datetime, get_instance, pluralize @@ -27,7 +28,6 @@ def upload( ): from .dandiapi import DandiAPIClient from .dandiset import APIDandiset, Dandiset - from .support.digests import get_digest dandiset = Dandiset.find(dandiset_path) if not dandiset: @@ -55,11 +55,9 @@ def upload( f"convention {dandiset_identifier_regex!r}." ) - from .metadata import get_asset_metadata from .pynwb_utils import ignore_benign_pynwb_warnings from .support.pyout import naturalsize - from .utils import find_dandi_files, find_files, path_is_subpath - from .validate import validate_file + from .utils import path_is_subpath ignore_benign_pynwb_warnings() # so validate doesn't whine @@ -71,21 +69,16 @@ def upload( original_paths = paths # Expand and validate all paths -- they should reside within dandiset - paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths) - paths = list(map(Path, paths)) - npaths = len(paths) - lgr.info(f"Found {npaths} files to consider") - for path in paths: - if not ( - allow_any_path - or path.name == dandiset_metadata_file - or path.name.endswith(".nwb") - ): - raise NotImplementedError( - f"ATM only .nwb and dandiset.yaml should be in the paths to upload. 
Got {path}" - ) - if not path_is_subpath(str(path.absolute()), dandiset.path): - raise ValueError(f"{path} is not under {dandiset.path}") + paths = [Path(p).absolute() for p in paths] + dandi_files = list( + find_dandi_files( + *paths, + dandiset_path=dandiset.path, + allow_all=allow_any_path, + include_metadata=True, + ) + ) + lgr.info(f"Found {len(dandi_files)} files to consider") # We will keep a shared set of "being processed" paths so # we could limit the number of them until @@ -101,28 +94,22 @@ def skip_file(msg): # TODO: we might want to always yield a full record so no field is not # provided to pyout to cause it to halt - def process_path(path, relpath): + def process_path(dfile): """ Parameters ---------- - path: Path - Non Pure (OS specific) Path - relpath: - For location on server. Will be cast to PurePosixPath + dfile: DandiFile Yields ------ dict Records for pyout """ - # Ensure consistent types - path = Path(path) - relpath = PurePosixPath(relpath) + strpath = str(dfile.filepath) try: try: - path_stat = path.stat() - yield {"size": path_stat.st_size} + yield {"size": dfile.size} except FileNotFoundError: yield skip_file("ERROR: File not found") return @@ -135,9 +122,9 @@ def process_path(path, relpath): # Validate first, so we do not bother server at all if not kosher # # TODO: enable back validation of dandiset.yaml - if path.name != dandiset_metadata_file and validation != "skip": + if isinstance(dfile, LocalAsset) and validation != "skip": yield {"status": "pre-validating"} - validation_errors = validate_file(path) + validation_errors = dfile.get_validation_errors() yield {"errors": len(validation_errors)} # TODO: split for dandi, pynwb errors if validation_errors: @@ -156,7 +143,7 @@ def process_path(path, relpath): # Special handling for dandiset.yaml # Yarik hates it but that is life for now. 
TODO # - if path.name == dandiset_metadata_file: + if isinstance(dfile, DandisetMetadataFile): # TODO This is a temporary measure to avoid breaking web UI # dandiset metadata schema assumptions. All edits should happen # online. @@ -173,30 +160,27 @@ def process_path(path, relpath): # yield {"status": "digesting"} try: - file_etag = get_digest(path, digest="dandi-etag") + file_etag = dfile.get_digest() except Exception as exc: yield skip_file("failed to compute digest: %s" % str(exc)) return try: - extant = remote_dandiset.get_asset_by_path(str(relpath)) + extant = remote_dandiset.get_asset_by_path(dfile.path) except NotFoundError: extant = None else: metadata = extant.get_raw_metadata() - local_mtime = ensure_datetime(path_stat.st_mtime) + local_mtime = dfile.modified remote_mtime_str = metadata.get("blobDateModified") - d = metadata.get("digest", {}) - if "dandi:dandi-etag" in d: - extant_etag = d["dandi:dandi-etag"] - else: - # TODO: Should this error instead? - extant_etag = None + # TODO: Should this error if the digest is missing? 
+ extant_etag = metadata.get("digest", {}).get(file_etag.algorithm.value) if remote_mtime_str is not None: remote_mtime = ensure_datetime(remote_mtime_str) remote_file_status = ( "same" - if extant_etag == file_etag and remote_mtime == local_mtime + if extant_etag == file_etag.value + and remote_mtime == local_mtime else ( "newer" if remote_mtime > local_mtime @@ -217,11 +201,11 @@ def process_path(path, relpath): return # Logic below only for overwrite and reupload if existing == "overwrite": - if extant_etag == file_etag: + if extant_etag == file_etag.value: yield skip_file(exists_msg) return elif existing == "refresh": - if extant_etag == file_etag: + if extant_etag == file_etag.value: yield skip_file("file exists") return elif remote_mtime is not None and remote_mtime >= local_mtime: @@ -243,12 +227,8 @@ def process_path(path, relpath): # ad-hoc for dandiset.yaml for now yield {"status": "extracting metadata"} try: - metadata = get_asset_metadata( - path, - relpath, - digest=file_etag, - digest_type="dandi_etag", - allow_any_path=allow_any_path, + metadata = dfile.get_metadata( + digest=file_etag, ignore_errors=allow_any_path ).json_dict() except Exception as e: yield skip_file("failed to extract metadata: %s" % str(e)) @@ -259,12 +239,12 @@ def process_path(path, relpath): # yield {"status": "uploading"} validating = False - for r in remote_dandiset.iter_upload_raw_asset( - path, metadata, jobs=jobs_per_file, replace_asset=extant + for r in dfile.iter_upload( + remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant ): r.pop("asset", None) # to keep pyout from choking if r["status"] == "uploading": - uploaded_paths[str(path)]["size"] = r.pop("current") + uploaded_paths[strpath]["size"] = r.pop("current") yield r elif r["status"] == "post-validating": # Only yield the first "post-validating" status @@ -278,14 +258,14 @@ def process_path(path, relpath): except Exception as exc: if devel_debug: raise - lgr.exception("Error uploading %s:", relpath) + 
lgr.exception("Error uploading %s:", strpath) # Custom formatting for some exceptions we know to extract # user-meaningful message message = str(exc) - uploaded_paths[str(path)]["errors"].append(message) + uploaded_paths[strpath]["errors"].append(message) yield {"status": "ERROR", "message": message} finally: - process_paths.remove(str(path)) + process_paths.remove(strpath) # We will again use pyout to provide a neat table summarizing our progress # with upload etc @@ -313,31 +293,28 @@ def upload_agg(*ignored): out = pyouts.LogSafeTabular(style=pyout_style, columns=rec_fields, max_workers=jobs) with out: - for path in paths: + for dfile in dandi_files: while len(process_paths) >= 10: lgr.log(2, "Sleep waiting for some paths to finish processing") time.sleep(0.5) - rec = {"path": str(path)} - process_paths.add(str(path)) + process_paths.add(str(dfile.filepath)) - try: - relpath = path.absolute().relative_to(dandiset.path) + if isinstance(dfile, DandisetMetadataFile): + rec = {"path": dandiset_metadata_file} + else: + assert isinstance(dfile, LocalAsset) + rec = {"path": dfile.path} - rec["path"] = str(relpath) + try: if devel_debug: # DEBUG: do serially - for v in process_path(path, relpath): + for v in process_path(dfile): print(str(v), flush=True) else: - rec[tuple(rec_fields[1:])] = process_path(path, relpath) + rec[tuple(rec_fields[1:])] = process_path(dfile) except ValueError as exc: - if "does not start with" in str(exc): - # if top_path is not the top path for the path - # Provide more concise specific message without path details - rec.update(skip_file("must be a child of top path")) - else: - rec.update(skip_file(exc)) + rec.update(skip_file(exc)) out(rec) if sync: diff --git a/dandi/utils.py b/dandi/utils.py index c0d9c171b..9f25469ef 100644 --- a/dandi/utils.py +++ b/dandi/utils.py @@ -19,7 +19,7 @@ import subprocess import sys import types -from typing import List, Optional, Union +from typing import Iterable, Iterator, List, Optional, TypeVar, Union 
import dateutil.parser import requests @@ -708,3 +708,22 @@ def check_dandi_version(): exc, ) os.environ["DANDI_NO_ET"] = "1" + + +T = TypeVar("T") + + +def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]: + # cf. chunked() from more-itertools + i = iter(iterable) + while True: + xs = [] + for _ in range(size): + try: + xs.append(next(i)) + except StopIteration: + if xs: + break + else: + return + yield xs diff --git a/dandi/validate.py b/dandi/validate.py index 16d057442..8f49c6ddb 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -1,26 +1,13 @@ -import os.path as op - -from . import get_logger -from .consts import dandiset_metadata_file -from .metadata import get_metadata -from .pynwb_utils import validate as pynwb_validate -from .pynwb_utils import validate_cache -from .utils import find_dandi_files, find_files, yaml_load - -lgr = get_logger() - -# TODO -- should come from schema. This is just a simplistic example for now -_required_dandiset_metadata_fields = ["identifier", "name", "description"] -_required_nwb_metadata_fields = ["subject_id"] +from .files import find_dandi_files # TODO: provide our own "errors" records, which would also include warnings etc -def validate(paths, schema_version=None, devel_debug=False, allow_any_path=False): +def validate(*paths, schema_version=None, devel_debug=False, allow_any_path=False): """Validate content Parameters ---------- - paths: str or list of paths + paths: *str Could be individual (.nwb) files or a single dandiset path. 
Yields @@ -28,130 +15,10 @@ def validate(paths, schema_version=None, devel_debug=False, allow_any_path=False path, errors errors for a path """ - filepaths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths) - for path in filepaths: - errors = validate_file( - path, schema_version=schema_version, devel_debug=devel_debug - ) - yield path, errors - - -def validate_file(filepath, schema_version=None, devel_debug=False): - if op.basename(filepath) == dandiset_metadata_file: - return validate_dandiset_yaml( - filepath, schema_version=None, devel_debug=devel_debug + for df in find_dandi_files(*paths, dandiset_path=None, allow_all=allow_any_path): + yield ( + df.filepath, + df.get_validation_errors( + schema_version=schema_version, devel_debug=devel_debug + ), ) - else: - return pynwb_validate(filepath, devel_debug=devel_debug) + validate_asset_file( - filepath, schema_version=schema_version, devel_debug=devel_debug - ) - - -@validate_cache.memoize_path -def validate_dandiset_yaml(filepath, schema_version=None, devel_debug=False): - """Validate dandiset.yaml""" - with open(filepath) as f: - meta = yaml_load(f, typ="safe") - if schema_version is None: - schema_version = meta.get("schemaVersion") - if schema_version is None: - return _check_required_fields(meta, _required_dandiset_metadata_fields) - else: - from dandischema.models import Dandiset as DandisetMeta - from dandischema.models import get_schema_version - from pydantic import ValidationError - - current_version = get_schema_version() - if schema_version != current_version: - raise ValueError( - f"Unsupported schema version: {schema_version}; expected {current_version}" - ) - try: - DandisetMeta(**meta) - except ValidationError as e: - if devel_debug: - raise - lgr.warning( - "Validation error for %s: %s", filepath, e, extra={"validating": True} - ) - return [str(e)] - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Unexpected validation error for %s: %s", - filepath, - e, - 
extra={"validating": True}, - ) - return [f"Failed to initialize Dandiset meta: {e}"] - return [] - - -@validate_cache.memoize_path -def validate_asset_file(filepath, schema_version=None, devel_debug=False): - """Provide validation of asset file regarding requirements we impose""" - if schema_version is not None: - from dandischema.models import BareAsset, get_schema_version - from pydantic import ValidationError - - from .metadata import get_asset_metadata - - current_version = get_schema_version() - if schema_version != current_version: - raise ValueError( - f"Unsupported schema version: {schema_version}; expected {current_version}" - ) - try: - asset = get_asset_metadata( - filepath, - relpath="dummy", - digest=32 * "d" + "-1", - digest_type="dandi_etag", - allow_any_path=True, - ) - BareAsset(**asset.dict()) - except ValidationError as e: - if devel_debug: - raise - lgr.warning( - "Validation error for %s: %s", filepath, e, extra={"validating": True} - ) - return [str(e)] - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Unexpected validation error for %s: %s", - filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to read metadata: {e}"] - return [] - else: - # make sure that we have some basic metadata fields we require - try: - meta = get_metadata(filepath) - except Exception as e: - if devel_debug: - raise - lgr.warning( - "Failed to read metadata in %s: %s", - filepath, - e, - extra={"validating": True}, - ) - return [f"Failed to read metadata: {e}"] - return _check_required_fields(meta, _required_nwb_metadata_fields) - - -def _check_required_fields(d, required): - errors = [] - for f in required: - v = d.get(f, None) - if not v or (isinstance(v, str) and not (v.strip())): - errors += [f"Required field {f!r} has no value"] - if v in ("REQUIRED", "PLACEHOLDER"): - errors += [f"Required field {f!r} has value {v!r}"] - return errors diff --git a/docs/source/cmdline/download.rst b/docs/source/cmdline/download.rst index 
e9001ced4..2536b71ff 100644 --- a/docs/source/cmdline/download.rst +++ b/docs/source/cmdline/download.rst @@ -31,9 +31,10 @@ Options DANDI instance to download from [default: ``dandi``] -.. option:: -J, --jobs +.. option:: -J, --jobs N[:M] - Number of parallel download jobs [default: 6] + Number of parallel download jobs and, optionally, number of upload subjobs + per Zarr asset job [default: 6:4] .. option:: -o, --output-dir diff --git a/docs/source/cmdline/upload.rst b/docs/source/cmdline/upload.rst index d3bf14bdc..5367aa2c2 100644 --- a/docs/source/cmdline/upload.rst +++ b/docs/source/cmdline/upload.rst @@ -13,9 +13,10 @@ a :file:`dandiset.yaml` file must exist in the local :option:`--dandiset-path`. Local Dandisets should pass validation. For that, the assets should first be organized using the :ref:`dandi_organize` command. -By default, all :file:`*.nwb` files in the Dandiset (excluding directories -starting with a period) will be considered for the upload. You can point to -specific files you would like to validate and have uploaded. +By default, all :file:`*.nwb`, :file:`*.zarr`, and :file:`*.ngff` assets in the +Dandiset (ignoring directories starting with a period) will be considered for +the upload. You can point to specific files you would like to validate and +have uploaded. Options ------- @@ -66,7 +67,7 @@ set to a nonempty value. .. option:: --allow-any-path - Upload all file types, not just :file:`*.nwb`'s + Upload all file types, not just NWBs and Zarrs .. option:: --devel-debug diff --git a/docs/source/cmdline/validate.rst b/docs/source/cmdline/validate.rst index d35cd7d32..bc03476b3 100644 --- a/docs/source/cmdline/validate.rst +++ b/docs/source/cmdline/validate.rst @@ -5,7 +5,7 @@ dandi [] validate [ ...] -Validate files for NWB (and DANDI) compliance. +Validate files for NWB and DANDI compliance. Exits with non-zero exit code if any file is not compliant. @@ -19,7 +19,7 @@ set to a nonempty value. .. 
option:: --allow-any-path - Validate all file types, not just :file:`*.nwb`'s + Validate all file types, not just NWBs and Zarrs .. option:: --devel-debug diff --git a/docs/source/modref/files.rst b/docs/source/modref/files.rst new file mode 100644 index 000000000..c5a9abe77 --- /dev/null +++ b/docs/source/modref/files.rst @@ -0,0 +1,5 @@ +``dandi.files`` +=============== + +.. automodule:: dandi.files + :show-inheritance: diff --git a/docs/source/modref/index.rst b/docs/source/modref/index.rst index 96e4cdd43..6d6e78e8d 100644 --- a/docs/source/modref/index.rst +++ b/docs/source/modref/index.rst @@ -22,6 +22,8 @@ Mid-level user interfaces .. toctree:: dandiapi + files + misctypes Support functionality ===================== diff --git a/docs/source/modref/misctypes.rst b/docs/source/modref/misctypes.rst new file mode 100644 index 000000000..d1ee6a0b1 --- /dev/null +++ b/docs/source/modref/misctypes.rst @@ -0,0 +1,4 @@ +``dandi.misctypes`` +=================== + +.. automodule:: dandi.misctypes diff --git a/setup.cfg b/setup.cfg index f503562cd..73f0290c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ install_requires = appdirs click click-didyoumean - dandischema ~= 0.4.1 + dandischema ~= 0.5.1 etelemetry >= 0.2.2 fasteners fscacher @@ -42,6 +42,7 @@ install_requires = #hdmf != 1.1.2 humanize importlib-metadata; python_version < "3.8" + interleave ~= 0.1 joblib keyring keyrings.alt @@ -56,6 +57,7 @@ install_requires = semantic-version tenacity tqdm + zarr ~= 2.10 zip_safe = False packages = find: include_package_data = True diff --git a/tools/update-assets-on-server b/tools/update-assets-on-server index b032e9350..b1e544eac 100755 --- a/tools/update-assets-on-server +++ b/tools/update-assets-on-server @@ -18,6 +18,7 @@ import requests from dandi.dandiapi import DandiAPIClient from dandi.metadata import get_default_metadata, nwb2asset +from dandi.misctypes import Digest from dandi.support.digests import get_digest logging.basicConfig( @@ -37,10 +38,10 @@ 
def get_meta(path, digest=None): try: if digest is None: digest = get_digest(path, digest="dandi-etag") - localmeta = nwb2asset(path, digest=digest, digest_type="dandi_etag") + localmeta = nwb2asset(path, digest=Digest.dandi_etag(digest)) except Exception as e: ul.error(f"Error {e} getting {path}") - localmeta = get_default_metadata(path, digest=digest, digest_type="dandi_etag") + localmeta = get_default_metadata(path, digest=Digest.dandi_etag(digest)) return localmeta.json_dict()