diff --git a/src/mdio/api/__init__.py b/src/mdio/api/__init__.py index f731a65c2..44bc79850 100644 --- a/src/mdio/api/__init__.py +++ b/src/mdio/api/__init__.py @@ -1 +1,6 @@ """Public API.""" + +from mdio.api.create import create_empty +from mdio.api.create import create_empty_like + +__all__ = ["create_empty", "create_empty_like"] diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py new file mode 100644 index 000000000..c17a01091 --- /dev/null +++ b/src/mdio/api/create.py @@ -0,0 +1,168 @@ +"""Creating MDIO v1 datasets.""" + +from __future__ import annotations + +from datetime import UTC +from datetime import datetime +from typing import TYPE_CHECKING + +from mdio.api.io import _normalize_path +from mdio.api.io import open_mdio +from mdio.api.io import to_mdio +from mdio.builder.template_registry import TemplateRegistry +from mdio.builder.xarray_builder import to_xarray_dataset +from mdio.converters.segy import populate_dim_coordinates +from mdio.converters.type_converter import to_structured_type +from mdio.core.grid import Grid + +if TYPE_CHECKING: + from pathlib import Path + + from segy.schema import HeaderSpec + from upath import UPath + from xarray import Dataset as xr_Dataset + + from mdio.builder.schemas import Dataset + from mdio.builder.templates.base import AbstractDatasetTemplate + from mdio.core.dimension import Dimension + + +def create_empty( # noqa PLR0913 + mdio_template: AbstractDatasetTemplate | str, + dimensions: list[Dimension], + output_path: UPath | Path | str | None, + headers: HeaderSpec | None = None, + overwrite: bool = False, +) -> xr_Dataset: + """A function that creates an empty MDIO v1 file with known dimensions. + + Args: + mdio_template: The MDIO template or template name to use to define the dataset structure. + dimensions: The dimensions of the MDIO file. + output_path: The universal path for the output MDIO v1 file. + headers: SEG-Y v1.0 trace headers. Defaults to None. + overwrite: Whether to overwrite the output file if it already exists. Defaults to False. + + Returns: + The output MDIO dataset. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. + """ + output_path = _normalize_path(output_path) + + if not overwrite and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." + raise FileExistsError(err) + + header_dtype = to_structured_type(headers.dtype) if headers else None + grid = Grid(dims=dimensions) + if isinstance(mdio_template, str): + # A template name is passed in. Get a unit-unaware template from registry + mdio_template = TemplateRegistry().get(mdio_template) + # Build the dataset using the template + mdio_ds: Dataset = mdio_template.build_dataset(name=mdio_template.name, sizes=grid.shape, header_dtype=header_dtype) + + # Convert to xarray dataset + xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) + + # Populate coordinates using the grid + # For empty datasets, we only populate dimension coordinates + drop_vars_delayed = [] + xr_dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) + + if headers: + # Since the headers were provided, the user wants to export to SEG-Y + # Add a dummy segy_file_header variable used to export to SEG-Y + xr_dataset["segy_file_header"] = ((), "") + + # Create the Zarr store with the correct structure but with empty arrays + if output_path is not None: + to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False) + + # Write the dimension coordinates and trace mask + xr_dataset = xr_dataset[drop_vars_delayed + ["trace_mask"]] + + if output_path is not None: + to_mdio(xr_dataset, output_path=output_path, mode="r+", compute=True) + + return xr_dataset + + +def create_empty_like( # noqa PLR0913 + input_path: UPath | Path | str, + output_path: UPath | Path | str, + keep_coordinates: bool = False, + overwrite: bool = False, +) -> xr_Dataset: + """A function that creates an empty MDIO v1 file with the same structure as an existing one. + + Args: + input_path: The path of the input MDIO file. + output_path: The path of the output MDIO file. + If None, the output will not be written to disk. + keep_coordinates: Whether to keep the coordinates in the output file. + overwrite: Whether to overwrite the output file if it exists. + + Returns: + The output MDIO dataset. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. + """ + input_path = _normalize_path(input_path) + output_path = _normalize_path(output_path) if output_path is not None else None + + if not overwrite and output_path is not None and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." + raise FileExistsError(err) + + ds = open_mdio(input_path) + + # Create a copy with the same structure but no data or, + # optionally, coordinates + ds_output = ds.copy(data=None).reset_coords(drop=not keep_coordinates) + + # Dataset + # Keep the name (which is the same as the used template name) and the original API version + # ds_output.attrs["name"] + # ds_output.attrs["apiVersion"] + ds_output.attrs["createdOn"] = str(datetime.now(UTC)) + + # Coordinates + if not keep_coordinates: + for coord_name in ds_output.coords: + ds_output[coord_name].attrs.pop("unitsV1", None) + + # MDIO attributes + attr = ds_output.attrs["attributes"] + if attr is not None: + attr.pop("gridOverrides", None) # Empty dataset should not have gridOverrides + # Keep the original values for the following attributes + # attr["defaultVariableName"] + # attr["surveyType"] + # attr["gatherType"] + + # "All traces should be marked as dead in empty dataset" + if "trace_mask" in ds_output.variables: + ds_output["trace_mask"][:] = False + + # Data variable + var_name = attr["defaultVariableName"] + var = ds_output[var_name] + var.attrs.pop("statsV1", None) + if not keep_coordinates: + var.attrs.pop("unitsV1", None) + + # SEG-Y file header + if "segy_file_header" in ds_output.variables: + segy_file_header = ds_output["segy_file_header"] + if segy_file_header is not None: + segy_file_header.attrs.pop("textHeader", None) + segy_file_header.attrs.pop("binaryHeader", None) + segy_file_header.attrs.pop("rawBinaryHeader", None) + + if output_path is not None: + to_mdio(ds_output, output_path=output_path, mode="w", compute=True) + + return ds_output diff --git a/src/mdio/builder/dataset_builder.py b/src/mdio/builder/dataset_builder.py index 1cc515988..5fc0b288e 100644 --- a/src/mdio/builder/dataset_builder.py +++ b/src/mdio/builder/dataset_builder.py @@ -6,7 +6,6 @@ from enum import auto from typing import Any -from mdio import __version__ from mdio.builder.formatting_html import dataset_builder_repr_html from mdio.builder.schemas.compressors import ZFP from mdio.builder.schemas.compressors import Blosc @@ -59,6 +58,8 @@ class MDIODatasetBuilder: """ def __init__(self, name: str, attributes: dict[str, Any] | None = None): + from mdio import __version__ # noqa: PLC0415 - fixed circular import in mdio package and dataset_builder.py + self._metadata = DatasetMetadata( name=name, api_version=__version__, diff --git a/src/mdio/converters/__init__.py b/src/mdio/converters/__init__.py index fd88595ff..753a0fc56 100644 --- a/src/mdio/converters/__init__.py +++ b/src/mdio/converters/__init__.py @@ -1,6 +1,30 @@ """MDIO Data conversion API.""" -from mdio.converters.mdio import mdio_to_segy -from mdio.converters.segy import segy_to_mdio +from typing import TYPE_CHECKING +from typing import Any + +if TYPE_CHECKING: + from mdio.converters.mdio import mdio_to_segy + from mdio.converters.segy import segy_to_mdio __all__ = ["mdio_to_segy", "segy_to_mdio"] + + +def __getattr__(name: str) -> Any: # noqa: ANN401 - required for dynamic attribute access + """Lazy import for converters to avoid circular imports.""" + if name == "mdio_to_segy": + from mdio.converters.mdio import ( # noqa: PLC0415 - intentionally inside the function to avoid circular imports + mdio_to_segy, + ) + + return mdio_to_segy + + if name == "segy_to_mdio": + from mdio.converters.segy import ( # noqa: PLC0415 - intentionally inside the function to avoid circular imports + segy_to_mdio, + ) + + return segy_to_mdio + + err = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(err) diff --git a/tests/conftest.py b/tests/conftest.py index 2a32b8cd4..76bf90f9d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,18 @@ def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: return tmp_path_factory.mktemp(r"mdio") +@pytest.fixture(scope="session") +def teapot_mdio_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for the output MDIO.""" + return tmp_path_factory.mktemp(r"teapot.mdio") + + +@pytest.fixture(scope="module") +def mdio_4d_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for the output MDIO.""" + return tmp_path_factory.mktemp(r"tmp_4d.mdio") + + @pytest.fixture(scope="module") def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: # pragma: no cover - used by disabled test """Make a temp file for the output MDIO.""" @@ -58,3 +70,9 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the round-trip IBM SEG-Y.""" tmp_dir = tmp_path_factory.mktemp("segy") return tmp_dir / "teapot_roundtrip.segy" + + +@pytest.fixture(scope="class") +def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for empty MDIO testing.""" + return tmp_path_factory.mktemp(r"empty_mdio_dir") diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index d05070f5d..dbcc4dca3 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -39,7 +39,7 @@ class TestImport4DNonReg: # pragma: no cover - tests is skipped def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], chan_header_type: StreamerShotGeometryType, ) -> None: @@ -51,7 +51,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -62,7 +62,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override @@ -86,7 +86,7 @@ class TestImport4D: def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], chan_header_type: StreamerShotGeometryType, ) -> None: @@ -98,7 +98,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -109,7 +109,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"].get("gridOverrides", None) == grid_override # may not exist, so default=None @@ -134,7 +134,7 @@ class TestImport4DSparse: def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, chan_header_type: StreamerShotGeometryType, ) -> None: """Test importing a SEG-Y file to MDIO.""" @@ -148,7 +148,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, ) @@ -167,7 +167,7 @@ class TestImport6D: # pragma: no cover - tests is skipped def test_import_6d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], chan_header_type: StreamerShotGeometryType, ) -> None: @@ -179,7 +179,7 @@ def test_import_6d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("XYZ"), # Placeholder for the template input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -195,7 +195,7 @@ def test_import_6d_segy( # noqa: PLR0913 guns = [1, 2] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) xrt.assert_duckarray_equal(ds["gun"], guns) xrt.assert_duckarray_equal(ds["shot_point"], shots) diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index b950a8f03..86e401092 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -9,12 +9,13 @@ import numpy as np import numpy.testing as npt import pytest -from segy.schema import HeaderField -from segy.schema import ScalarType -from segy.standards import get_segy_standard +from tests.integration.testing_helpers import UNITS_METER +from tests.integration.testing_helpers import UNITS_MILLISECOND +from tests.integration.testing_helpers import UNITS_NONE from tests.integration.testing_helpers import get_inline_header_values +from tests.integration.testing_helpers import get_teapot_segy_spec from tests.integration.testing_helpers import get_values -from tests.integration.testing_helpers import validate_variable +from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio import mdio_to_segy @@ -43,13 +44,7 @@ def set_env_vars(monkeypatch: Generator[pytest.MonkeyPatch]) -> None: @pytest.fixture def teapot_segy_spec() -> SegySpec: """Return the customized SEG-Y specification for the teapot dome dataset.""" - teapot_fields = [ - HeaderField(name="inline", byte=17, format=ScalarType.INT32), - HeaderField(name="crossline", byte=13, format=ScalarType.INT32), - HeaderField(name="cdp_x", byte=81, format=ScalarType.INT32), - HeaderField(name="cdp_y", byte=85, format=ScalarType.INT32), - ] - return get_segy_standard(1.0).customize(trace_header_fields=teapot_fields) + return get_teapot_segy_spec() def text_header_teapot_dome() -> str: @@ -156,25 +151,29 @@ class TestTeapotRoundtrip: def test_teapot_import( self, segy_input: Path, - zarr_tmp: Path, + teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec, ) -> None: """Test importing a SEG-Y file to MDIO. NOTE: This test must be executed before the 'TestReader' and 'TestExport' tests. """ + unit_aware_template = TemplateRegistry().get("PostStack3DTime") + unit_aware_template.add_units({"time": UNITS_MILLISECOND}) + unit_aware_template.add_units({"cdp_x": UNITS_METER}) + unit_aware_template.add_units({"cdp_y": UNITS_METER}) segy_to_mdio( segy_spec=teapot_segy_spec, - mdio_template=TemplateRegistry().get("PostStack3DTime"), + mdio_template=unit_aware_template, input_path=segy_input, - output_path=zarr_tmp, + output_path=teapot_mdio_tmp, overwrite=True, ) @pytest.mark.dependency("test_3d_import") - def test_dataset_metadata(self, zarr_tmp: Path) -> None: + def test_dataset_metadata(self, teapot_mdio_tmp: Path) -> None: """Metadata reading tests.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) expected_attrs = { "apiVersion": __version__, "createdOn": "2025-08-06 16:21:54.747880+00:00", @@ -202,9 +201,9 @@ def test_dataset_metadata(self, zarr_tmp: Path) -> None: assert segy_file_header.attrs["binaryHeader"] == binary_header_teapot_dome() assert segy_file_header.attrs["rawBinaryHeader"] == raw_binary_header_teapot_dome() - def test_variable_metadata(self, zarr_tmp: Path) -> None: + def test_variable_metadata(self, teapot_mdio_tmp: Path) -> None: """Metadata reading tests.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) expected_attrs = { "count": 46854270, "sum": -8594.551589292674, @@ -219,77 +218,76 @@ def test_variable_metadata(self, zarr_tmp: Path) -> None: expected_attrs.pop("histogram") np.testing.assert_allclose(list(actual_attrs.values()), list(expected_attrs.values())) - def test_grid(self, zarr_tmp: Path, teapot_segy_spec: SegySpec) -> None: + def test_grid(self, teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec) -> None: """Test validating MDIO variables.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) # Validate the dimension coordinate variables - validate_variable(ds, "inline", (345,), ("inline",), np.int32, range(1, 346), get_values) - validate_variable(ds, "crossline", (188,), ("crossline",), np.int32, range(1, 189), get_values) - validate_variable(ds, "time", (1501,), ("time",), np.int32, range(0, 3002, 2), get_values) + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, False, range(1, 346), get_values) + validate_xr_variable( + ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values + ) + validate_xr_variable( + ds, "time", {"time": 1501}, UNITS_MILLISECOND, np.int32, False, range(0, 3002, 2), get_values + ) # Validate the non-dimensional coordinate variables - validate_variable(ds, "cdp_x", (345, 188), ("inline", "crossline"), np.float64, None, None) - validate_variable(ds, "cdp_y", (345, 188), ("inline", "crossline"), np.float64, None, None) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) # Validate the headers # We have a custom set of headers since we used customize_segy_specs() segy_spec = teapot_segy_spec data_type = segy_spec.trace.header.dtype - validate_variable( + validate_xr_variable( ds, "headers", - (345, 188), - ("inline", "crossline"), + {"inline": 345, "crossline": 188}, + UNITS_NONE, data_type.newbyteorder("native"), # mdio saves with machine endian, spec could be different endian + False, range(1, 346), get_inline_header_values, ) # Validate the trace mask - validate_variable(ds, "trace_mask", (345, 188), ("inline", "crossline"), np.bool, None, None) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool) # validate the amplitude data - validate_variable( - ds, - "amplitude", - (345, 188, 1501), - ("inline", "crossline", "time"), - np.float32, - None, - None, + validate_xr_variable( + ds, "amplitude", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_NONE, np.float32, True ) - def test_inline_reads(self, zarr_tmp: Path) -> None: + def test_inline_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 75 inlines' mean and std. dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) inlines = ds["amplitude"][::75, :, :] mean, std = inlines.mean(dtype="float64"), inlines.std(dtype="float64") npt.assert_allclose([mean, std], [0.00010555267, 0.60027058412]) # 11 precision - def test_crossline_reads(self, zarr_tmp: Path) -> None: + def test_crossline_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 75 crosslines' mean and std. dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) xlines = ds["amplitude"][:, ::75, :] mean, std = xlines.mean(dtype="float64"), xlines.std(dtype="float64") npt.assert_allclose([mean, std], [-5.03298501828e-05, 0.59406807762]) # 11 precision - def test_zslice_reads(self, zarr_tmp: Path) -> None: + def test_zslice_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 225 z-slices' mean and std. dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) slices = ds["amplitude"][:, :, ::225] mean, std = slices.mean(dtype="float64"), slices.std(dtype="float64") npt.assert_allclose([mean, std], [0.00523692339, 0.61279943571]) # 11 precision @pytest.mark.dependency("test_3d_import") def test_3d_export( - self, segy_input: Path, zarr_tmp: Path, segy_export_tmp: Path, teapot_segy_spec: SegySpec + self, segy_input: Path, teapot_mdio_tmp: Path, segy_export_tmp: Path, teapot_segy_spec: SegySpec ) -> None: """Test 3D export.""" rng = np.random.default_rng(seed=1234) - mdio_to_segy(segy_spec=teapot_segy_spec, input_path=zarr_tmp, output_path=segy_export_tmp) + mdio_to_segy(segy_spec=teapot_segy_spec, input_path=teapot_mdio_tmp, output_path=segy_export_tmp) # Check if file sizes match on IBM file. assert segy_input.stat().st_size == segy_export_tmp.stat().st_size diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py new file mode 100644 index 000000000..517b5005b --- /dev/null +++ b/tests/integration/test_z_create_empty.py @@ -0,0 +1,398 @@ +"""Test for create_empty_mdio function. + +This set of tests has to run after the segy_roundtrip_teapot tests have run because +the teapot dataset is used as the input for the create_empty_like test. + +NOTE: The only reliable way to ensure the test order (including the case when the +test are run in parallel) is to use the alphabetical order of the test names. +""" + +from __future__ import annotations + +import math +from typing import TYPE_CHECKING + +import numpy as np +import pytest +from segy.standards import get_segy_standard + +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.schemas.v1.units import SpeedUnitEnum +from mdio.builder.schemas.v1.units import SpeedUnitModel +from mdio.builder.schemas.v1.units import TimeUnitEnum +from mdio.builder.schemas.v1.units import TimeUnitModel + +if TYPE_CHECKING: + from pathlib import Path + + from xarray import Dataset as xr_Dataset + + +from tests.integration.testing_helpers import UNITS_METER +from tests.integration.testing_helpers import UNITS_METERS_PER_SECOND +from tests.integration.testing_helpers import UNITS_MILLISECOND +from tests.integration.testing_helpers import UNITS_NONE +from tests.integration.testing_helpers import UNITS_SECOND +from tests.integration.testing_helpers import get_teapot_segy_spec +from tests.integration.testing_helpers import get_values +from tests.integration.testing_helpers import validate_xr_variable + +from mdio import __version__ +from mdio.api.create import create_empty +from mdio.api.create import create_empty_like +from mdio.api.io import open_mdio +from mdio.api.io import to_mdio +from mdio.builder.schemas.v1.stats import CenteredBinHistogram +from mdio.builder.schemas.v1.stats import SummaryStatistics +from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate +from mdio.converters.mdio import mdio_to_segy +from mdio.core import Dimension + + +class PostStack3DVelocityMetricTemplate(Seismic3DPostStackTemplate): + """Custom template that uses 'velocity' as the default variable name instead of 'amplitude'.""" + + @property + def _default_variable_name(self) -> str: + """Override the default variable name.""" + return "velocity" + + def __init__(self, data_domain: str) -> None: + super().__init__(data_domain) + self._units.update( + { + "time": UNITS_MILLISECOND, + "cdp_x": UNITS_METER, + "cdp_y": UNITS_METER, + "velocity": UNITS_METERS_PER_SECOND, + } + ) + + @property + def _name(self) -> str: + """Override the name of the template.""" + domain_suffix = self._data_domain.capitalize() + return f"PostStack3DVelocity{domain_suffix}" + + +class TestCreateEmptyMdio: + """Tests for create_empty_mdio function.""" + + @classmethod + def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> xr_Dataset: + """Create a temporary empty MDIO file for testing.""" + # Create the grid with the specified dimensions + dims = [ + Dimension(name="inline", coords=range(1, 346, 1)), + Dimension(name="crossline", coords=range(1, 189, 1)), + Dimension(name="time", coords=range(0, 3002, 2)), + ] + + # If later on, we want to export to SEG-Y, we need to provide the trace header spec. + # The HeaderSpec can be either standard or customized. + headers = get_teapot_segy_spec().trace.header if create_headers else None + # Create an empty MDIO v1 metric post-stack 3D time velocity dataset + return create_empty( + mdio_template=PostStack3DVelocityMetricTemplate(data_domain="time"), + dimensions=dims, + output_path=output_path, + headers=headers, + overwrite=overwrite, + ) + + @classmethod + def validate_teapot_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: + """Validate the dataset metadata.""" + if is_velocity: + assert ds.name == "PostStack3DVelocityTime" + else: + assert ds.name == "PostStack3DTime" + + # Check basic metadata attributes + expected_attrs = { + "apiVersion": __version__, + "name": ds.name, + } + actual_attrs_json = ds.attrs + + # Compare one by one due to ever changing createdOn + for key, value in expected_attrs.items(): + assert key in actual_attrs_json + if key == "createdOn": + assert actual_attrs_json[key] is not None + else: + assert actual_attrs_json[key] == value + + # Check that createdOn exists + assert "createdOn" in actual_attrs_json + + # Validate template attributes + attributes = ds.attrs["attributes"] + assert attributes is not None + assert len(attributes) == 3 + # Validate all attributes provided by the abstract template + if is_velocity: + assert attributes["defaultVariableName"] == "velocity" + else: + assert attributes["defaultVariableName"] == "amplitude" + assert attributes["surveyType"] == "3D" + assert attributes["gatherType"] == "stacked" + + @classmethod + def validate_teapot_dataset_variables( + cls, ds: xr_Dataset, header_dtype: np.dtype | None, is_velocity: bool + ) -> None: + """Validate an empty MDIO dataset structure and content.""" + # Check that the dataset has the expected shape + assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} + + # Validate the dimension coordinate variables + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, False, range(1, 346), get_values) + validate_xr_variable( + ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values + ) + validate_xr_variable( + ds, "time", {"time": 1501}, UNITS_MILLISECOND, np.int32, False, range(0, 3002, 2), get_values + ) + + # Validate the non-dimensional coordinate variables (should be empty for empty dataset) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) + + if header_dtype is not None: + # Validate the headers (should be empty for empty dataset) + # Infer the dtype from segy_spec and ignore endianness + header_dtype = header_dtype.newbyteorder("native") + validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype) + validate_xr_variable(ds, "segy_file_header", dims={}, units=UNITS_NONE, data_type=np.dtype("U1")) + else: + assert "headers" not in ds.variables + assert "segy_file_header" not in ds.variables + + # Validate the trace mask (should be all True for empty dataset) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_) + trace_mask = ds["trace_mask"].values + assert not np.any(trace_mask), "Expected all `False` values in `trace_mask` but found `True`." + + # Validate the velocity or amplitude data (should be empty) + if is_velocity: + validate_xr_variable( + ds, "velocity", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_METERS_PER_SECOND, np.float32 + ) + else: + validate_xr_variable( + ds, "amplitude", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_NONE, np.float32 + ) + + @pytest.fixture(scope="class") + def mdio_with_headers(self, empty_mdio_dir: Path) -> Path: + """Create a temporary empty MDIO file for testing. + + This fixture is scoped to the class level, so it will be executed only once + and shared across all test methods in the class. + """ + empty_mdio: Path = empty_mdio_dir / "mdio_with_headers.mdio" + xr_dataset = self._create_empty_mdio(create_headers=True, output_path=empty_mdio) + assert xr_dataset is not None + return empty_mdio + + @pytest.fixture(scope="class") + def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: + """Create a temporary empty MDIO file for testing. + + This fixture is scoped to the class level, so it will be executed only once + and shared across all test methods in the class. + """ + empty_mdio: Path = empty_mdio_dir / "mdio_no_headers.mdio" + xr_dataset = self._create_empty_mdio(create_headers=False, output_path=empty_mdio) + assert xr_dataset is not None + return empty_mdio + + def test_dataset_metadata(self, mdio_with_headers: Path) -> None: + """Test dataset metadata for empty MDIO file.""" + ds = open_mdio(mdio_with_headers) + self.validate_teapot_dataset_metadata(ds, is_velocity=True) + + def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: + """Test grid validation for empty MDIO file.""" + ds = open_mdio(mdio_with_headers) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) + + ds = open_mdio(mdio_no_headers) + self.validate_teapot_dataset_variables(ds, header_dtype=None, is_velocity=True) + + def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: + """Test overwrite parameter behavior in create_empty_mdio.""" + empty_mdio = empty_mdio_dir / "empty.mdio" + empty_mdio.mkdir(parents=True, exist_ok=True) + garbage_file = empty_mdio / "garbage.txt" + garbage_file.write_text("This is garbage data that should be overwritten") + garbage_dir = empty_mdio / "garbage_dir" + garbage_dir.mkdir(exist_ok=True) + (garbage_dir / "nested_garbage.txt").write_text("More garbage") + + # Verify the directory exists with garbage data + assert empty_mdio.exists() + assert garbage_file.exists() + assert garbage_dir.exists() + + # Second call: Try to create MDIO with overwrite=False - should raise FileExistsError + with pytest.raises(FileExistsError, match="Output location.*exists"): + self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=False) + + # Third call: Create MDIO with overwrite=True - should succeed and overwrite garbage + xr_dataset = self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=True) + assert xr_dataset is not None + + # Validate that the MDIO file can be loaded correctly using the helper function + ds = open_mdio(empty_mdio) + self.validate_teapot_dataset_metadata(ds, is_velocity=True) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) + + # Verify the garbage data was overwritten (should not exist) + assert not garbage_file.exists(), "Garbage file should have been overwritten" + assert not garbage_dir.exists(), "Garbage directory should have been overwritten" + + def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: + """Test showing how to populate empty dataset.""" + # Open an empty PostStack3DVelocityTime dataset with SEG-Y 1.0 headers + # + # When this empty dataset was created from the 'PostStack3DVelocityTime' template and dimensions, + # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and pre-populated + # NOTE: the 'time' units are specified in the template, so they are not None in this case. + # * 'cdp_x', 'cdp_y' non-dimensional coordinate variables were created + # NOTE: the 'cdp_x' and 'cdp_y' units are specified in the template, so they are not None in this case. + # * 'velocity' variable was created (the name of this default variable is specified in the template) + # NOTE: the 'velocity' units are specified in the template, so they are not None in this case. + # * 'trace_mask' variable was created and pre-populated with 'False' fill values + # (all traces are marked as dead) + # * 'headers' and 'segy_file_header' variables were created (if the dataset was created with + # headers not None). The 'headers' variable structured datatype is defined by the HeaderSpec + # that was used to create the empty MDIO + # * dataset attribute called 'attributes' was created + ds = open_mdio(mdio_with_headers) + + # 1) Populate dataset's velocity + var_name = ds.attrs["attributes"]["defaultVariableName"] + velocity = ds[var_name] + velocity[:5, :, :] = 1 + velocity[5:10, :, :] = 2 + velocity[50:100, :, :] = 3 + velocity[150:175, :, :] = -1 + + # 2) Populate dataset's velocity statistics (optional) + nonzero_samples = np.ma.masked_invalid(velocity, copy=False) + stats = SummaryStatistics( + count=nonzero_samples.count(), + min=nonzero_samples.min(), + max=nonzero_samples.max(), + sum=nonzero_samples.sum(dtype="float64"), + sum_squares=(np.ma.power(nonzero_samples, 2).sum(dtype="float64")), + histogram=CenteredBinHistogram(bin_centers=[], counts=[]), + ) + velocity.attrs["statsV1"] = stats.model_dump(mode="json") + + # 3) Populate the non-dimensional coordinate variables 'cdp_x' and 'cdp_y' (optional) + origin = [270000, 3290000] # survey x, y origin + inline_azimuth_rad = 0.523599 # survey orientation, in radians, from the north to the east (30 degrees) + spacing = [50, 50] # survey inline, crossline spacing + inline_grid, xline_grid = np.meshgrid(ds.inline.values, ds.crossline.values, indexing="ij") + sin_azimuth = math.sin(inline_azimuth_rad) + cos_azimuth = math.cos(inline_azimuth_rad) + ds.cdp_x[:] = origin[0] + inline_grid * spacing[0] * sin_azimuth + xline_grid * spacing[1] * cos_azimuth + ds.cdp_y[:] = origin[1] + inline_grid * spacing[0] * cos_azimuth - xline_grid * spacing[1] * sin_azimuth + + # 4) Populate dataset's trace mask (optional) + ds.trace_mask[:] = ~np.isnan(velocity[:, :, 0]) + + # 5) If the units were not set in the template or you want to change the coordinate and data variable units + # you can set the unitsV1 attribute for the coordinate and data variables (optional). + # If you are happy with the units specified in the template, you should skip this step. + ds.time.attrs["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump(mode="json") + + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump(mode="json") + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump(mode="json") + + velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump(mode="json") + + # 6) Populate dataset's segy trace headers, if those were created (required only if we want to export to SEG-Y) + if "headers" in ds.variables: + # Both the structured "headers" and the dummy "segy_file_header" variables are + # required to enable SEG-Y to MDIO conversion + + # Populate the structured trace "headers" variable + ds["headers"].values["inline"] = inline_grid + ds["headers"].values["crossline"] = xline_grid + # coordinate_scalar: + # Scalar to be applied to all coordinates specified in Standard Trace Header bytes + # 73–88 and to bytes Trace Header 181–188 to give the real value. Scalar = 1, + # ±10, ±100, ±1000, or ±10,000. If positive, scalar is used as a multiplier; if + # negative, scalar is used as divisor. A value of zero is assumed to be a scalar + # value of 1. + ds["headers"].values["coordinate_scalar"][:] = np.int16(-100) + ds["headers"].values["cdp_x"][:] = np.int32(ds.cdp_x * 100) + ds["headers"].values["cdp_y"][:] = np.int32(ds.cdp_y * 100) + + # Fill its metadata (.attrs) with 'textHeader' and 'binaryHeader'. + ds["segy_file_header"].attrs.update( + { + "textHeader": "\n".join( + [ + "C01 BYTES 13-16: CROSSLINE " + " " * 47, + "C02 BYTES 17-20: INLINE " + " " * 47, + "C03 BYTES 71-74: COORDINATE SCALAR " + " " * 47, + "C04 BYTES 181-184: CDP X " + " " * 47, + "C05 BYTES 185-188: CDP Y " + " " * 47, + *(f"C{i:02d}" + " " * 77 for i in range(6, 41)), + ] + ), + "binaryHeader": { + "data_sample_format": 1, + "sample_interval": int(ds.time[1] - ds.time[0]), + "samples_per_trace": ds.time.size, + "segy_revision_major": 0, + "segy_revision_minor": 0, + }, + } + ) + + # 7) Create dataset's custom attributes (optional) + ds.attrs["attributes"]["createdBy"] = "John Doe" + + # 8) Export to MDIO + output_path_mdio = mdio_with_headers.parent / "populated_empty.mdio" + to_mdio(ds, output_path=output_path_mdio, mode="w", compute=True) + + # 9) Convert the populated empty MDIO to SEG-Y + if "headers" in ds.variables: + # Select the SEG-Y standard to use for the conversion + custom_segy_spec = get_segy_standard(1.0) + # Customize to use the same HeaderSpec that was used to create the empty MDIO + custom_segy_spec.trace.header = get_teapot_segy_spec().trace.header + # Convert the MDIO file to SEG-Y + mdio_to_segy( + segy_spec=custom_segy_spec, + input_path=output_path_mdio, + output_path=mdio_with_headers.parent / "populated_empty.sgy", + ) + + def test_create_empty_like(self, teapot_mdio_tmp: Path, empty_mdio_dir: Path) -> None: + """Create an empty MDIO file like the input MDIO file. + + This test has to run after the segy_roundtrip_teapot tests have run because + its uses 'teapot_mdio_tmp' created by the segy_roundtrip_teapot tests as the input. + """ + ds = create_empty_like( + input_path=teapot_mdio_tmp, + output_path=empty_mdio_dir / "create_empty_like.mdio", + keep_coordinates=True, + overwrite=True, + ) + assert ds is not None + + self.validate_teapot_dataset_metadata(ds, is_velocity=False) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=False) diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index c871ba6db..27a35d985 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -4,7 +4,37 @@ import numpy as np import xarray as xr -from numpy.typing import DTypeLike +from segy.schema import HeaderField +from segy.schema import ScalarType +from segy.schema.segy import SegySpec +from segy.standards import get_segy_standard + +from mdio.builder.schemas.v1.units import AllUnitModel +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.schemas.v1.units import SpeedUnitEnum +from mdio.builder.schemas.v1.units import SpeedUnitModel +from mdio.builder.schemas.v1.units import TimeUnitEnum +from mdio.builder.schemas.v1.units import TimeUnitModel + +UNITS_NONE = None +UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) +UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) +UNITS_MILLISECOND = TimeUnitModel(time=TimeUnitEnum.MILLISECOND) +UNITS_METERS_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METERS_PER_SECOND) +UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) +UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND) + + +def get_teapot_segy_spec() -> SegySpec: + """Return the customized SEG-Y specification for the teapot dome dataset.""" + teapot_fields = [ + HeaderField(name="inline", byte=17, format=ScalarType.INT32), + HeaderField(name="crossline", byte=13, format=ScalarType.INT32), + HeaderField(name="cdp_x", byte=81, format=ScalarType.INT32), + HeaderField(name="cdp_y", byte=85, format=ScalarType.INT32), + ] + return get_segy_standard(1.0).customize(trace_header_fields=teapot_fields) def get_values(arr: xr.DataArray) -> np.ndarray: @@ -17,35 +47,49 @@ def get_inline_header_values(dataset: xr.Dataset) -> np.ndarray: return dataset["inline"].values -def validate_variable( # noqa PLR0913 +def validate_xr_variable( # noqa PLR0913 dataset: xr.Dataset, name: str, - shape: tuple[int, ...], - dims: tuple[str, ...], - data_type: DTypeLike, - expected_values: range | None, + dims: dict[int], + units: AllUnitModel, + data_type: np.dtype, + has_stats: bool = False, + expected_values: range | None = None, actual_value_generator: Callable[[xr.DataArray], np.ndarray] | None = None, ) -> None: """Validate the properties of a variable in an Xarray dataset.""" - arr = dataset[name] - assert shape == arr.shape - assert set(dims) == set(arr.dims) + v = dataset[name] + assert v is not None + assert v.sizes == dims if hasattr(data_type, "fields") and data_type.fields is not None: # The following assertion will fail because of differences in offsets # assert data_type == arr.dtype # Compare field names expected_names = list(data_type.names) - actual_names = list(arr.dtype.names) + actual_names = list(v.dtype.names) assert expected_names == actual_names # Compare field types expected_types = [data_type[name] for name in data_type.names] - actual_types = [arr.dtype[name] for name in arr.dtype.names] + actual_types = [v.dtype[name] for name in v.dtype.names] assert expected_types == actual_types else: - assert data_type == arr.dtype + assert data_type == v.dtype + + stats = v.attrs.get("statsV1", None) + if has_stats: + assert stats is not None, "StatsV1 should not be empty for dataset variables with stats" + else: + assert stats is None, "StatsV1 should be empty for dataset variables without stats" + + if units is not None: + units_v1 = v.attrs.get("unitsV1", None) + assert units_v1 is not None, "UnitsV1 should not be empty for dataset variables with units" + assert units_v1 == units.model_dump(mode="json") + else: + assert "unitsV1" not in v.attrs, "UnitsV1 should not exist for unit-unaware variables" if expected_values is not None and actual_value_generator is not None: - actual_values = actual_value_generator(arr) + actual_values = actual_value_generator(v) assert np.array_equal(expected_values, actual_values) diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index 45c19665f..464d9e6cc 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -1,5 +1,6 @@ """Tests the schema v1 dataset_serializer public API.""" +import shutil from pathlib import Path import numpy as np @@ -302,4 +303,7 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> No xr_ds = to_xarray_dataset(dataset) file_path = f"{tmp_path}/{xr_ds.attrs['name']}.zarr" + # Delete the directory if it exists from running previous tests + if Path(file_path).exists(): + shutil.rmtree(file_path) to_mdio(xr_ds, output_path=file_path, mode="w-", compute=False)