From 46cfc1d9ba32f8b0706e96183f1734daa4d0f08e Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 29 Sep 2025 16:27:01 +0000 Subject: [PATCH 01/27] create empty --- src/mdio/converters/segy.py | 58 +++++++++ tests/conftest.py | 18 ++- tests/integration/test_create_empty_mdio.py | 122 ++++++++++++++++++ .../test_segy_import_export_masked.py | 2 +- 4 files changed, 194 insertions(+), 6 deletions(-) create mode 100644 tests/integration/test_create_empty_mdio.py diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index cdf9f88e3..b0e05ddc1 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -417,3 +417,61 @@ def segy_to_mdio( # noqa PLR0913 dataset=xr_dataset, data_variable_name=default_variable_name, ) + + +def create_empty_mdio( # noqa PLR0913 + segy_spec: SegySpec, + mdio_template: AbstractDatasetTemplate, + grid: Grid, + output_path: UPath | Path | str, + overwrite: bool = False, +) -> None: + """A function that creates an empty MDIO v1 file with known dimensions. + + Args: + segy_spec: The SEG-Y specification to use for trace headers. + mdio_template: The MDIO template to use to define the dataset structure. + grid: The grid specifying the dimensions of the MDIO file. + output_path: The universal path for the output MDIO v1 file. + overwrite: Whether to overwrite the output file if it already exists. Defaults to False. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. + """ + output_path = _normalize_path(output_path) + + if not overwrite and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." 
+ raise FileExistsError(err) + + # Build the dataset structure using the template and grid + header_dtype = to_structured_type(segy_spec.trace.header.dtype) + horizontal_unit = _get_horizontal_coordinate_unit(grid.dims) + mdio_ds: Dataset = mdio_template.build_dataset( + name=mdio_template.name, + sizes=grid.shape, + horizontal_coord_unit=horizontal_unit, + header_dtype=header_dtype, + ) + + # Convert to xarray dataset + xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) + + # Populate coordinates using the grid + # For empty datasets, we only populate dimension coordinates + drop_vars_delayed = [] + dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) + + # Set the trace mask to indicate all traces are live (since this is an empty dataset) + if grid.live_mask is not None: + dataset.trace_mask.data[:] = grid.live_mask + else: + # If live_mask is None, create a mask where all traces are live + dataset.trace_mask.data[:] = True + + # Create the Zarr store with the correct structure but with empty arrays + to_mdio(dataset, output_path=output_path, mode="w", compute=False) + + # Write the dimension coordinates and trace mask + meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] + to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) diff --git a/tests/conftest.py b/tests/conftest.py index 24bc06ef9..3a27e1767 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ def fake_segy_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the fake SEG-Y files we are going to create.""" if DEBUG_MODE: - return Path("TMP/fake_segy") + return Path("tmp/fake_segy") return tmp_path_factory.mktemp(r"fake_segy") @@ -37,7 +37,7 @@ def segy_input_uri() -> str: def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> Path: """Download teapot dome dataset for testing.""" if DEBUG_MODE: - tmp_dir = Path("TMP/segy") + tmp_dir = Path("tmp/segy") 
tmp_dir.mkdir(parents=True, exist_ok=True) else: tmp_dir = tmp_path_factory.mktemp("segy") @@ -50,7 +50,7 @@ def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the output MDIO.""" if DEBUG_MODE: - return Path("TMP/mdio") + return Path("tmp/mdio") return tmp_path_factory.mktemp(r"mdio") @@ -58,7 +58,7 @@ def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the output MDIO.""" if DEBUG_MODE: - return Path("TMP/mdio2") + return Path("tmp/mdio2") return tmp_path_factory.mktemp(r"mdio2") @@ -66,8 +66,16 @@ def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the round-trip IBM SEG-Y.""" if DEBUG_MODE: - tmp_dir = Path("TMP/segy") + tmp_dir = Path("tmp/segy") tmp_dir.mkdir(parents=True, exist_ok=True) else: tmp_dir = tmp_path_factory.mktemp("segy") return tmp_dir / "teapot_roundtrip.segy" + + +@pytest.fixture(scope="class") +def empty_mdio(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for empty MDIO testing.""" + if DEBUG_MODE: + return Path("tmp/empty_mdio") + return tmp_path_factory.mktemp(r"empty_mdio") diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py new file mode 100644 index 000000000..73a8753a9 --- /dev/null +++ b/tests/integration/test_create_empty_mdio.py @@ -0,0 +1,122 @@ +"""Test for create_empty_mdio function.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import pytest +from segy.standards import get_segy_standard + +if TYPE_CHECKING: + from pathlib import Path + + from segy.schema import SegySpec +from tests.integration.testing_helpers import get_values +from tests.integration.testing_helpers import 
validate_variable + +from mdio import __version__ +from mdio.api.io import open_mdio +from mdio.builder.template_registry import get_template +from mdio.converters.segy import create_empty_mdio +from mdio.core import Dimension +from mdio.core import Grid + + +class TestCreateEmptyPostStack3DTimeMdio: + """Tests for create_empty_mdio function.""" + + @pytest.fixture(scope="class") + def segy_spec(self) -> SegySpec: + """Return the SEG-Y specification for the test.""" + return get_segy_standard(1.0) + + @pytest.fixture(scope="class") + def empty_mdio_path(self, segy_spec: SegySpec, empty_mdio: Path) -> Path: + """Create a temporary empty MDIO file for testing. + + This fixture is scoped to the class level, so it will be executed only once + and shared across all test methods in the class. + """ + # Create the grid with the specified dimensions + grid = Grid( + dims=[ + Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 + Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 + Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate + ] + ) + + mdio_template = get_template("PostStack3DTime") + + # Call create_empty_mdio + create_empty_mdio( + segy_spec=segy_spec, + mdio_template=mdio_template, + grid=grid, + output_path=empty_mdio, + overwrite=True + ) + + return empty_mdio + + def test_dataset_metadata(self, empty_mdio_path: Path) -> None: + """Test dataset metadata for empty MDIO file.""" + ds = open_mdio(empty_mdio_path) + + # Check basic metadata attributes + expected_attrs = { + "apiVersion": __version__, + "name": "PostStack3DTime", + } + actual_attrs_json = ds.attrs + + # Compare one by one due to ever changing createdOn + for key, value in expected_attrs.items(): + assert key in actual_attrs_json + if key == "createdOn": + assert actual_attrs_json[key] is not None + else: + assert actual_attrs_json[key] == value + + # Check that createdOn exists + assert "createdOn" in actual_attrs_json 
+ assert actual_attrs_json["createdOn"] is not None + + # Validate template attributes + attributes = ds.attrs["attributes"] + assert attributes is not None + assert len(attributes) == 3 + # Validate all attributes provided by the abstract template + assert attributes["defaultVariableName"] == "amplitude" + assert attributes["surveyType"] == "3D" + assert attributes["gatherType"] == "stacked" + + def test_grid(self, empty_mdio_path: Path, segy_spec: SegySpec) -> None: + """Test grid validation for empty MDIO file.""" + ds = open_mdio(empty_mdio_path) + + # Check that the dataset has the expected shape + assert ds.sizes == {"inline": 200, "crossline": 300, "time": 750} + + # Validate the dimension coordinate variables + validate_variable(ds, "inline", (200,), ("inline",), np.int32, range(100, 300), get_values) + validate_variable(ds, "crossline", (300,), ("crossline",), np.int32, range(1000, 1600, 2), get_values) + validate_variable(ds, "time", (750,), ("time",), np.int32, range(0, 3000, 4), get_values) + + # Validate the non-dimensional coordinate variables (should be empty for empty dataset) + validate_variable(ds, "cdp_x", (200, 300), ("inline", "crossline"), np.float64, None, None) + validate_variable(ds, "cdp_y", (200, 300), ("inline", "crossline"), np.float64, None, None) + + # Validate the headers (should be empty for empty dataset) + # Infer the dtype from segy_spec and ignore endianness + header_dtype = segy_spec.trace.header.dtype.newbyteorder("native") + validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + + # Validate the trace mask (should be all True for empty dataset) + validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) + trace_mask = ds["trace_mask"].values + assert np.all(trace_mask), "All traces should be marked as live in empty dataset" + + # Validate the amplitude data (should be empty) + validate_variable(ds, "amplitude", (200, 300, 750), ("inline", 
"crossline", "time"), np.float32, None, None) diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 064ce1d7c..1b1aa1740 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -285,7 +285,7 @@ def generate_selection_mask(selection_conf: SelectionMaskConfig, grid_conf: Grid def export_masked_path(tmp_path_factory: pytest.TempPathFactory) -> Path: """Fixture that generates temp directory for export tests.""" if DEBUG_MODE: - return Path("TMP/export_masked") + return Path("tmp/export_masked") return tmp_path_factory.getbasetemp() / "export_masked" From 8fbc127ebfedba8e3d2c074d4270518711dab6f8 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 29 Sep 2025 16:42:14 +0000 Subject: [PATCH 02/27] Add test_overwrite_behavior --- tests/integration/test_create_empty_mdio.py | 102 +++++++++++++++----- 1 file changed, 80 insertions(+), 22 deletions(-) diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py index 73a8753a9..4728cffe6 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty_mdio.py @@ -26,6 +26,34 @@ class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" + @classmethod + def _validate_empty_mdio_dataset(cls, ds, segy_spec: SegySpec) -> None: + """Validate an empty MDIO dataset structure and content.""" + # Check that the dataset has the expected shape + assert ds.sizes == {"inline": 200, "crossline": 300, "time": 750} + + # Validate the dimension coordinate variables + validate_variable(ds, "inline", (200,), ("inline",), np.int32, range(100, 300), get_values) + validate_variable(ds, "crossline", (300,), ("crossline",), np.int32, range(1000, 1600, 2), get_values) + validate_variable(ds, "time", (750,), ("time",), np.int32, range(0, 3000, 4), get_values) + + # Validate the non-dimensional coordinate 
variables (should be empty for empty dataset) + validate_variable(ds, "cdp_x", (200, 300), ("inline", "crossline"), np.float64, None, None) + validate_variable(ds, "cdp_y", (200, 300), ("inline", "crossline"), np.float64, None, None) + + # Validate the headers (should be empty for empty dataset) + # Infer the dtype from segy_spec and ignore endianness + header_dtype = segy_spec.trace.header.dtype.newbyteorder("native") + validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + + # Validate the trace mask (should be all True for empty dataset) + validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) + trace_mask = ds["trace_mask"].values + assert np.all(trace_mask), "All traces should be marked as live in empty dataset" + + # Validate the amplitude data (should be empty) + validate_variable(ds, "amplitude", (200, 300, 750), ("inline", "crossline", "time"), np.float32, None, None) + @pytest.fixture(scope="class") def segy_spec(self) -> SegySpec: """Return the SEG-Y specification for the test.""" @@ -95,28 +123,58 @@ def test_dataset_metadata(self, empty_mdio_path: Path) -> None: def test_grid(self, empty_mdio_path: Path, segy_spec: SegySpec) -> None: """Test grid validation for empty MDIO file.""" ds = open_mdio(empty_mdio_path) + self._validate_empty_mdio_dataset(ds, segy_spec) - # Check that the dataset has the expected shape - assert ds.sizes == {"inline": 200, "crossline": 300, "time": 750} - # Validate the dimension coordinate variables - validate_variable(ds, "inline", (200,), ("inline",), np.int32, range(100, 300), get_values) - validate_variable(ds, "crossline", (300,), ("crossline",), np.int32, range(1000, 1600, 2), get_values) - validate_variable(ds, "time", (750,), ("time",), np.int32, range(0, 3000, 4), get_values) - - # Validate the non-dimensional coordinate variables (should be empty for empty dataset) - validate_variable(ds, "cdp_x", (200, 300), ("inline", "crossline"), 
np.float64, None, None) - validate_variable(ds, "cdp_y", (200, 300), ("inline", "crossline"), np.float64, None, None) - - # Validate the headers (should be empty for empty dataset) - # Infer the dtype from segy_spec and ignore endianness - header_dtype = segy_spec.trace.header.dtype.newbyteorder("native") - validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) - - # Validate the trace mask (should be all True for empty dataset) - validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) - trace_mask = ds["trace_mask"].values - assert np.all(trace_mask), "All traces should be marked as live in empty dataset" + def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None: + """Test overwrite parameter behavior in create_empty_mdio.""" + # Create the grid with the specified dimensions + grid = Grid( + dims=[ + Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 + Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 + Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate + ] + ) - # Validate the amplitude data (should be empty) - validate_variable(ds, "amplitude", (200, 300, 750), ("inline", "crossline", "time"), np.float32, None, None) + mdio_template = get_template("PostStack3DTime") + + # First: Create a directory and populate it with garbage data + empty_mdio.mkdir(parents=True, exist_ok=True) + garbage_file = empty_mdio / "garbage.txt" + garbage_file.write_text("This is garbage data that should be overwritten") + garbage_dir = empty_mdio / "garbage_dir" + garbage_dir.mkdir() + (garbage_dir / "nested_garbage.txt").write_text("More garbage") + + # Verify the directory exists with garbage data + assert empty_mdio.exists() + assert garbage_file.exists() + assert garbage_dir.exists() + + # Second call: Try to create MDIO with overwrite=False - should raise FileExistsError + with 
pytest.raises(FileExistsError, match="Output location.*exists"): + create_empty_mdio( + segy_spec=segy_spec, + mdio_template=mdio_template, + grid=grid, + output_path=empty_mdio, + overwrite=False + ) + + # Third call: Create MDIO with overwrite=True - should succeed and overwrite garbage + create_empty_mdio( + segy_spec=segy_spec, + mdio_template=mdio_template, + grid=grid, + output_path=empty_mdio, + overwrite=True + ) + + # Validate that the MDIO file can be loaded correctly using the helper function + ds = open_mdio(empty_mdio) + self._validate_empty_mdio_dataset(ds, segy_spec) + + # Verify the garbage data was overwritten (should not exist) + assert not garbage_file.exists(), "Garbage file should have been overwritten" + assert not garbage_dir.exists(), "Garbage directory should have been overwritten" From 6d1fceae24d19c9b96822378b5eadfc4f34b14b7 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 29 Sep 2025 16:45:46 +0000 Subject: [PATCH 03/27] Fix pre-commit --- tests/integration/test_create_empty_mdio.py | 35 +++++++-------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py index 4728cffe6..863e74896 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty_mdio.py @@ -12,6 +12,8 @@ from pathlib import Path from segy.schema import SegySpec + from xarray import Dataset as xr_Dataset + from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_variable @@ -27,7 +29,7 @@ class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" @classmethod - def _validate_empty_mdio_dataset(cls, ds, segy_spec: SegySpec) -> None: + def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, segy_spec: SegySpec) -> None: """Validate an empty MDIO dataset structure and content.""" # Check that the dataset has the expected shape assert ds.sizes == 
{"inline": 200, "crossline": 300, "time": 750} @@ -79,11 +81,7 @@ def empty_mdio_path(self, segy_spec: SegySpec, empty_mdio: Path) -> Path: # Call create_empty_mdio create_empty_mdio( - segy_spec=segy_spec, - mdio_template=mdio_template, - grid=grid, - output_path=empty_mdio, - overwrite=True + segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=True ) return empty_mdio @@ -125,7 +123,6 @@ def test_grid(self, empty_mdio_path: Path, segy_spec: SegySpec) -> None: ds = open_mdio(empty_mdio_path) self._validate_empty_mdio_dataset(ds, segy_spec) - def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None: """Test overwrite parameter behavior in create_empty_mdio.""" # Create the grid with the specified dimensions @@ -138,7 +135,7 @@ def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None ) mdio_template = get_template("PostStack3DTime") - + # First: Create a directory and populate it with garbage data empty_mdio.mkdir(parents=True, exist_ok=True) garbage_file = empty_mdio / "garbage.txt" @@ -146,35 +143,27 @@ def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None garbage_dir = empty_mdio / "garbage_dir" garbage_dir.mkdir() (garbage_dir / "nested_garbage.txt").write_text("More garbage") - + # Verify the directory exists with garbage data assert empty_mdio.exists() assert garbage_file.exists() assert garbage_dir.exists() - + # Second call: Try to create MDIO with overwrite=False - should raise FileExistsError with pytest.raises(FileExistsError, match="Output location.*exists"): create_empty_mdio( - segy_spec=segy_spec, - mdio_template=mdio_template, - grid=grid, - output_path=empty_mdio, - overwrite=False + segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=False ) - + # Third call: Create MDIO with overwrite=True - should succeed and overwrite garbage create_empty_mdio( - segy_spec=segy_spec, - 
mdio_template=mdio_template, - grid=grid, - output_path=empty_mdio, - overwrite=True + segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=True ) - + # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) self._validate_empty_mdio_dataset(ds, segy_spec) - + # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" assert not garbage_dir.exists(), "Garbage directory should have been overwritten" From 40bd1da678255fa0872b43c540b46704ab7d2e73 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 30 Sep 2025 16:43:59 +0000 Subject: [PATCH 04/27] Update API --- src/mdio/converters/segy.py | 6 +- src/mdio/creators/__init__.py | 5 + src/mdio/creators/mdio.py | 78 ++++++++++++++ tests/conftest.py | 5 +- tests/integration/test_create_empty_mdio.py | 106 ++++++++++---------- 5 files changed, 143 insertions(+), 57 deletions(-) create mode 100644 src/mdio/creators/__init__.py create mode 100644 src/mdio/creators/mdio.py diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index b0e05ddc1..527a87af0 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -249,7 +249,7 @@ def populate_non_dim_coordinates( return dataset, drop_vars_delayed -def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> LengthUnitModel | None: +def get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> LengthUnitModel | None: """Get the coordinate unit from the SEG-Y headers.""" name = TraceHeaderFieldsRev0.COORDINATE_UNIT.name.upper() unit_hdr = next((c for c in segy_headers if c.name.upper() == name), None) @@ -372,7 +372,7 @@ def segy_to_mdio( # noqa PLR0913 _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template) header_dtype = to_structured_type(segy_spec.trace.header.dtype) - horizontal_unit = _get_horizontal_coordinate_unit(segy_dimensions) + 
horizontal_unit = get_horizontal_coordinate_unit(segy_dimensions) mdio_ds: Dataset = mdio_template.build_dataset( name=mdio_template.name, sizes=grid.shape, @@ -446,7 +446,7 @@ def create_empty_mdio( # noqa PLR0913 # Build the dataset structure using the template and grid header_dtype = to_structured_type(segy_spec.trace.header.dtype) - horizontal_unit = _get_horizontal_coordinate_unit(grid.dims) + horizontal_unit = get_horizontal_coordinate_unit(grid.dims) mdio_ds: Dataset = mdio_template.build_dataset( name=mdio_template.name, sizes=grid.shape, diff --git a/src/mdio/creators/__init__.py b/src/mdio/creators/__init__.py new file mode 100644 index 000000000..66258d208 --- /dev/null +++ b/src/mdio/creators/__init__.py @@ -0,0 +1,5 @@ +"""MDIO Data creation API.""" + +from mdio.creators.mdio import create_empty_mdio + +__all__ = ["create_empty_mdio"] \ No newline at end of file diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py new file mode 100644 index 000000000..6057fb5cb --- /dev/null +++ b/src/mdio/creators/mdio.py @@ -0,0 +1,78 @@ +"""Creating MDIO v1 datasets.""" + +from __future__ import annotations +from typing import TYPE_CHECKING +from segy.standards import get_segy_standard +from mdio.api.io import _normalize_path +from mdio.api.io import to_mdio +from mdio.builder.xarray_builder import to_xarray_dataset +from mdio.converters.segy import get_horizontal_coordinate_unit, populate_dim_coordinates +from mdio.converters.type_converter import to_structured_type +from mdio.core.grid import Grid + +if TYPE_CHECKING: + from pathlib import Path + from typing import Any + + from segy.arrays import HeaderArray as SegyHeaderArray + from segy.schema import SegySpec + from upath import UPath + from xarray import Dataset as xr_Dataset + + from mdio.builder.schemas import Dataset + from mdio.builder.templates.abstract_dataset_template import AbstractDatasetTemplate + from mdio.core.dimension import Dimension + + +def create_empty_mdio( # noqa PLR0913 + 
mdio_template: AbstractDatasetTemplate, + dimensions: list[Dimension], + output_path: UPath | Path | str, + create_headers: bool = False, + overwrite: bool = False, +) -> None: + """A function that creates an empty MDIO v1 file with known dimensions. + + Args: + mdio_template: The MDIO template to use to define the dataset structure. + dimensions: The dimensions of the MDIO file. + output_path: The universal path for the output MDIO v1 file. + create_headers: Whether to create a full set of SEG-Y v1.0 trace headers. Defaults to False. + overwrite: Whether to overwrite the output file if it already exists. Defaults to False. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. + """ + output_path = _normalize_path(output_path) + + if not overwrite and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." + raise FileExistsError(err) + + header_dtype = to_structured_type(get_segy_standard(1.0).trace.header.dtype) if create_headers else None + grid = Grid(dims=dimensions) + horizontal_unit = get_horizontal_coordinate_unit(grid.dims) + mdio_ds: Dataset = mdio_template.build_dataset( + name=mdio_template.name, + sizes=grid.shape, + horizontal_coord_unit=horizontal_unit, + header_dtype=header_dtype, + ) + + # Convert to xarray dataset + xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) + + # Populate coordinates using the grid + # For empty datasets, we only populate dimension coordinates + drop_vars_delayed = [] + dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) + + # Set the trace mask to indicate all traces are live (since this is an empty dataset) + dataset.trace_mask.data[:] = True + + # Create the Zarr store with the correct structure but with empty arrays + to_mdio(dataset, output_path=output_path, mode="w", compute=False) + + # Write the dimension coordinates and trace mask + meta_ds = 
dataset[drop_vars_delayed + ["trace_mask"]] + to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 3a27e1767..8555d1eb5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,8 +74,9 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture(scope="class") -def empty_mdio(tmp_path_factory: pytest.TempPathFactory) -> Path: +def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" if DEBUG_MODE: - return Path("tmp/empty_mdio") + tmp_dir = Path("tmp/empty_mdio") + tmp_dir.mkdir(parents=True, exist_ok=True) return tmp_path_factory.mktemp(r"empty_mdio") diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py index 863e74896..b867f6669 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty_mdio.py @@ -11,7 +11,6 @@ if TYPE_CHECKING: from pathlib import Path - from segy.schema import SegySpec from xarray import Dataset as xr_Dataset from tests.integration.testing_helpers import get_values @@ -20,16 +19,15 @@ from mdio import __version__ from mdio.api.io import open_mdio from mdio.builder.template_registry import get_template -from mdio.converters.segy import create_empty_mdio from mdio.core import Dimension -from mdio.core import Grid +from mdio.creators.mdio import create_empty_mdio class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" @classmethod - def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, segy_spec: SegySpec) -> None: + def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: """Validate an empty MDIO dataset structure and content.""" # Check that the dataset has the expected shape assert ds.sizes == {"inline": 200, "crossline": 300, "time": 750} @@ -43,10 +41,13 @@ def _validate_empty_mdio_dataset(cls, ds: 
xr_Dataset, segy_spec: SegySpec) -> No validate_variable(ds, "cdp_x", (200, 300), ("inline", "crossline"), np.float64, None, None) validate_variable(ds, "cdp_y", (200, 300), ("inline", "crossline"), np.float64, None, None) - # Validate the headers (should be empty for empty dataset) - # Infer the dtype from segy_spec and ignore endianness - header_dtype = segy_spec.trace.header.dtype.newbyteorder("native") - validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + if has_headers: + # Validate the headers (should be empty for empty dataset) + # Infer the dtype from segy_spec and ignore endianness + header_dtype = get_segy_standard(1.0).trace.header.dtype.newbyteorder("native") + validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + else: + assert "headers" not in ds.variables # Validate the trace mask (should be all True for empty dataset) validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) @@ -56,39 +57,52 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, segy_spec: SegySpec) -> No # Validate the amplitude data (should be empty) validate_variable(ds, "amplitude", (200, 300, 750), ("inline", "crossline", "time"), np.float32, None, None) - @pytest.fixture(scope="class") - def segy_spec(self) -> SegySpec: - """Return the SEG-Y specification for the test.""" - return get_segy_standard(1.0) + @classmethod + def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: + """Create a temporary empty MDIO file for testing.""" + # Create the grid with the specified dimensions + dims = [ + Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 + Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 + Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate + ] + + mdio_template = get_template("PostStack3DTime") + + # Call 
create_empty_mdio + create_empty_mdio( + mdio_template=mdio_template, + dimensions=dims, + output_path=output_path, + create_headers=create_headers, + overwrite=overwrite, + ) @pytest.fixture(scope="class") - def empty_mdio_path(self, segy_spec: SegySpec, empty_mdio: Path) -> Path: + def mdio_with_headers(self, empty_mdio_dir: Path) -> Path: """Create a temporary empty MDIO file for testing. This fixture is scoped to the class level, so it will be executed only once and shared across all test methods in the class. """ - # Create the grid with the specified dimensions - grid = Grid( - dims=[ - Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 - Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 - Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate - ] - ) - - mdio_template = get_template("PostStack3DTime") + empty_mdio: Path = empty_mdio_dir / "with_headers.mdio" + self._create_empty_mdio(create_headers=True, output_path=empty_mdio) + return empty_mdio - # Call create_empty_mdio - create_empty_mdio( - segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=True - ) + @pytest.fixture(scope="class") + def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: + """Create a temporary empty MDIO file for testing. + This fixture is scoped to the class level, so it will be executed only once + and shared across all test methods in the class. 
+ """ + empty_mdio: Path = empty_mdio_dir / "no_headers.mdio" + self._create_empty_mdio(create_headers=False, output_path=empty_mdio) return empty_mdio - def test_dataset_metadata(self, empty_mdio_path: Path) -> None: + def test_dataset_metadata(self, mdio_with_headers: Path) -> None: """Test dataset metadata for empty MDIO file.""" - ds = open_mdio(empty_mdio_path) + ds = open_mdio(mdio_with_headers) # Check basic metadata attributes expected_attrs = { @@ -118,25 +132,17 @@ def test_dataset_metadata(self, empty_mdio_path: Path) -> None: assert attributes["surveyType"] == "3D" assert attributes["gatherType"] == "stacked" - def test_grid(self, empty_mdio_path: Path, segy_spec: SegySpec) -> None: + def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: """Test grid validation for empty MDIO file.""" - ds = open_mdio(empty_mdio_path) - self._validate_empty_mdio_dataset(ds, segy_spec) + ds = open_mdio(mdio_with_headers) + self._validate_empty_mdio_dataset(ds, has_headers=True) - def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None: - """Test overwrite parameter behavior in create_empty_mdio.""" - # Create the grid with the specified dimensions - grid = Grid( - dims=[ - Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 - Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 - Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate - ] - ) - - mdio_template = get_template("PostStack3DTime") + ds = open_mdio(mdio_no_headers) + self._validate_empty_mdio_dataset(ds, has_headers=False) - # First: Create a directory and populate it with garbage data + def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: + """Test overwrite parameter behavior in create_empty_mdio.""" + empty_mdio = empty_mdio_dir / "empty.mdio" empty_mdio.mkdir(parents=True, exist_ok=True) garbage_file = empty_mdio / "garbage.txt" garbage_file.write_text("This is 
garbage data that should be overwritten") @@ -151,18 +157,14 @@ def test_overwrite_behavior(self, segy_spec: SegySpec, empty_mdio: Path) -> None # Second call: Try to create MDIO with overwrite=False - should raise FileExistsError with pytest.raises(FileExistsError, match="Output location.*exists"): - create_empty_mdio( - segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=False - ) + self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=False) # Third call: Create MDIO with overwrite=True - should succeed and overwrite garbage - create_empty_mdio( - segy_spec=segy_spec, mdio_template=mdio_template, grid=grid, output_path=empty_mdio, overwrite=True - ) + self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=True) # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) - self._validate_empty_mdio_dataset(ds, segy_spec) + self._validate_empty_mdio_dataset(ds, has_headers=True) # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" From c73a7ed9e848307138ad03c87c0332867d99ccbc Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 30 Sep 2025 16:50:14 +0000 Subject: [PATCH 05/27] move create_empty_mdio --- src/mdio/converters/segy.py | 60 +------------------------------------ 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 527a87af0..489848d2c 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -416,62 +416,4 @@ def segy_to_mdio( # noqa PLR0913 grid_map=grid.map, dataset=xr_dataset, data_variable_name=default_variable_name, - ) - - -def create_empty_mdio( # noqa PLR0913 - segy_spec: SegySpec, - mdio_template: AbstractDatasetTemplate, - grid: Grid, - output_path: UPath | Path | str, - overwrite: bool = False, -) -> None: - """A function that 
creates an empty MDIO v1 file with known dimensions. - - Args: - segy_spec: The SEG-Y specification to use for trace headers. - mdio_template: The MDIO template to use to define the dataset structure. - grid: The grid specifying the dimensions of the MDIO file. - output_path: The universal path for the output MDIO v1 file. - overwrite: Whether to overwrite the output file if it already exists. Defaults to False. - - Raises: - FileExistsError: If the output location already exists and overwrite is False. - """ - output_path = _normalize_path(output_path) - - if not overwrite and output_path.exists(): - err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." - raise FileExistsError(err) - - # Build the dataset structure using the template and grid - header_dtype = to_structured_type(segy_spec.trace.header.dtype) - horizontal_unit = get_horizontal_coordinate_unit(grid.dims) - mdio_ds: Dataset = mdio_template.build_dataset( - name=mdio_template.name, - sizes=grid.shape, - horizontal_coord_unit=horizontal_unit, - header_dtype=header_dtype, - ) - - # Convert to xarray dataset - xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) - - # Populate coordinates using the grid - # For empty datasets, we only populate dimension coordinates - drop_vars_delayed = [] - dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) - - # Set the trace mask to indicate all traces are live (since this is an empty dataset) - if grid.live_mask is not None: - dataset.trace_mask.data[:] = grid.live_mask - else: - # If live_mask is None, create a mask where all traces are live - dataset.trace_mask.data[:] = True - - # Create the Zarr store with the correct structure but with empty arrays - to_mdio(dataset, output_path=output_path, mode="w", compute=False) - - # Write the dimension coordinates and trace mask - meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] - to_mdio(meta_ds, 
output_path=output_path, mode="r+", compute=True) + ) \ No newline at end of file From 5e36333c495bf2162eebebe1c45a2b800c8bdb7d Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 30 Sep 2025 16:52:43 +0000 Subject: [PATCH 06/27] Revert TMP -> tmp change --- tests/conftest.py | 12 ++++++------ tests/integration/test_segy_import_export_masked.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8555d1eb5..32118aca8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ def fake_segy_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the fake SEG-Y files we are going to create.""" if DEBUG_MODE: - return Path("tmp/fake_segy") + return Path("TMP/fake_segy") return tmp_path_factory.mktemp(r"fake_segy") @@ -37,7 +37,7 @@ def segy_input_uri() -> str: def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> Path: """Download teapot dome dataset for testing.""" if DEBUG_MODE: - tmp_dir = Path("tmp/segy") + tmp_dir = Path("TMP/segy") tmp_dir.mkdir(parents=True, exist_ok=True) else: tmp_dir = tmp_path_factory.mktemp("segy") @@ -50,7 +50,7 @@ def segy_input(segy_input_uri: str, tmp_path_factory: pytest.TempPathFactory) -> def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the output MDIO.""" if DEBUG_MODE: - return Path("tmp/mdio") + return Path("TMP/mdio") return tmp_path_factory.mktemp(r"mdio") @@ -58,7 +58,7 @@ def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the output MDIO.""" if DEBUG_MODE: - return Path("tmp/mdio2") + return Path("TMP/mdio2") return tmp_path_factory.mktemp(r"mdio2") @@ -66,7 +66,7 @@ def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for the round-trip IBM SEG-Y.""" if 
DEBUG_MODE: - tmp_dir = Path("tmp/segy") + tmp_dir = Path("TMP/segy") tmp_dir.mkdir(parents=True, exist_ok=True) else: tmp_dir = tmp_path_factory.mktemp("segy") @@ -77,6 +77,6 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" if DEBUG_MODE: - tmp_dir = Path("tmp/empty_mdio") + tmp_dir = Path("TMP/empty_mdio") tmp_dir.mkdir(parents=True, exist_ok=True) return tmp_path_factory.mktemp(r"empty_mdio") diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 1b1aa1740..064ce1d7c 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -285,7 +285,7 @@ def generate_selection_mask(selection_conf: SelectionMaskConfig, grid_conf: Grid def export_masked_path(tmp_path_factory: pytest.TempPathFactory) -> Path: """Fixture that generates temp directory for export tests.""" if DEBUG_MODE: - return Path("tmp/export_masked") + return Path("TMP/export_masked") return tmp_path_factory.getbasetemp() / "export_masked" From 7d0b562b743197540df36bbd4fcb304ba1bbea98 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 30 Sep 2025 16:55:58 +0000 Subject: [PATCH 07/27] Pre-commit formatting --- src/mdio/converters/segy.py | 2 +- src/mdio/creators/__init__.py | 2 +- src/mdio/creators/mdio.py | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 489848d2c..598e9e4be 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -416,4 +416,4 @@ def segy_to_mdio( # noqa PLR0913 grid_map=grid.map, dataset=xr_dataset, data_variable_name=default_variable_name, - ) \ No newline at end of file + ) diff --git a/src/mdio/creators/__init__.py b/src/mdio/creators/__init__.py index 66258d208..cc7f50e26 100644 --- 
a/src/mdio/creators/__init__.py +++ b/src/mdio/creators/__init__.py @@ -2,4 +2,4 @@ from mdio.creators.mdio import create_empty_mdio -__all__ = ["create_empty_mdio"] \ No newline at end of file +__all__ = ["create_empty_mdio"] diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 6057fb5cb..740ac1839 100644 --- a/src/mdio/creators/mdio.py +++ b/src/mdio/creators/mdio.py @@ -1,21 +1,22 @@ """Creating MDIO v1 datasets.""" from __future__ import annotations + from typing import TYPE_CHECKING + from segy.standards import get_segy_standard + from mdio.api.io import _normalize_path from mdio.api.io import to_mdio from mdio.builder.xarray_builder import to_xarray_dataset -from mdio.converters.segy import get_horizontal_coordinate_unit, populate_dim_coordinates +from mdio.converters.segy import get_horizontal_coordinate_unit +from mdio.converters.segy import populate_dim_coordinates from mdio.converters.type_converter import to_structured_type from mdio.core.grid import Grid if TYPE_CHECKING: from pathlib import Path - from typing import Any - from segy.arrays import HeaderArray as SegyHeaderArray - from segy.schema import SegySpec from upath import UPath from xarray import Dataset as xr_Dataset @@ -75,4 +76,4 @@ def create_empty_mdio( # noqa PLR0913 # Write the dimension coordinates and trace mask meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] - to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) \ No newline at end of file + to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) From 56aecf6869cc06fd3b3adf0472ddb4950f9f6fbd Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 1 Oct 2025 14:07:33 +0000 Subject: [PATCH 08/27] _create_empty_mdio with template_name --- src/mdio/creators/mdio.py | 9 +++++---- tests/integration/test_create_empty_mdio.py | 4 +--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 740ac1839..350f46b2a 100644 --- 
a/src/mdio/creators/mdio.py +++ b/src/mdio/creators/mdio.py @@ -8,6 +8,7 @@ from mdio.api.io import _normalize_path from mdio.api.io import to_mdio +from mdio.builder.template_registry import TemplateRegistry from mdio.builder.xarray_builder import to_xarray_dataset from mdio.converters.segy import get_horizontal_coordinate_unit from mdio.converters.segy import populate_dim_coordinates @@ -21,12 +22,11 @@ from xarray import Dataset as xr_Dataset from mdio.builder.schemas import Dataset - from mdio.builder.templates.abstract_dataset_template import AbstractDatasetTemplate from mdio.core.dimension import Dimension def create_empty_mdio( # noqa PLR0913 - mdio_template: AbstractDatasetTemplate, + mdio_template_name: str, dimensions: list[Dimension], output_path: UPath | Path | str, create_headers: bool = False, @@ -35,7 +35,7 @@ def create_empty_mdio( # noqa PLR0913 """A function that creates an empty MDIO v1 file with known dimensions. Args: - mdio_template: The MDIO template to use to define the dataset structure. + mdio_template_name: The MDIO template to use to define the dataset structure. dimensions: The dimensions of the MDIO file. output_path: The universal path for the output MDIO v1 file. create_headers: Whether to create a full set of SEG-Y v1.0 trace headers. Defaults to False. 
@@ -53,8 +53,9 @@ def create_empty_mdio( # noqa PLR0913 header_dtype = to_structured_type(get_segy_standard(1.0).trace.header.dtype) if create_headers else None grid = Grid(dims=dimensions) horizontal_unit = get_horizontal_coordinate_unit(grid.dims) + mdio_template = TemplateRegistry().get(mdio_template_name) mdio_ds: Dataset = mdio_template.build_dataset( - name=mdio_template.name, + name=mdio_template_name, sizes=grid.shape, horizontal_coord_unit=horizontal_unit, header_dtype=header_dtype, diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py index b867f6669..ecc45a359 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty_mdio.py @@ -67,11 +67,9 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate ] - mdio_template = get_template("PostStack3DTime") - # Call create_empty_mdio create_empty_mdio( - mdio_template=mdio_template, + mdio_template_name="PostStack3DTime", dimensions=dims, output_path=output_path, create_headers=create_headers, From b7f3c404df194d76575768f760a432f198c4491f Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 1 Oct 2025 15:02:27 +0000 Subject: [PATCH 09/27] pre-commit --- tests/integration/test_create_empty_mdio.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty_mdio.py index ecc45a359..33ae71eea 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty_mdio.py @@ -18,7 +18,6 @@ from mdio import __version__ from mdio.api.io import open_mdio -from mdio.builder.template_registry import get_template from mdio.core import Dimension from mdio.creators.mdio import create_empty_mdio From 94fd3fef77558aba8ced715a6b78d55f2bb7a8fd Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 2 Oct 2025 16:40:57 +0000 Subject: 
[PATCH 10/27] PR review and test_populate_empty_dataset --- src/mdio/converters/segy.py | 4 +- src/mdio/creators/__init__.py | 4 +- src/mdio/creators/mdio.py | 9 +- tests/conftest.py | 6 +- ...ate_empty_mdio.py => test_create_empty.py} | 84 ++++++++++++++++++- 5 files changed, 90 insertions(+), 17 deletions(-) rename tests/integration/{test_create_empty_mdio.py => test_create_empty.py} (63%) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index cac5f44e0..0a8ebbdbb 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -291,7 +291,7 @@ def populate_non_dim_coordinates( return dataset, drop_vars_delayed -def get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> LengthUnitModel | None: +def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> LengthUnitModel | None: """Get the coordinate unit from the SEG-Y headers.""" name = TraceHeaderFieldsRev0.COORDINATE_UNIT.name.upper() unit_hdr = next((c for c in segy_headers if c.name.upper() == name), None) @@ -506,7 +506,7 @@ def segy_to_mdio( # noqa PLR0913 logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.") mdio_template = _add_raw_headers_to_template(mdio_template) - horizontal_unit = get_horizontal_coordinate_unit(segy_dimensions) + horizontal_unit = _get_horizontal_coordinate_unit(segy_dimensions) mdio_ds: Dataset = mdio_template.build_dataset( name=mdio_template.name, sizes=grid.shape, diff --git a/src/mdio/creators/__init__.py b/src/mdio/creators/__init__.py index cc7f50e26..4eed5f9c9 100644 --- a/src/mdio/creators/__init__.py +++ b/src/mdio/creators/__init__.py @@ -1,5 +1,5 @@ """MDIO Data creation API.""" -from mdio.creators.mdio import create_empty_mdio +from mdio.creators.mdio import create_empty -__all__ = ["create_empty_mdio"] +__all__ = ["create_empty"] diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 350f46b2a..89f1ad4d8 100644 --- a/src/mdio/creators/mdio.py +++ 
b/src/mdio/creators/mdio.py @@ -10,7 +10,6 @@ from mdio.api.io import to_mdio from mdio.builder.template_registry import TemplateRegistry from mdio.builder.xarray_builder import to_xarray_dataset -from mdio.converters.segy import get_horizontal_coordinate_unit from mdio.converters.segy import populate_dim_coordinates from mdio.converters.type_converter import to_structured_type from mdio.core.grid import Grid @@ -25,7 +24,7 @@ from mdio.core.dimension import Dimension -def create_empty_mdio( # noqa PLR0913 +def create_empty( # noqa PLR0913 mdio_template_name: str, dimensions: list[Dimension], output_path: UPath | Path | str, @@ -52,12 +51,11 @@ def create_empty_mdio( # noqa PLR0913 header_dtype = to_structured_type(get_segy_standard(1.0).trace.header.dtype) if create_headers else None grid = Grid(dims=dimensions) - horizontal_unit = get_horizontal_coordinate_unit(grid.dims) mdio_template = TemplateRegistry().get(mdio_template_name) mdio_ds: Dataset = mdio_template.build_dataset( name=mdio_template_name, sizes=grid.shape, - horizontal_coord_unit=horizontal_unit, + horizontal_coord_unit=None, header_dtype=header_dtype, ) @@ -69,9 +67,6 @@ def create_empty_mdio( # noqa PLR0913 drop_vars_delayed = [] dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) - # Set the trace mask to indicate all traces are live (since this is an empty dataset) - dataset.trace_mask.data[:] = True - # Create the Zarr store with the correct structure but with empty arrays to_mdio(dataset, output_path=output_path, mode="w", compute=False) diff --git a/tests/conftest.py b/tests/conftest.py index 3bf35dc58..77831a4b0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,6 +79,8 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" if DEBUG_MODE: - tmp_dir = Path("TMP/empty_mdio") + tmp_dir = 
Path("tmp/empty_mdio") tmp_dir.mkdir(parents=True, exist_ok=True) - return tmp_path_factory.mktemp(r"empty_mdio") + else: + tmp_dir = tmp_path_factory.mktemp(r"empty_mdio") + return tmp_dir diff --git a/tests/integration/test_create_empty_mdio.py b/tests/integration/test_create_empty.py similarity index 63% rename from tests/integration/test_create_empty_mdio.py rename to tests/integration/test_create_empty.py index 33ae71eea..6dd2884cd 100644 --- a/tests/integration/test_create_empty_mdio.py +++ b/tests/integration/test_create_empty.py @@ -2,24 +2,30 @@ from __future__ import annotations +import math +from turtle import speed from typing import TYPE_CHECKING import numpy as np import pytest from segy.standards import get_segy_standard +from mdio.builder.schemas.v1.units import LengthUnitEnum, LengthUnitModel, SpeedUnitEnum, SpeedUnitModel, TimeUnitEnum, TimeUnitModel + if TYPE_CHECKING: from pathlib import Path from xarray import Dataset as xr_Dataset +from mdio.builder.schemas.v1.stats import CenteredBinHistogram, SummaryStatistics +from tests.integration.test_segy_roundtrip_teapot import text_header_teapot_dome from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_variable from mdio import __version__ -from mdio.api.io import open_mdio +from mdio.api.io import open_mdio, to_mdio from mdio.core import Dimension -from mdio.creators.mdio import create_empty_mdio +from mdio.creators.mdio import create_empty class TestCreateEmptyPostStack3DTimeMdio: @@ -51,7 +57,7 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None # Validate the trace mask (should be all True for empty dataset) validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) trace_mask = ds["trace_mask"].values - assert np.all(trace_mask), "All traces should be marked as live in empty dataset" + assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" # 
Validate the amplitude data (should be empty) validate_variable(ds, "amplitude", (200, 300, 750), ("inline", "crossline", "time"), np.float32, None, None) @@ -67,7 +73,7 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: ] # Call create_empty_mdio - create_empty_mdio( + create_empty( mdio_template_name="PostStack3DTime", dimensions=dims, output_path=output_path, @@ -166,3 +172,73 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" assert not garbage_dir.exists(), "Garbage directory should have been overwritten" + + + def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: + """Test showing how to populate empty dataset.""" + + # Open an empty PostStack3DTime dataset with SEG-Y 1.0 headers + # NOTES: + # When this empty dataset was created from the 'PostStack3DTime' template and dimensions, + # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and pre-populated + # * 'cdp_x', 'cdp_y' non-dimensional coordinate variables were created + # * 'amplitude' variable was created (the name of this variable is specified in the template) + # HACK: in this example, we will use this variable to store the velocity data + # * 'trace_mask' variable was created and pre-populated with 'False' fill values + # (all traces are marked as dead) + # * 'headers' segy trace headers variable was created (if the dataset was created with create_headers=true) + # * dataset attribute called 'attributes' was created + ds = open_mdio(mdio_with_headers) + + # 1.A) Populate dataset's velocity + var_name = ds.attrs["attributes"]["defaultVariableName"] + velocity = ds[var_name] + velocity[:5,:,:] = 1 + velocity[5:10,:,:] = 2 + velocity[50:100,:,:] = 3 + velocity[150:175,:,:] = -1 + + # 1.B) Populate dataset's velocity statistics (optional) + nonzero_samples = 
np.ma.masked_invalid(velocity, copy=False) + stats = SummaryStatistics( + count=nonzero_samples.count(), + min=nonzero_samples.min(), + max=nonzero_samples.max(), + sum=nonzero_samples.sum(dtype="float64"), + sum_squares=(np.ma.power(nonzero_samples, 2).sum(dtype="float64")), + histogram=CenteredBinHistogram(bin_centers=[], counts=[]), + ) + velocity.attrs["statsV1"] = stats.model_dump_json() + + # 1.C) Set coordinate and data variable units (optional) + ds.time["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump_json() + + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() + + velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump_json() + + # 3) Populate the non-dimensional coordinate variables 'cdp_x' and 'cdp_y' (optional) + origin = [270000, 3290000] # survey x, y origin + inline_azimuth_rad = 0.523599 # survey orientation, in radians, from the north to the east (30 degrees) + spacing = [50, 50] # survey inline, crossline spacing + inline_grid, xline_grid = np.meshgrid(ds.inline.values, ds.crossline.values, indexing='ij') + sin_azimuth = math.sin(inline_azimuth_rad) + cos_azimuth = math.cos(inline_azimuth_rad) + ds.cdp_x[:] = origin[0] + inline_grid * spacing[0] * sin_azimuth + xline_grid * spacing[1] * cos_azimuth + ds.cdp_y[:] = origin[1] + inline_grid * spacing[0] * cos_azimuth - xline_grid * spacing[1] * sin_azimuth + + # 4) Populate dataset's trace mask (optional) + ds.trace_mask[:] = ~np.isnan(velocity[:,:,0]) + + # 5) Populate dataset's segy trace headers, if those were created (optional) + if "headers" in ds.variables: + ds.headers["cdp_x"][:] = ds.cdp_x + ds.headers["cdp_y"][:] = ds.cdp_y + + # 5) Create dataset's custom attributes (optional) + ds.attrs["attributes"]["createdBy"] = "John Doe" + + output_path = mdio_with_headers.parent / "populated_empty.mdio" + 
to_mdio(ds, output_path=output_path, mode="w", compute=True) + From bf2e41fb60fbea0c84b0d7ea8e7b66148553882e Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 2 Oct 2025 20:43:04 +0000 Subject: [PATCH 11/27] USe headers: HeaderSpec --- src/mdio/creators/mdio.py | 9 +-- tests/integration/test_create_empty.py | 100 +++++++++++++++---------- 2 files changed, 63 insertions(+), 46 deletions(-) diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 89f1ad4d8..3de88825b 100644 --- a/src/mdio/creators/mdio.py +++ b/src/mdio/creators/mdio.py @@ -4,8 +4,6 @@ from typing import TYPE_CHECKING -from segy.standards import get_segy_standard - from mdio.api.io import _normalize_path from mdio.api.io import to_mdio from mdio.builder.template_registry import TemplateRegistry @@ -17,6 +15,7 @@ if TYPE_CHECKING: from pathlib import Path + from segy.schema import HeaderSpec from upath import UPath from xarray import Dataset as xr_Dataset @@ -28,7 +27,7 @@ def create_empty( # noqa PLR0913 mdio_template_name: str, dimensions: list[Dimension], output_path: UPath | Path | str, - create_headers: bool = False, + headers: HeaderSpec | None = None, overwrite: bool = False, ) -> None: """A function that creates an empty MDIO v1 file with known dimensions. @@ -37,7 +36,7 @@ def create_empty( # noqa PLR0913 mdio_template_name: The MDIO template to use to define the dataset structure. dimensions: The dimensions of the MDIO file. output_path: The universal path for the output MDIO v1 file. - create_headers: Whether to create a full set of SEG-Y v1.0 trace headers. Defaults to False. + headers: SEG-Y v1.0 trace headers. Defaults to None. overwrite: Whether to overwrite the output file if it already exists. Defaults to False. Raises: @@ -49,7 +48,7 @@ def create_empty( # noqa PLR0913 err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." 
raise FileExistsError(err) - header_dtype = to_structured_type(get_segy_standard(1.0).trace.header.dtype) if create_headers else None + header_dtype = to_structured_type(headers.dtype) if headers else None grid = Grid(dims=dimensions) mdio_template = TemplateRegistry().get(mdio_template_name) mdio_ds: Dataset = mdio_template.build_dataset( diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 6dd2884cd..2e36e1e45 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -3,27 +3,33 @@ from __future__ import annotations import math -from turtle import speed from typing import TYPE_CHECKING import numpy as np import pytest -from segy.standards import get_segy_standard +from segy.schema import HeaderField +from segy.schema import HeaderSpec -from mdio.builder.schemas.v1.units import LengthUnitEnum, LengthUnitModel, SpeedUnitEnum, SpeedUnitModel, TimeUnitEnum, TimeUnitModel +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.schemas.v1.units import SpeedUnitEnum +from mdio.builder.schemas.v1.units import SpeedUnitModel +from mdio.builder.schemas.v1.units import TimeUnitEnum +from mdio.builder.schemas.v1.units import TimeUnitModel if TYPE_CHECKING: from pathlib import Path from xarray import Dataset as xr_Dataset -from mdio.builder.schemas.v1.stats import CenteredBinHistogram, SummaryStatistics -from tests.integration.test_segy_roundtrip_teapot import text_header_teapot_dome from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_variable from mdio import __version__ -from mdio.api.io import open_mdio, to_mdio +from mdio.api.io import open_mdio +from mdio.api.io import to_mdio +from mdio.builder.schemas.v1.stats import CenteredBinHistogram +from mdio.builder.schemas.v1.stats import SummaryStatistics from mdio.core import Dimension from 
mdio.creators.mdio import create_empty @@ -31,6 +37,17 @@ class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" + @classmethod + def _get_header_spec(cls) -> HeaderSpec: + """Get the header spec for the MDIO dataset.""" + trace_header_fields = [ + HeaderField(name="inline", byte=17, format="int32"), + HeaderField(name="crossline", byte=13, format="int32"), + HeaderField(name="cdp_x", byte=181, format="int32"), + HeaderField(name="cdp_y", byte=185, format="int32"), + ] + return HeaderSpec(fields=trace_header_fields) + @classmethod def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: """Validate an empty MDIO dataset structure and content.""" @@ -49,7 +66,7 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None if has_headers: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness - header_dtype = get_segy_standard(1.0).trace.header.dtype.newbyteorder("native") + header_dtype = cls._get_header_spec().dtype.newbyteorder("native") validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) else: assert "headers" not in ds.variables @@ -72,12 +89,12 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate ] - # Call create_empty_mdio + headers = cls._get_header_spec() if create_headers else None create_empty( mdio_template_name="PostStack3DTime", dimensions=dims, output_path=output_path, - create_headers=create_headers, + headers=headers, overwrite=overwrite, ) @@ -173,33 +190,31 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: assert not garbage_file.exists(), "Garbage file should have been overwritten" assert not garbage_dir.exists(), "Garbage directory should have been overwritten" - def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: """Test 
showing how to populate empty dataset.""" - # Open an empty PostStack3DTime dataset with SEG-Y 1.0 headers # NOTES: - # When this empty dataset was created from the 'PostStack3DTime' template and dimensions, + # When this empty dataset was created from the 'PostStack3DTime' template and dimensions, # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and pre-populated # * 'cdp_x', 'cdp_y' non-dimensional coordinate variables were created # * 'amplitude' variable was created (the name of this variable is specified in the template) # HACK: in this example, we will use this variable to store the velocity data - # * 'trace_mask' variable was created and pre-populated with 'False' fill values + # * 'trace_mask' variable was created and pre-populated with 'False' fill values # (all traces are marked as dead) - # * 'headers' segy trace headers variable was created (if the dataset was created with create_headers=true) + # * 'headers' segy trace headers variable was created (if the dataset was created with headers not None) # * dataset attribute called 'attributes' was created - ds = open_mdio(mdio_with_headers) + ds = open_mdio(mdio_with_headers) - # 1.A) Populate dataset's velocity + # 1) Populate dataset's velocity var_name = ds.attrs["attributes"]["defaultVariableName"] velocity = ds[var_name] - velocity[:5,:,:] = 1 - velocity[5:10,:,:] = 2 - velocity[50:100,:,:] = 3 - velocity[150:175,:,:] = -1 + velocity[:5, :, :] = 1 + velocity[5:10, :, :] = 2 + velocity[50:100, :, :] = 3 + velocity[150:175, :, :] = -1 - # 1.B) Populate dataset's velocity statistics (optional) - nonzero_samples = np.ma.masked_invalid(velocity, copy=False) + # 2) Populate dataset's velocity statistics (optional) + nonzero_samples = np.ma.masked_invalid(velocity, copy=False) stats = SummaryStatistics( count=nonzero_samples.count(), min=nonzero_samples.min(), @@ -210,35 +225,38 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: ) 
velocity.attrs["statsV1"] = stats.model_dump_json() - # 1.C) Set coordinate and data variable units (optional) - ds.time["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump_json() - - ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() - ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() - - velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump_json() - # 3) Populate the non-dimensional coordinate variables 'cdp_x' and 'cdp_y' (optional) - origin = [270000, 3290000] # survey x, y origin - inline_azimuth_rad = 0.523599 # survey orientation, in radians, from the north to the east (30 degrees) - spacing = [50, 50] # survey inline, crossline spacing - inline_grid, xline_grid = np.meshgrid(ds.inline.values, ds.crossline.values, indexing='ij') + origin = [270000, 3290000] # survey x, y origin + inline_azimuth_rad = 0.523599 # survey orientation, in radians, from the north to the east (30 degrees) + spacing = [50, 50] # survey inline, crossline spacing + inline_grid, xline_grid = np.meshgrid(ds.inline.values, ds.crossline.values, indexing="ij") sin_azimuth = math.sin(inline_azimuth_rad) cos_azimuth = math.cos(inline_azimuth_rad) ds.cdp_x[:] = origin[0] + inline_grid * spacing[0] * sin_azimuth + xline_grid * spacing[1] * cos_azimuth ds.cdp_y[:] = origin[1] + inline_grid * spacing[0] * cos_azimuth - xline_grid * spacing[1] * sin_azimuth # 4) Populate dataset's trace mask (optional) - ds.trace_mask[:] = ~np.isnan(velocity[:,:,0]) + ds.trace_mask[:] = ~np.isnan(velocity[:, :, 0]) - # 5) Populate dataset's segy trace headers, if those were created (optional) - if "headers" in ds.variables: - ds.headers["cdp_x"][:] = ds.cdp_x - ds.headers["cdp_y"][:] = ds.cdp_y + # 5) Set coordinate and data variable units (optional) + ds.time["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump_json() - # 5) Create dataset's custom attributes 
(optional) + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() + + velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump_json() + + # 6) Populate dataset's segy trace headers, if those were created (optional) + if "headers" in ds.variables: + # numpy broadcasting (200, 1) array to (200, 300) array + ds["headers"].values["inline"] = ds.inline.values[:, np.newaxis] + # numpy broadcasting (1, 300) array to (200, 300) array + ds["headers"].values["crossline"] = ds.crossline.values[np.newaxis, :] + ds["headers"]["cdp_x"][:] = ds.cdp_x + ds["headers"]["cdp_y"][:] = ds.cdp_y + + # 7) Create dataset's custom attributes (optional) ds.attrs["attributes"]["createdBy"] = "John Doe" output_path = mdio_with_headers.parent / "populated_empty.mdio" to_mdio(ds, output_path=output_path, mode="w", compute=True) - From e1e3ce275811705cfa80b1878c31eb539ffcb9ef Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Fri, 3 Oct 2025 22:30:36 +0000 Subject: [PATCH 12/27] Add export to segy to test_populate_empty_dataset --- src/mdio/creators/mdio.py | 5 ++ tests/integration/test_create_empty.py | 107 +++++++++++++++++++------ 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 3de88825b..200f3115a 100644 --- a/src/mdio/creators/mdio.py +++ b/src/mdio/creators/mdio.py @@ -66,6 +66,11 @@ def create_empty( # noqa PLR0913 drop_vars_delayed = [] dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) + if headers: + # Since the headers were provided, the user wants to export to SEG-Y + # Add a dummy segy_file_header variable used to export to SEG-Y + dataset["segy_file_header"] = ((), "") + # Create the Zarr store with the correct structure but with empty arrays to_mdio(dataset, output_path=output_path, 
mode="w", compute=False) diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 2e36e1e45..09d8dcc74 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -9,6 +9,8 @@ import pytest from segy.schema import HeaderField from segy.schema import HeaderSpec +from segy.schema import ScalarType +from segy.standards import get_segy_standard from mdio.builder.schemas.v1.units import LengthUnitEnum from mdio.builder.schemas.v1.units import LengthUnitModel @@ -22,6 +24,7 @@ from xarray import Dataset as xr_Dataset + from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_variable @@ -30,6 +33,7 @@ from mdio.api.io import to_mdio from mdio.builder.schemas.v1.stats import CenteredBinHistogram from mdio.builder.schemas.v1.stats import SummaryStatistics +from mdio.converters.mdio import mdio_to_segy from mdio.core import Dimension from mdio.creators.mdio import create_empty @@ -38,15 +42,18 @@ class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" @classmethod - def _get_header_spec(cls) -> HeaderSpec: + def _get_customized_v10_trace_header_spec(cls) -> HeaderSpec: """Get the header spec for the MDIO dataset.""" trace_header_fields = [ - HeaderField(name="inline", byte=17, format="int32"), - HeaderField(name="crossline", byte=13, format="int32"), - HeaderField(name="cdp_x", byte=181, format="int32"), - HeaderField(name="cdp_y", byte=185, format="int32"), + HeaderField(name="inline", byte=17, format=ScalarType.INT32), + HeaderField(name="crossline", byte=13, format=ScalarType.INT32), + HeaderField(name="cdp_x", byte=181, format=ScalarType.INT32), + HeaderField(name="cdp_y", byte=185, format=ScalarType.INT32), + HeaderField(name="coordinate_scalar", byte=71, format=ScalarType.INT16), ] - return HeaderSpec(fields=trace_header_fields) + hs: HeaderSpec = get_segy_standard(1.0).trace.header + 
hs.customize(fields=trace_header_fields) + return hs @classmethod def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: @@ -66,10 +73,12 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None if has_headers: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness - header_dtype = cls._get_header_spec().dtype.newbyteorder("native") + header_dtype = cls._get_customized_v10_trace_header_spec().dtype.newbyteorder("native") validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + validate_variable(ds, "segy_file_header", (), (), np.dtype("U1"), None, None) else: assert "headers" not in ds.variables + assert "segy_file_header" not in ds.variables # Validate the trace mask (should be all True for empty dataset) validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) @@ -89,7 +98,9 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate ] - headers = cls._get_header_spec() if create_headers else None + # If later on, we want to export to SEG-Y, we need to provide the trace header spec. + # The HeaderSpec can be either standard or customized. 
+ headers = cls._get_customized_v10_trace_header_spec() if create_headers else None create_empty( mdio_template_name="PostStack3DTime", dimensions=dims, @@ -167,7 +178,7 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: garbage_file = empty_mdio / "garbage.txt" garbage_file.write_text("This is garbage data that should be overwritten") garbage_dir = empty_mdio / "garbage_dir" - garbage_dir.mkdir() + garbage_dir.mkdir(exist_ok=True) (garbage_dir / "nested_garbage.txt").write_text("More garbage") # Verify the directory exists with garbage data @@ -201,7 +212,9 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: # HACK: in this example, we will use this variable to store the velocity data # * 'trace_mask' variable was created and pre-populated with 'False' fill values # (all traces are marked as dead) - # * 'headers' segy trace headers variable was created (if the dataset was created with headers not None) + # * 'headers' and 'segy_file_header' variables were created (if the dataset was created with + # headers not None). 
The 'headers' variable structured datatype is defined by the HeaderSpec + # that was used to create the empty MDIO # * dataset attribute called 'attributes' was created ds = open_mdio(mdio_with_headers) @@ -223,7 +236,7 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: sum_squares=(np.ma.power(nonzero_samples, 2).sum(dtype="float64")), histogram=CenteredBinHistogram(bin_centers=[], counts=[]), ) - velocity.attrs["statsV1"] = stats.model_dump_json() + velocity.attrs["statsV1"] = stats.model_dump(mode="json") # 3) Populate the non-dimensional coordinate variables 'cdp_x' and 'cdp_y' (optional) origin = [270000, 3290000] # survey x, y origin @@ -239,24 +252,70 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: ds.trace_mask[:] = ~np.isnan(velocity[:, :, 0]) # 5) Set coordinate and data variable units (optional) - ds.time["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump_json() + ds.time.attrs["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump(mode="json") - ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() - ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump_json() + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump(mode="json") + ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump(mode="json") - velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump_json() + velocity.attrs["unitsV1"] = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND).model_dump(mode="json") - # 6) Populate dataset's segy trace headers, if those were created (optional) + # 6) Populate dataset's segy trace headers, if those were created (required only if we want to export to SEG-Y) if "headers" in ds.variables: - # numpy broadcasting (200, 1) array to (200, 300) array - ds["headers"].values["inline"] = ds.inline.values[:, np.newaxis] - # numpy 
broadcasting (1, 300) array to (200, 300) array - ds["headers"].values["crossline"] = ds.crossline.values[np.newaxis, :] - ds["headers"]["cdp_x"][:] = ds.cdp_x - ds["headers"]["cdp_y"][:] = ds.cdp_y + # Both the structured "headers" and the dummy "segy_file_header" variables are + # required to enable SEG-Y to MDIO conversion + + # Populate the structured trace "headers" variable + ds["headers"].values["inline"] = inline_grid + ds["headers"].values["crossline"] = xline_grid + # coordinate_scalar: + # Scalar to be applied to all coordinates specified in Standard Trace Header bytes + # 73–88 and to bytes Trace Header 181–188 to give the real value. Scalar = 1, + # ±10, ±100, ±1000, or ±10,000. If positive, scalar is used as a multiplier; if + # negative, scalar is used as divisor. A value of zero is assumed to be a scalar + # value of 1. + ds["headers"].values["coordinate_scalar"][:] = np.int16(-100) + ds["headers"].values["cdp_x"][:] = np.int32(ds.cdp_x * 100) + ds["headers"].values["cdp_y"][:] = np.int32(ds.cdp_y * 100) + + # Fill its metadata (.attrs) with 'textHeader' and 'binaryHeader'. 
+ ds["segy_file_header"].attrs.update( + { + "textHeader": "\n".join( + [ + "C01 BYTES 13-16: CROSSLINE " + " " * 47, + "C02 BYTES 17-20: INLINE " + " " * 47, + "C03 BYTES 71-74: COORDINATE SCALAR " + " " * 47, + "C04 BYTES 181-184: CDP X " + " " * 47, + "C05 BYTES 185-188: CDP Y " + " " * 47, + *(f"C{i:02d}" + " " * 77 for i in range(6, 41)), + ] + ), + "binaryHeader": { + "data_sample_format": 1, + "sample_interval": int(ds.time[1] - ds.time[0]), + "samples_per_trace": ds.time.size, + "segy_revision_major": 0, + "segy_revision_minor": 0, + }, + } + ) # 7) Create dataset's custom attributes (optional) ds.attrs["attributes"]["createdBy"] = "John Doe" - output_path = mdio_with_headers.parent / "populated_empty.mdio" - to_mdio(ds, output_path=output_path, mode="w", compute=True) + # 8) Export to MDIO + output_path_mdio = mdio_with_headers.parent / "populated_empty.mdio" + to_mdio(ds, output_path=output_path_mdio, mode="w", compute=True) + + # 9) Convert the populated emptyMDIO to SEG-Y + if "headers" in ds.variables: + # Select the SEG-Y standard to use for the conversion + custom_segy_spec = get_segy_standard(1.0) + # Customize to use the same HeaderSpec that was used to create the empty MDIO + custom_segy_spec.trace.header = self._get_customized_v10_trace_header_spec() + # Convert the MDIO file to SEG-Y + mdio_to_segy( + segy_spec=custom_segy_spec, + input_path=output_path_mdio, + output_path=mdio_with_headers.parent / "populated_empty.sgy", + ) From 903d78a428179b0eca3be018e2b4446a81fb7bee Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 6 Oct 2025 14:54:21 +0000 Subject: [PATCH 13/27] Update for upstream changes --- tests/conftest.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f72a493be..a9e1514f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -63,9 +63,5 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture(scope="class") def
empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" - if DEBUG_MODE: - tmp_dir = Path("tmp/empty_mdio") - tmp_dir.mkdir(parents=True, exist_ok=True) - else: - tmp_dir = tmp_path_factory.mktemp(r"empty_mdio") - return tmp_dir + return tmp_path_factory.mktemp(r"empty_mdio") + From 344cf659e32f2536bc5df440d07363e2c57c2d0b Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 6 Oct 2025 14:56:34 +0000 Subject: [PATCH 14/27] Pre-commit added empty line --- tests/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index a9e1514f6..d4890b17a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -64,4 +64,3 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" return tmp_path_factory.mktemp(r"empty_mdio") - From 83b771748e41317164c469c274fc06fbd0d63272 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 8 Oct 2025 19:55:37 +0000 Subject: [PATCH 15/27] Use Teapot dimensions --- tests/integration/test_create_empty.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 09d8dcc74..023ec2327 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -59,43 +59,43 @@ def _get_customized_v10_trace_header_spec(cls) -> HeaderSpec: def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: """Validate an empty MDIO dataset structure and content.""" # Check that the dataset has the expected shape - assert ds.sizes == {"inline": 200, "crossline": 300, "time": 750} + assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} # Validate the dimension coordinate variables - validate_variable(ds, "inline", (200,), ("inline",), np.int32,
range(100, 300), get_values) - validate_variable(ds, "crossline", (300,), ("crossline",), np.int32, range(1000, 1600, 2), get_values) - validate_variable(ds, "time", (750,), ("time",), np.int32, range(0, 3000, 4), get_values) + validate_variable(ds, "inline", (345,), ("inline",), np.int32, range(1, 346), get_values) + validate_variable(ds, "crossline", (188,), ("crossline",), np.int32, range(1, 189), get_values) + validate_variable(ds, "time", (1501,), ("time",), np.int32, range(0, 3002, 2), get_values) # Validate the non-dimensional coordinate variables (should be empty for empty dataset) - validate_variable(ds, "cdp_x", (200, 300), ("inline", "crossline"), np.float64, None, None) - validate_variable(ds, "cdp_y", (200, 300), ("inline", "crossline"), np.float64, None, None) + validate_variable(ds, "cdp_x", (345, 188), ("inline", "crossline"), np.float64, None, None) + validate_variable(ds, "cdp_y", (345, 188), ("inline", "crossline"), np.float64, None, None) if has_headers: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness header_dtype = cls._get_customized_v10_trace_header_spec().dtype.newbyteorder("native") - validate_variable(ds, "headers", (200, 300), ("inline", "crossline"), header_dtype, None, None) + validate_variable(ds, "headers", (345, 188), ("inline", "crossline"), header_dtype, None, None) validate_variable(ds, "segy_file_header", (), (), np.dtype("U1"), None, None) else: assert "headers" not in ds.variables assert "segy_file_header" not in ds.variables # Validate the trace mask (should be all True for empty dataset) - validate_variable(ds, "trace_mask", (200, 300), ("inline", "crossline"), np.bool_, None, None) + validate_variable(ds, "trace_mask", (345, 188), ("inline", "crossline"), np.bool_, None, None) trace_mask = ds["trace_mask"].values assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" # Validate the amplitude data (should be empty) - 
validate_variable(ds, "amplitude", (200, 300, 750), ("inline", "crossline", "time"), np.float32, None, None) + validate_variable(ds, "amplitude", (345, 188, 1501), ("inline", "crossline", "time"), np.float32, None, None) @classmethod def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: """Create a temporary empty MDIO file for testing.""" # Create the grid with the specified dimensions dims = [ - Dimension(name="inline", coords=range(100, 300, 1)), # 100-300 with step 1 - Dimension(name="crossline", coords=range(1000, 1600, 2)), # 1000-1600 with step 2 - Dimension(name="time", coords=range(0, 3000, 4)), # 0-3 seconds 4ms sample rate + Dimension(name="inline", coords=range(1, 346, 1)), # 100-300 with step 1 + Dimension(name="crossline", coords=range(1, 189, 1)), # 1000-1600 with step 2 + Dimension(name="time", coords=range(0, 3002, 2)), # 0-3 seconds 4ms sample rate ] # If later on, we want to export to SEG-Y, we need to provide the trace header spec. 
From 251e2f68cc4570ad8883a28b019d567ef5ace009 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 19:29:43 +0000 Subject: [PATCH 16/27] Merge 'upstream/main' into create_empty --- src/mdio/creators/mdio.py | 24 +++-- tests/integration/test_create_empty.py | 94 +++++++++++++++---- .../integration/test_segy_roundtrip_teapot.py | 41 +++++--- tests/integration/testing_helpers.py | 30 +++--- 4 files changed, 138 insertions(+), 51 deletions(-) diff --git a/src/mdio/creators/mdio.py b/src/mdio/creators/mdio.py index 200f3115a..a5432b4e6 100644 --- a/src/mdio/creators/mdio.py +++ b/src/mdio/creators/mdio.py @@ -20,11 +20,12 @@ from xarray import Dataset as xr_Dataset from mdio.builder.schemas import Dataset + from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.core.dimension import Dimension def create_empty( # noqa PLR0913 - mdio_template_name: str, + mdio_template: AbstractDatasetTemplate | str, dimensions: list[Dimension], output_path: UPath | Path | str, headers: HeaderSpec | None = None, @@ -33,7 +34,14 @@ def create_empty( # noqa PLR0913 """A function that creates an empty MDIO v1 file with known dimensions. Args: - mdio_template_name: The MDIO template to use to define the dataset structure. + mdio_template: The MDIO template or template name to use to define the dataset structure. + NOTE: If you want to have a unit-aware MDIO model, you need to add the units + to the template before calling this function. For example: + 'unit_aware_template = TemplateRegistry().get("PostStack3DTime")' + 'unit_aware_template.add_units({"time": UNITS_SECOND})' + 'unit_aware_template.add_units({"cdp_x": UNITS_METER})' + 'unit_aware_template.add_units({"cdp_y": UNITS_METER})' + 'create_empty(unit_aware_template, dimensions, output_path, headers, overwrite)' dimensions: The dimensions of the MDIO file. output_path: The universal path for the output MDIO v1 file. headers: SEG-Y v1.0 trace headers. Defaults to None.
@@ -50,13 +58,11 @@ def create_empty( # noqa PLR0913 header_dtype = to_structured_type(headers.dtype) if headers else None grid = Grid(dims=dimensions) - mdio_template = TemplateRegistry().get(mdio_template_name) - mdio_ds: Dataset = mdio_template.build_dataset( - name=mdio_template_name, - sizes=grid.shape, - horizontal_coord_unit=None, - header_dtype=header_dtype, - ) + if isinstance(mdio_template, str): + # A template name is passed in. Get a unit-unaware template from registry + mdio_template = TemplateRegistry().get(mdio_template) + # Build the dataset using the template + mdio_ds: Dataset = mdio_template.build_dataset(name=mdio_template.name, sizes=grid.shape, header_dtype=header_dtype) # Convert to xarray dataset xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 023ec2327..60f73e136 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -26,17 +26,61 @@ from tests.integration.testing_helpers import get_values -from tests.integration.testing_helpers import validate_variable +from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio.api.io import open_mdio from mdio.api.io import to_mdio from mdio.builder.schemas.v1.stats import CenteredBinHistogram from mdio.builder.schemas.v1.stats import SummaryStatistics +from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate from mdio.converters.mdio import mdio_to_segy from mdio.core import Dimension from mdio.creators.mdio import create_empty +UNITS_NONE = None +UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) +UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) +UNITS_METER_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND) +UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) +UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND) + + 
+class PostStack3DVelocityTemplate(Seismic3DPostStackTemplate): + """Custom template that uses 'velocity' as the default variable name instead of 'amplitude'.""" + + @property + def _default_variable_name(self) -> str: + """Override the default variable name.""" + return "velocity" + + def __init__(self, data_domain: str, is_metric: bool) -> None: + super().__init__(data_domain) + if is_metric: + self._units.update( + { + "time": UNITS_SECOND, + "cdp_x": UNITS_METER, + "cdp_y": UNITS_METER, + "velocity": UNITS_METER_PER_SECOND, + } + ) + else: + self._units.update( + { + "time": UNITS_SECOND, + "cdp_x": UNITS_FOOT, + "cdp_y": UNITS_FOOT, + "velocity": UNITS_FEET_PER_SECOND, + } + ) + + @property + def _name(self) -> str: + """Override the name of the template.""" + domain_suffix = self._data_domain.capitalize() + return f"PostStack3DVelocity{domain_suffix}" + class TestCreateEmptyPostStack3DTimeMdio: """Tests for create_empty_mdio function.""" @@ -58,35 +102,51 @@ def _get_customized_v10_trace_header_spec(cls) -> HeaderSpec: @classmethod def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: """Validate an empty MDIO dataset structure and content.""" + assert ds.name == "PostStack3DVelocityTime" # Check that the dataset has the expected shape assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} # Validate the dimension coordinate variables - validate_variable(ds, "inline", (345,), ("inline",), np.int32, range(1, 346), get_values) - validate_variable(ds, "crossline", (188,), ("crossline",), np.int32, range(1, 189), get_values) - validate_variable(ds, "time", (1501,), ("time",), np.int32, range(0, 3002, 2), get_values) + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, range(1, 346), get_values) + validate_xr_variable(ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, range(1, 189), get_values) + validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, range(0, 3002, 2), 
get_values) # Validate the non-dimensional coordinate variables (should be empty for empty dataset) - validate_variable(ds, "cdp_x", (345, 188), ("inline", "crossline"), np.float64, None, None) - validate_variable(ds, "cdp_y", (345, 188), ("inline", "crossline"), np.float64, None, None) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) if has_headers: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness header_dtype = cls._get_customized_v10_trace_header_spec().dtype.newbyteorder("native") - validate_variable(ds, "headers", (345, 188), ("inline", "crossline"), header_dtype, None, None) - validate_variable(ds, "segy_file_header", (), (), np.dtype("U1"), None, None) + validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype, None, None) + validate_xr_variable( + ds, + "segy_file_header", + dims={}, + units=UNITS_NONE, + data_type=np.dtype("U1"), + expected_values=None, + actual_value_generator=None, + ) else: assert "headers" not in ds.variables assert "segy_file_header" not in ds.variables - # Validate the trace mask (should be all True for empty dataset) - validate_variable(ds, "trace_mask", (345, 188), ("inline", "crossline"), np.bool_, None, None) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_, None, None) trace_mask = ds["trace_mask"].values assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" # Validate the amplitude data (should be empty) - validate_variable(ds, "amplitude", (345, 188, 1501), ("inline", "crossline", "time"), np.float32, None, None) + validate_xr_variable( + ds, + "velocity", + {"inline": 345, "crossline": 188, "time": 1501}, + UNITS_METER_PER_SECOND, + np.float32, + None, + None, + ) @classmethod def 
_create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: @@ -101,8 +161,10 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: # If later on, we want to export to SEG-Y, we need to provide the trace header spec. # The HeaderSpec can be either standard or customized. headers = cls._get_customized_v10_trace_header_spec() if create_headers else None + + # Create an empty MDIO v1 metric post-stack 3D time velocity dataset create_empty( - mdio_template_name="PostStack3DTime", + mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), dimensions=dims, output_path=output_path, headers=headers, @@ -138,7 +200,7 @@ def test_dataset_metadata(self, mdio_with_headers: Path) -> None: # Check basic metadata attributes expected_attrs = { "apiVersion": __version__, - "name": "PostStack3DTime", + "name": "PostStack3DVelocityTime", } actual_attrs_json = ds.attrs @@ -159,7 +221,7 @@ def test_dataset_metadata(self, mdio_with_headers: Path) -> None: assert attributes is not None assert len(attributes) == 3 # Validate all attributes provided by the abstract template - assert attributes["defaultVariableName"] == "amplitude" + assert attributes["defaultVariableName"] == "velocity" assert attributes["surveyType"] == "3D" assert attributes["gatherType"] == "stacked" @@ -203,9 +265,9 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: """Test showing how to populate empty dataset.""" - # Open an empty PostStack3DTime dataset with SEG-Y 1.0 headers + # Open an empty PostStack3DVelocityTime dataset with SEG-Y 1.0 headers # NOTES: - # When this empty dataset was created from the 'PostStack3DTime' template and dimensions, + # When this empty dataset was created from the 'PostStack3DVelocityTime' template and dimensions, # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and 
pre-populated # * 'cdp_x', 'cdp_y' non-dimensional coordinate variables were created # * 'amplitude' variable was created (the name of this variable is specified in the template) diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index b950a8f03..ca8f41270 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -14,11 +14,15 @@ from segy.standards import get_segy_standard from tests.integration.testing_helpers import get_inline_header_values from tests.integration.testing_helpers import get_values -from tests.integration.testing_helpers import validate_variable +from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio import mdio_to_segy from mdio.api.io import open_mdio +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.schemas.v1.units import TimeUnitEnum +from mdio.builder.schemas.v1.units import TimeUnitModel from mdio.builder.template_registry import TemplateRegistry from mdio.converters.segy import segy_to_mdio from mdio.segy.file import SegyFileWrapper @@ -148,6 +152,11 @@ def raw_binary_header_teapot_dome() -> str: ) +UNITS_NONE = None +UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) +UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) + + class TestTeapotRoundtrip: """Tests for Teapot Dome data ingestion and export.""" @@ -163,9 +172,13 @@ def test_teapot_import( NOTE: This test must be executed before the 'TestReader' and 'TestExport' tests. 
""" + unit_aware_template = TemplateRegistry().get("PostStack3DTime") + unit_aware_template.add_units({"time": UNITS_SECOND}) + unit_aware_template.add_units({"cdp_x": UNITS_METER}) + unit_aware_template.add_units({"cdp_y": UNITS_METER}) segy_to_mdio( segy_spec=teapot_segy_spec, - mdio_template=TemplateRegistry().get("PostStack3DTime"), + mdio_template=unit_aware_template, input_path=segy_input, output_path=zarr_tmp, overwrite=True, @@ -224,38 +237,38 @@ def test_grid(self, zarr_tmp: Path, teapot_segy_spec: SegySpec) -> None: ds = open_mdio(zarr_tmp) # Validate the dimension coordinate variables - validate_variable(ds, "inline", (345,), ("inline",), np.int32, range(1, 346), get_values) - validate_variable(ds, "crossline", (188,), ("crossline",), np.int32, range(1, 189), get_values) - validate_variable(ds, "time", (1501,), ("time",), np.int32, range(0, 3002, 2), get_values) + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, range(1, 346), get_values) + validate_xr_variable(ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, range(1, 189), get_values) + validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, range(0, 3002, 2), get_values) # Validate the non-dimensional coordinate variables - validate_variable(ds, "cdp_x", (345, 188), ("inline", "crossline"), np.float64, None, None) - validate_variable(ds, "cdp_y", (345, 188), ("inline", "crossline"), np.float64, None, None) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) # Validate the headers # We have a custom set of headers since we used customize_segy_specs() segy_spec = teapot_segy_spec data_type = segy_spec.trace.header.dtype - validate_variable( + validate_xr_variable( ds, "headers", - (345, 188), - ("inline", "crossline"), + {"inline": 345, "crossline": 188}, + UNITS_NONE, 
data_type.newbyteorder("native"), # mdio saves with machine endian, spec could be different endian range(1, 346), get_inline_header_values, ) # Validate the trace mask - validate_variable(ds, "trace_mask", (345, 188), ("inline", "crossline"), np.bool, None, None) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool, None, None) # validate the amplitude data - validate_variable( + validate_xr_variable( ds, "amplitude", - (345, 188, 1501), - ("inline", "crossline", "time"), + {"inline": 345, "crossline": 188, "time": 1501}, + UNITS_NONE, np.float32, None, None, diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index c871ba6db..6887abab1 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -4,7 +4,8 @@ import numpy as np import xarray as xr -from numpy.typing import DTypeLike + +from mdio.builder.schemas.v1.units import AllUnitModel def get_values(arr: xr.DataArray) -> np.ndarray: @@ -17,35 +18,40 @@ def get_inline_header_values(dataset: xr.Dataset) -> np.ndarray: return dataset["inline"].values -def validate_variable( # noqa PLR0913 +def validate_xr_variable( # noqa PLR0913 dataset: xr.Dataset, name: str, - shape: tuple[int, ...], - dims: tuple[str, ...], - data_type: DTypeLike, + dims: dict[int], + units: AllUnitModel, + data_type: np.dtype, expected_values: range | None, actual_value_generator: Callable[[xr.DataArray], np.ndarray] | None = None, ) -> None: """Validate the properties of a variable in an Xarray dataset.""" - arr = dataset[name] - assert shape == arr.shape - assert set(dims) == set(arr.dims) + v = dataset[name] + assert v is not None + assert v.sizes == dims if hasattr(data_type, "fields") and data_type.fields is not None: # The following assertion will fail because of differences in offsets # assert data_type == arr.dtype # Compare field names expected_names = list(data_type.names) - actual_names = list(arr.dtype.names) + 
actual_names = list(v.dtype.names) assert expected_names == actual_names # Compare field types expected_types = [data_type[name] for name in data_type.names] - actual_types = [arr.dtype[name] for name in arr.dtype.names] + actual_types = [v.dtype[name] for name in v.dtype.names] assert expected_types == actual_types else: - assert data_type == arr.dtype + assert data_type == v.dtype + + if units is not None: + assert v.attrs == {"unitsV1": units.model_dump(mode="json")} + else: + assert "unitsV1" not in v.attrs if expected_values is not None and actual_value_generator is not None: - actual_values = actual_value_generator(arr) + actual_values = actual_value_generator(v) assert np.array_equal(expected_values, actual_values) From 325466f72c1ee5a673786c0a4d3abc39b1b6b437 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 19:59:14 +0000 Subject: [PATCH 17/27] move creators/mdio.py : create_empty() to api/create.py: create_empty( --- src/mdio/api/__init__.py | 4 ++++ src/mdio/{creators/mdio.py => api/create.py} | 1 + src/mdio/creators/__init__.py | 5 ----- tests/integration/test_create_empty.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) rename src/mdio/{creators/mdio.py => api/create.py} (99%) delete mode 100644 src/mdio/creators/__init__.py diff --git a/src/mdio/api/__init__.py b/src/mdio/api/__init__.py index f731a65c2..60d43f171 100644 --- a/src/mdio/api/__init__.py +++ b/src/mdio/api/__init__.py @@ -1 +1,5 @@ """Public API.""" + +from mdio.api.create import create_empty + +__all__ = ["create_empty"] diff --git a/src/mdio/creators/mdio.py b/src/mdio/api/create.py similarity index 99% rename from src/mdio/creators/mdio.py rename to src/mdio/api/create.py index a5432b4e6..3200390a4 100644 --- a/src/mdio/creators/mdio.py +++ b/src/mdio/api/create.py @@ -83,3 +83,4 @@ def create_empty( # noqa PLR0913 # Write the dimension coordinates and trace mask meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] to_mdio(meta_ds, output_path=output_path, 
mode="r+", compute=True) + diff --git a/src/mdio/creators/__init__.py b/src/mdio/creators/__init__.py deleted file mode 100644 index 4eed5f9c9..000000000 --- a/src/mdio/creators/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""MDIO Data creation API.""" - -from mdio.creators.mdio import create_empty - -__all__ = ["create_empty"] diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 60f73e136..59049fb05 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -29,6 +29,7 @@ from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ +from mdio.api.create import create_empty from mdio.api.io import open_mdio from mdio.api.io import to_mdio from mdio.builder.schemas.v1.stats import CenteredBinHistogram @@ -36,7 +37,6 @@ from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate from mdio.converters.mdio import mdio_to_segy from mdio.core import Dimension -from mdio.creators.mdio import create_empty UNITS_NONE = None UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) From 1a95f822553b3240ea1a415e89af725f4ff0ebdf Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 21:59:18 +0000 Subject: [PATCH 18/27] create_empty_like --- pyproject.toml | 1 + src/mdio/api/__init__.py | 3 +- src/mdio/api/create.py | 81 +++++++++++++ src/mdio/builder/dataset_builder.py | 3 +- src/mdio/converters/__init__.py | 28 ++++- tests/conftest.py | 38 +++++- tests/integration/test_create_empty.py | 109 +++++++++++------- .../test_import_streamer_grid_overrides.py | 22 ++-- .../integration/test_segy_roundtrip_teapot.py | 38 +++--- tests/integration/testing_helpers.py | 4 +- tests/unit/v1/test_dataset_serializer.py | 4 + uv.lock | 14 +++ 12 files changed, 268 insertions(+), 77 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 518eb00d8..e7ceec4d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ 
"pre-commit-hooks>=6.0.0", "pytest>=8.4.2", "pytest-dependency>=0.6.0", + "pytest-order>=1.3.0", "typeguard>=4.4.4", "xdoctest[colors]>=1.3.0", "Pygments>=2.19.2" diff --git a/src/mdio/api/__init__.py b/src/mdio/api/__init__.py index 60d43f171..44bc79850 100644 --- a/src/mdio/api/__init__.py +++ b/src/mdio/api/__init__.py @@ -1,5 +1,6 @@ """Public API.""" from mdio.api.create import create_empty +from mdio.api.create import create_empty_like -__all__ = ["create_empty"] +__all__ = ["create_empty", "create_empty_like"] diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py index 3200390a4..1e0ee67a0 100644 --- a/src/mdio/api/create.py +++ b/src/mdio/api/create.py @@ -2,9 +2,12 @@ from __future__ import annotations +from datetime import UTC +from datetime import datetime from typing import TYPE_CHECKING from mdio.api.io import _normalize_path +from mdio.api.io import open_mdio from mdio.api.io import to_mdio from mdio.builder.template_registry import TemplateRegistry from mdio.builder.xarray_builder import to_xarray_dataset @@ -84,3 +87,81 @@ def create_empty( # noqa PLR0913 meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) + +def create_empty_like( # noqa PLR0913 + input_path: UPath | Path | str, + output_path: UPath | Path | str, + keep_coordinates: bool = False, + overwrite: bool = False, +) -> xr_Dataset: + """A function that creates an empty MDIO v1 file with the same structure as an existing one. + + Args: + input_path: The path of the input MDIO file. + output_path: The path of the output MDIO file. + If None, the output will not be written to disk. + keep_coordinates: Whether to keep the coordinates in the output file. + overwrite: Whether to overwrite the output file if it exists. + + Returns: + The output MDIO dataset. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. 
+ """ + input_path = _normalize_path(input_path) + output_path = _normalize_path(output_path) if output_path is not None else None + + if not overwrite and output_path is not None and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." + raise FileExistsError(err) + + ds = open_mdio(input_path) + + # Create a copy with the same structure but no data or, + # optionally, coordinates + ds_output = ds.copy(data=None).reset_coords(drop=not keep_coordinates) + + # Dataset + # Keep the name (which is the same as the used template name) and the original API version + # ds_output.attrs["name"] + # ds_output.attrs["apiVersion"] + ds_output.attrs["createdOn"] = datetime.now(UTC) + + # Coordinates + if not keep_coordinates: + for coord_name in ds_output.coords: + ds_output[coord_name].attrs["unitsV1"] = None + + # MDIO attributes + attr = ds_output.attrs["attributes"] + if attr is not None: + attr.pop("gridOverrides", None) # Empty dataset should not have gridOverrides + # Keep the original values for the following attributes + # attr["defaultVariableName"] + # attr["surveyType"] + # attr["gatherType"] + + # "All traces should be marked as dead in empty dataset" + if "trace_mask" in ds_output.variables: + ds_output["trace_mask"][:] = False + + # Data variable + var_name = attr["defaultVariableName"] + var = ds_output[var_name] + var.attrs["statsV1"] = None + if not keep_coordinates: + var.attrs["unitsV1"] = None + + # SEG-Y file header + if "segy_file_header" in ds_output.variables: + segy_file_header = ds_output["segy_file_header"] + if segy_file_header is not None: + segy_file_header.attrs["textHeader"] = None + segy_file_header.attrs["binaryHeader"] = None + segy_file_header.attrs["rawBinaryHeader"] = None + + if output_path is not None: + to_mdio(ds_output, output_path=output_path, mode="w", compute=True) + + return ds_output diff --git a/src/mdio/builder/dataset_builder.py 
b/src/mdio/builder/dataset_builder.py index 1cc515988..5fc0b288e 100644 --- a/src/mdio/builder/dataset_builder.py +++ b/src/mdio/builder/dataset_builder.py @@ -6,7 +6,6 @@ from enum import auto from typing import Any -from mdio import __version__ from mdio.builder.formatting_html import dataset_builder_repr_html from mdio.builder.schemas.compressors import ZFP from mdio.builder.schemas.compressors import Blosc @@ -59,6 +58,8 @@ class MDIODatasetBuilder: """ def __init__(self, name: str, attributes: dict[str, Any] | None = None): + from mdio import __version__ # noqa: PLC0415 - fixed circular import in mdio package and dataset_builder.py + self._metadata = DatasetMetadata( name=name, api_version=__version__, diff --git a/src/mdio/converters/__init__.py b/src/mdio/converters/__init__.py index fd88595ff..753a0fc56 100644 --- a/src/mdio/converters/__init__.py +++ b/src/mdio/converters/__init__.py @@ -1,6 +1,30 @@ """MDIO Data conversion API.""" -from mdio.converters.mdio import mdio_to_segy -from mdio.converters.segy import segy_to_mdio +from typing import TYPE_CHECKING +from typing import Any + +if TYPE_CHECKING: + from mdio.converters.mdio import mdio_to_segy + from mdio.converters.segy import segy_to_mdio __all__ = ["mdio_to_segy", "segy_to_mdio"] + + +def __getattr__(name: str) -> Any: # noqa: ANN401 - required for dynamic attribute access + """Lazy import for converters to avoid circular imports.""" + if name == "mdio_to_segy": + from mdio.converters.mdio import ( # noqa: PLC0415 - intentionally inside the function to avoid circular imports + mdio_to_segy, + ) + + return mdio_to_segy + + if name == "segy_to_mdio": + from mdio.converters.segy import ( # noqa: PLC0415 - intentionally inside the function to avoid circular imports + segy_to_mdio, + ) + + return segy_to_mdio + + err = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(err) diff --git a/tests/conftest.py b/tests/conftest.py index d4890b17a..0ad8a033e 100644 --- a/tests/conftest.py 
+++ b/tests/conftest.py @@ -47,6 +47,18 @@ def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: return tmp_path_factory.mktemp(r"mdio") +@pytest.fixture(scope="session") +def teapot_mdio_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for the output MDIO.""" + return tmp_path_factory.mktemp(r"teapot.mdio") + + +@pytest.fixture(scope="module") +def mdio_4d_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Make a temp file for the output MDIO.""" + return tmp_path_factory.mktemp(r"tmp_4d.mdio") + + @pytest.fixture(scope="module") def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: # pragma: no cover - used by disabled test """Make a temp file for the output MDIO.""" @@ -63,4 +75,28 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture(scope="class") def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" - return tmp_path_factory.mktemp(r"empty_mdio") + return tmp_path_factory.mktemp(r"empty_mdio_dir") + + +# +# Uncomment the function below for local debugging +# +# @pytest.fixture(scope="session") +# def tmp_path_factory() -> pytest.TempPathFactory: +# """Custom tmp_path_factory implementation for local debugging.""" +# from pathlib import Path # noqa: PLC0415 + +# class DebugTempPathFactory: +# def __init__(self) -> None: +# self._retention_policy = "all" + +# def mktemp(self, basename: str, numbered: bool = True) -> Path: +# _ = numbered +# path = self.getbasetemp() / basename +# path.mkdir(parents=True, exist_ok=True) +# return path + +# def getbasetemp(self) -> Path: +# return Path("tmp") + +# return DebugTempPathFactory() diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 59049fb05..cd130a301 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -7,9 +7,6 @@ import numpy as np import pytest -from segy.schema 
import HeaderField -from segy.schema import HeaderSpec -from segy.schema import ScalarType from segy.standards import get_segy_standard from mdio.builder.schemas.v1.units import LengthUnitEnum @@ -25,11 +22,13 @@ from xarray import Dataset as xr_Dataset +from tests.integration.test_segy_roundtrip_teapot import get_teapot_segy_spec from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio.api.create import create_empty +from mdio.api.create import create_empty_like from mdio.api.io import open_mdio from mdio.api.io import to_mdio from mdio.builder.schemas.v1.stats import CenteredBinHistogram @@ -82,27 +81,13 @@ def _name(self) -> str: return f"PostStack3DVelocity{domain_suffix}" -class TestCreateEmptyPostStack3DTimeMdio: +@pytest.mark.order(1000) +class TestCreateEmptyMdio: """Tests for create_empty_mdio function.""" @classmethod - def _get_customized_v10_trace_header_spec(cls) -> HeaderSpec: - """Get the header spec for the MDIO dataset.""" - trace_header_fields = [ - HeaderField(name="inline", byte=17, format=ScalarType.INT32), - HeaderField(name="crossline", byte=13, format=ScalarType.INT32), - HeaderField(name="cdp_x", byte=181, format=ScalarType.INT32), - HeaderField(name="cdp_y", byte=185, format=ScalarType.INT32), - HeaderField(name="coordinate_scalar", byte=71, format=ScalarType.INT16), - ] - hs: HeaderSpec = get_segy_standard(1.0).trace.header - hs.customize(fields=trace_header_fields) - return hs - - @classmethod - def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None: + def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool, is_velocity: bool) -> None: """Validate an empty MDIO dataset structure and content.""" - assert ds.name == "PostStack3DVelocityTime" # Check that the dataset has the expected shape assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} @@ -118,7 +103,7 @@ def 
_validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None if has_headers: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness - header_dtype = cls._get_customized_v10_trace_header_spec().dtype.newbyteorder("native") + header_dtype = get_teapot_segy_spec().trace.header.dtype.newbyteorder("native") validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype, None, None) validate_xr_variable( ds, @@ -132,21 +117,33 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool) -> None else: assert "headers" not in ds.variables assert "segy_file_header" not in ds.variables + # Validate the trace mask (should be all True for empty dataset) validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_, None, None) trace_mask = ds["trace_mask"].values assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" - # Validate the amplitude data (should be empty) - validate_xr_variable( - ds, - "velocity", - {"inline": 345, "crossline": 188, "time": 1501}, - UNITS_METER_PER_SECOND, - np.float32, - None, - None, - ) + # Validate the velocity or amplitude data (should be empty) + if is_velocity: + validate_xr_variable( + ds, + "velocity", + {"inline": 345, "crossline": 188, "time": 1501}, + UNITS_METER_PER_SECOND, + np.float32, + None, + None, + ) + else: + validate_xr_variable( + ds, + "amplitude", + {"inline": 345, "crossline": 188, "time": 1501}, + UNITS_NONE, + np.float32, + None, + None, + ) @classmethod def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: @@ -160,8 +157,7 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: # If later on, we want to export to SEG-Y, we need to provide the trace header spec. # The HeaderSpec can be either standard or customized. 
- headers = cls._get_customized_v10_trace_header_spec() if create_headers else None - + headers = get_teapot_segy_spec().trace.header if create_headers else None # Create an empty MDIO v1 metric post-stack 3D time velocity dataset create_empty( mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), @@ -178,7 +174,7 @@ def mdio_with_headers(self, empty_mdio_dir: Path) -> Path: This fixture is scoped to the class level, so it will be executed only once and shared across all test methods in the class. """ - empty_mdio: Path = empty_mdio_dir / "with_headers.mdio" + empty_mdio: Path = empty_mdio_dir / "mdio_with_headers.mdio" self._create_empty_mdio(create_headers=True, output_path=empty_mdio) return empty_mdio @@ -189,18 +185,21 @@ def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: This fixture is scoped to the class level, so it will be executed only once and shared across all test methods in the class. """ - empty_mdio: Path = empty_mdio_dir / "no_headers.mdio" + empty_mdio: Path = empty_mdio_dir / "mdio_no_headers.mdio" self._create_empty_mdio(create_headers=False, output_path=empty_mdio) return empty_mdio - def test_dataset_metadata(self, mdio_with_headers: Path) -> None: - """Test dataset metadata for empty MDIO file.""" - ds = open_mdio(mdio_with_headers) + def validate_dataset_metadata(self, ds: xr_Dataset, is_velocity: bool) -> None: + """Validate the dataset metadata.""" + if is_velocity: + assert ds.name == "PostStack3DVelocityTime" + else: + assert ds.name == "PostStack3DTime" # Check basic metadata attributes expected_attrs = { "apiVersion": __version__, - "name": "PostStack3DVelocityTime", + "name": ds.name, } actual_attrs_json = ds.attrs @@ -221,17 +220,25 @@ def test_dataset_metadata(self, mdio_with_headers: Path) -> None: assert attributes is not None assert len(attributes) == 3 # Validate all attributes provided by the abstract template - assert attributes["defaultVariableName"] == "velocity" + if is_velocity: + assert 
attributes["defaultVariableName"] == "velocity" + else: + assert attributes["defaultVariableName"] == "amplitude" assert attributes["surveyType"] == "3D" assert attributes["gatherType"] == "stacked" + def test_dataset_metadata(self, mdio_with_headers: Path) -> None: + """Test dataset metadata for empty MDIO file.""" + ds = open_mdio(mdio_with_headers) + self.validate_dataset_metadata(ds, is_velocity=True) + def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: """Test grid validation for empty MDIO file.""" ds = open_mdio(mdio_with_headers) - self._validate_empty_mdio_dataset(ds, has_headers=True) + self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=True) ds = open_mdio(mdio_no_headers) - self._validate_empty_mdio_dataset(ds, has_headers=False) + self._validate_empty_mdio_dataset(ds, has_headers=False, is_velocity=True) def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: """Test overwrite parameter behavior in create_empty_mdio.""" @@ -257,7 +264,7 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) - self._validate_empty_mdio_dataset(ds, has_headers=True) + self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=True) # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" @@ -374,10 +381,24 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: # Select the SEG-Y standard to use for the conversion custom_segy_spec = get_segy_standard(1.0) # Customize to use the same HeaderSpec that was used to create the empty MDIO - custom_segy_spec.trace.header = self._get_customized_v10_trace_header_spec() + custom_segy_spec.trace.header = get_teapot_segy_spec().trace.header # Convert the MDIO file to SEG-Y mdio_to_segy( segy_spec=custom_segy_spec, input_path=output_path_mdio, 
output_path=mdio_with_headers.parent / "populated_empty.sgy", ) + + @pytest.mark.order(1001) + @pytest.mark.dependency + def test_create_empty_like(self, teapot_mdio_tmp: Path, mdio_with_headers: Path) -> None: + """Create an empty MDIO file like the input file.""" + _ = mdio_with_headers + ds = create_empty_like( + input_path=teapot_mdio_tmp, + output_path=None, # We don't want to write to disk for now + keep_coordinates=True, + overwrite=True, + ) + self.validate_dataset_metadata(ds, is_velocity=False) + self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=False) diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index d05070f5d..dbcc4dca3 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -39,7 +39,7 @@ class TestImport4DNonReg: # pragma: no cover - tests is skipped def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], chan_header_type: StreamerShotGeometryType, ) -> None: @@ -51,7 +51,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -62,7 +62,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override @@ -86,7 +86,7 @@ class TestImport4D: def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], 
chan_header_type: StreamerShotGeometryType, ) -> None: @@ -98,7 +98,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -109,7 +109,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"].get("gridOverrides", None) == grid_override # may not exist, so default=None @@ -134,7 +134,7 @@ class TestImport4DSparse: def test_import_4d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, chan_header_type: StreamerShotGeometryType, ) -> None: """Test importing a SEG-Y file to MDIO.""" @@ -148,7 +148,7 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, ) @@ -167,7 +167,7 @@ class TestImport6D: # pragma: no cover - tests is skipped def test_import_6d_segy( # noqa: PLR0913 self, segy_mock_4d_shots: dict[StreamerShotGeometryType, Path], - zarr_tmp: Path, + mdio_4d_tmp: Path, grid_override: dict[str, Any], chan_header_type: StreamerShotGeometryType, ) -> None: @@ -179,7 +179,7 @@ def test_import_6d_segy( # noqa: PLR0913 segy_spec=segy_spec, mdio_template=TemplateRegistry().get("XYZ"), # Placeholder for the template input_path=segy_path, - output_path=zarr_tmp, + output_path=mdio_4d_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -195,7 +195,7 @@ def test_import_6d_segy( # noqa: PLR0913 guns = [1, 2] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(mdio_4d_tmp) 
xrt.assert_duckarray_equal(ds["gun"], guns) xrt.assert_duckarray_equal(ds["shot_point"], shots) diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index ca8f41270..444e0b12e 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -46,6 +46,11 @@ def set_env_vars(monkeypatch: Generator[pytest.MonkeyPatch]) -> None: @pytest.fixture def teapot_segy_spec() -> SegySpec: + """Return the customized SEG-Y specification for the teapot dome dataset.""" + return get_teapot_segy_spec() + + +def get_teapot_segy_spec() -> SegySpec: """Return the customized SEG-Y specification for the teapot dome dataset.""" teapot_fields = [ HeaderField(name="inline", byte=17, format=ScalarType.INT32), @@ -157,6 +162,7 @@ def raw_binary_header_teapot_dome() -> str: UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) +@pytest.mark.order(1) class TestTeapotRoundtrip: """Tests for Teapot Dome data ingestion and export.""" @@ -165,7 +171,7 @@ class TestTeapotRoundtrip: def test_teapot_import( self, segy_input: Path, - zarr_tmp: Path, + teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec, ) -> None: """Test importing a SEG-Y file to MDIO. 
@@ -180,14 +186,14 @@ def test_teapot_import( segy_spec=teapot_segy_spec, mdio_template=unit_aware_template, input_path=segy_input, - output_path=zarr_tmp, + output_path=teapot_mdio_tmp, overwrite=True, ) @pytest.mark.dependency("test_3d_import") - def test_dataset_metadata(self, zarr_tmp: Path) -> None: + def test_dataset_metadata(self, teapot_mdio_tmp: Path) -> None: """Metadata reading tests.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) expected_attrs = { "apiVersion": __version__, "createdOn": "2025-08-06 16:21:54.747880+00:00", @@ -215,9 +221,9 @@ def test_dataset_metadata(self, zarr_tmp: Path) -> None: assert segy_file_header.attrs["binaryHeader"] == binary_header_teapot_dome() assert segy_file_header.attrs["rawBinaryHeader"] == raw_binary_header_teapot_dome() - def test_variable_metadata(self, zarr_tmp: Path) -> None: + def test_variable_metadata(self, teapot_mdio_tmp: Path) -> None: """Metadata reading tests.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) expected_attrs = { "count": 46854270, "sum": -8594.551589292674, @@ -232,9 +238,9 @@ def test_variable_metadata(self, zarr_tmp: Path) -> None: expected_attrs.pop("histogram") np.testing.assert_allclose(list(actual_attrs.values()), list(expected_attrs.values())) - def test_grid(self, zarr_tmp: Path, teapot_segy_spec: SegySpec) -> None: + def test_grid(self, teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec) -> None: """Test validating MDIO variables.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) # Validate the dimension coordinate variables validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, range(1, 346), get_values) @@ -274,35 +280,35 @@ def test_grid(self, zarr_tmp: Path, teapot_segy_spec: SegySpec) -> None: None, ) - def test_inline_reads(self, zarr_tmp: Path) -> None: + def test_inline_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 75 inlines' mean and std. 
dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) inlines = ds["amplitude"][::75, :, :] mean, std = inlines.mean(dtype="float64"), inlines.std(dtype="float64") npt.assert_allclose([mean, std], [0.00010555267, 0.60027058412]) # 11 precision - def test_crossline_reads(self, zarr_tmp: Path) -> None: + def test_crossline_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 75 crosslines' mean and std. dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) xlines = ds["amplitude"][:, ::75, :] mean, std = xlines.mean(dtype="float64"), xlines.std(dtype="float64") npt.assert_allclose([mean, std], [-5.03298501828e-05, 0.59406807762]) # 11 precision - def test_zslice_reads(self, zarr_tmp: Path) -> None: + def test_zslice_reads(self, teapot_mdio_tmp: Path) -> None: """Read and compare every 225 z-slices' mean and std. dev.""" - ds = open_mdio(zarr_tmp) + ds = open_mdio(teapot_mdio_tmp) slices = ds["amplitude"][:, :, ::225] mean, std = slices.mean(dtype="float64"), slices.std(dtype="float64") npt.assert_allclose([mean, std], [0.00523692339, 0.61279943571]) # 11 precision @pytest.mark.dependency("test_3d_import") def test_3d_export( - self, segy_input: Path, zarr_tmp: Path, segy_export_tmp: Path, teapot_segy_spec: SegySpec + self, segy_input: Path, teapot_mdio_tmp: Path, segy_export_tmp: Path, teapot_segy_spec: SegySpec ) -> None: """Test 3D export.""" rng = np.random.default_rng(seed=1234) - mdio_to_segy(segy_spec=teapot_segy_spec, input_path=zarr_tmp, output_path=segy_export_tmp) + mdio_to_segy(segy_spec=teapot_segy_spec, input_path=teapot_mdio_tmp, output_path=segy_export_tmp) # Check if file sizes match on IBM file. 
assert segy_input.stat().st_size == segy_export_tmp.stat().st_size diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index 6887abab1..ebd4146f8 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -47,10 +47,12 @@ def validate_xr_variable( # noqa PLR0913 else: assert data_type == v.dtype + assert v.attrs.get("statsV1", None) is None, "StatsV1 should be empty for empty dataset variables" + if units is not None: assert v.attrs == {"unitsV1": units.model_dump(mode="json")} else: - assert "unitsV1" not in v.attrs + assert "unitsV1" not in v.attrs, "UnitsV1 should not exist for unit-unaware variables" if expected_values is not None and actual_value_generator is not None: actual_values = actual_value_generator(v) diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index 45c19665f..464d9e6cc 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -1,5 +1,6 @@ """Tests the schema v1 dataset_serializer public API.""" +import shutil from pathlib import Path import numpy as np @@ -302,4 +303,7 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> No xr_ds = to_xarray_dataset(dataset) file_path = f"{tmp_path}/{xr_ds.attrs['name']}.zarr" + # Delete the directory if it exists from running previous tests + if Path(file_path).exists(): + shutil.rmtree(file_path) to_mdio(xr_ds, output_path=file_path, mode="w-", compute=False) diff --git a/uv.lock b/uv.lock index 894419f5c..6e214b48d 100644 --- a/uv.lock +++ b/uv.lock @@ -1929,6 +1929,7 @@ dev = [ { name = "pygments" }, { name = "pytest" }, { name = "pytest-dependency" }, + { name = "pytest-order" }, { name = "ruff" }, { name = "typeguard" }, { name = "xdoctest", extra = ["colors"] }, @@ -1981,6 +1982,7 @@ dev = [ { name = "pygments", specifier = ">=2.19.2" }, { name = "pytest", specifier = ">=8.4.2" }, { name = "pytest-dependency", 
specifier = ">=0.6.0" }, + { name = "pytest-order", specifier = ">=1.3.0" }, { name = "ruff", specifier = ">=0.14.0" }, { name = "typeguard", specifier = ">=4.4.4" }, { name = "xdoctest", extras = ["colors"], specifier = ">=1.3.0" }, @@ -2821,6 +2823,18 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/7e/3b/317cc04e77d707d338540ca67b619df8f247f3f4c9f40e67bf5ea503ad94/pytest-dependency-0.6.0.tar.gz", hash = "sha256:934b0e6a39d95995062c193f7eaeed8a8ffa06ff1bcef4b62b0dc74a708bacc1", size = 19499, upload-time = "2023-12-31T20:38:54.991Z" } +[[package]] +name = "pytest-order" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/66/02ae17461b14a52ce5a29ae2900156b9110d1de34721ccc16ccd79419876/pytest_order-1.3.0.tar.gz", hash = "sha256:51608fec3d3ee9c0adaea94daa124a5c4c1d2bb99b00269f098f414307f23dde", size = 47544, upload-time = "2024-08-22T12:29:54.512Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/73/59b038d1aafca89f8e9936eaa8ffa6bb6138d00459d13a32ce070be4f280/pytest_order-1.3.0-py3-none-any.whl", hash = "sha256:2cd562a21380345dd8d5774aa5fd38b7849b6ee7397ca5f6999bbe6e89f07f6e", size = 14609, upload-time = "2024-08-22T12:29:53.156Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From 26d2c266b956676a715567fa3d9c2f482dd40c8c Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 22:43:53 +0000 Subject: [PATCH 19/27] Add stats to validate_xr_variable --- tests/conftest.py | 2 - tests/integration/test_create_empty.py | 58 +++++++------------ .../integration/test_segy_roundtrip_teapot.py | 23 ++++---- tests/integration/testing_helpers.py | 13 ++++- 4 files changed, 42 insertions(+), 54 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0ad8a033e..cd5058edb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -78,9 +78,7 @@ def 
empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: return tmp_path_factory.mktemp(r"empty_mdio_dir") -# # Uncomment the function below for local debugging -# # @pytest.fixture(scope="session") # def tmp_path_factory() -> pytest.TempPathFactory: # """Custom tmp_path_factory implementation for local debugging.""" diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index cd130a301..09279fc60 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -92,57 +92,39 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool, is_velo assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} # Validate the dimension coordinate variables - validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, range(1, 346), get_values) - validate_xr_variable(ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, range(1, 189), get_values) - validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, range(0, 3002, 2), get_values) + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, False, range(1, 346), get_values) + validate_xr_variable( + ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values + ) + validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, False, range(0, 3002, 2), get_values) # Validate the non-dimensional coordinate variables (should be empty for empty dataset) - validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) - validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) if has_headers: # Validate the headers (should be empty for empty dataset) # 
Infer the dtype from segy_spec and ignore endianness header_dtype = get_teapot_segy_spec().trace.header.dtype.newbyteorder("native") - validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype, None, None) - validate_xr_variable( - ds, - "segy_file_header", - dims={}, - units=UNITS_NONE, - data_type=np.dtype("U1"), - expected_values=None, - actual_value_generator=None, - ) + validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype) + validate_xr_variable(ds, "segy_file_header", dims={}, units=UNITS_NONE, data_type=np.dtype("U1")) else: assert "headers" not in ds.variables assert "segy_file_header" not in ds.variables # Validate the trace mask (should be all True for empty dataset) - validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_, None, None) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_) trace_mask = ds["trace_mask"].values assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" # Validate the velocity or amplitude data (should be empty) if is_velocity: validate_xr_variable( - ds, - "velocity", - {"inline": 345, "crossline": 188, "time": 1501}, - UNITS_METER_PER_SECOND, - np.float32, - None, - None, + ds, "velocity", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_METER_PER_SECOND, np.float32 ) else: validate_xr_variable( - ds, - "amplitude", - {"inline": 345, "crossline": 188, "time": 1501}, - UNITS_NONE, - np.float32, - None, - None, + ds, "amplitude", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_NONE, np.float32 ) @classmethod @@ -273,17 +255,19 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: """Test showing how to populate empty dataset.""" # Open an empty PostStack3DVelocityTime dataset with SEG-Y 1.0 headers - # NOTES: + # # When this empty dataset 
was created from the 'PostStack3DVelocityTime' template and dimensions, # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and pre-populated + # NOTE: the 'time' units are specified in the template, so they are not None in this case. # * 'cdp_x', 'cdp_y' non-dimensional coordinate variables were created - # * 'amplitude' variable was created (the name of this variable is specified in the template) - # HACK: in this example, we will use this variable to store the velocity data + # NOTE: the 'cdp_x' and 'cdp_y' units are specified in the template, so they are not None in this case. + # * 'velocity' variable was created (the name of this default variable is specified in the template) + # NOTE: the 'velocity' units are specified in the template, so they are not None in this case. # * 'trace_mask' variable was created and pre-populated with 'False' fill values # (all traces are marked as dead) # * 'headers' and 'segy_file_header' variables were created (if the dataset was created with # headers not None). The 'headers' variable structured datatype is defined by the HeaderSpec - # that was used to create the empty MDIO + # that was used to create the empty MDIO # * dataset attribute called 'attributes' was created ds = open_mdio(mdio_with_headers) @@ -320,7 +304,9 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: # 4) Populate dataset's trace mask (optional) ds.trace_mask[:] = ~np.isnan(velocity[:, :, 0]) - # 5) Set coordinate and data variable units (optional) + # 5) If the units were not set in the template or you want to change the coordinate and data variable units + # you can set the unitsV1 attribute for the coordinate and data variables (optional). + # If you are happy with the units specified in the template, you should skip this step. 
ds.time.attrs["unitsV1"] = TimeUnitModel(time=TimeUnitEnum.MILLISECOND).model_dump(mode="json") ds.cdp_x.attrs["unitsV1"] = LengthUnitModel(length=LengthUnitEnum.FOOT).model_dump(mode="json") @@ -376,7 +362,7 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: output_path_mdio = mdio_with_headers.parent / "populated_empty.mdio" to_mdio(ds, output_path=output_path_mdio, mode="w", compute=True) - # 9) Convert the populated emptyMDIO to SEG-Y + # 9) Convert the populated empty MDIO to SEG-Y if "headers" in ds.variables: # Select the SEG-Y standard to use for the conversion custom_segy_spec = get_segy_standard(1.0) diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index 444e0b12e..1ae3b843e 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -243,13 +243,15 @@ def test_grid(self, teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec) -> None: ds = open_mdio(teapot_mdio_tmp) # Validate the dimension coordinate variables - validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, range(1, 346), get_values) - validate_xr_variable(ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, range(1, 189), get_values) - validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, range(0, 3002, 2), get_values) + validate_xr_variable(ds, "inline", {"inline": 345}, UNITS_NONE, np.int32, False, range(1, 346), get_values) + validate_xr_variable( + ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values + ) + validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, False, range(0, 3002, 2), get_values) # Validate the non-dimensional coordinate variables - validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, None) - validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64, None, 
None) + validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) + validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) # Validate the headers # We have a custom set of headers since we used customize_segy_specs() @@ -262,22 +264,17 @@ def test_grid(self, teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec) -> None: {"inline": 345, "crossline": 188}, UNITS_NONE, data_type.newbyteorder("native"), # mdio saves with machine endian, spec could be different endian + False, range(1, 346), get_inline_header_values, ) # Validate the trace mask - validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool, None, None) + validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool) # validate the amplitude data validate_xr_variable( - ds, - "amplitude", - {"inline": 345, "crossline": 188, "time": 1501}, - UNITS_NONE, - np.float32, - None, - None, + ds, "amplitude", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_NONE, np.float32, True ) def test_inline_reads(self, teapot_mdio_tmp: Path) -> None: diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index ebd4146f8..6cc77e8fb 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -24,7 +24,8 @@ def validate_xr_variable( # noqa PLR0913 dims: dict[int], units: AllUnitModel, data_type: np.dtype, - expected_values: range | None, + has_stats: bool = False, + expected_values: range | None = None, actual_value_generator: Callable[[xr.DataArray], np.ndarray] | None = None, ) -> None: """Validate the properties of a variable in an Xarray dataset.""" @@ -47,10 +48,16 @@ def validate_xr_variable( # noqa PLR0913 else: assert data_type == v.dtype - assert v.attrs.get("statsV1", None) is None, "StatsV1 should be empty for empty dataset variables" + stats = v.attrs.get("statsV1", None) + if has_stats: + assert stats is 
not None, "StatsV1 should not be empty for dataset variables with stats" + else: + assert stats is None, "StatsV1 should be empty for dataset variables without stats" if units is not None: - assert v.attrs == {"unitsV1": units.model_dump(mode="json")} + units_v1 = v.attrs.get("unitsV1", None) + assert units_v1 is not None, "UnitsV1 should not be empty for dataset variables with units" + assert units_v1 == units.model_dump(mode="json") else: assert "unitsV1" not in v.attrs, "UnitsV1 should not exist for unit-unaware variables" From bade1232c775b3142eef858434a8101b898967ce Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 22:50:20 +0000 Subject: [PATCH 20/27] fix white space change failure of pre-commit --- tests/integration/test_create_empty.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_create_empty.py index 09279fc60..70a46294a 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_create_empty.py @@ -255,7 +255,7 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: """Test showing how to populate empty dataset.""" # Open an empty PostStack3DVelocityTime dataset with SEG-Y 1.0 headers - # + # # When this empty dataset was created from the 'PostStack3DVelocityTime' template and dimensions, # * 'inline', 'crossline', and 'time' dimension coordinate variables were created and pre-populated # NOTE: the 'time' units are specified in the template, so they are not None in this case. 
From baa3da3f7f3692b43639966f63f39b6348b7ebe2 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 27 Oct 2025 22:59:26 +0000 Subject: [PATCH 21/27] remove tmp_path_factory --- tests/conftest.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index cd5058edb..76bf90f9d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -76,25 +76,3 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path: def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path: """Make a temp file for empty MDIO testing.""" return tmp_path_factory.mktemp(r"empty_mdio_dir") - - -# Uncomment the function below for local debugging -# @pytest.fixture(scope="session") -# def tmp_path_factory() -> pytest.TempPathFactory: -# """Custom tmp_path_factory implementation for local debugging.""" -# from pathlib import Path # noqa: PLC0415 - -# class DebugTempPathFactory: -# def __init__(self) -> None: -# self._retention_policy = "all" - -# def mktemp(self, basename: str, numbered: bool = True) -> Path: -# _ = numbered -# path = self.getbasetemp() / basename -# path.mkdir(parents=True, exist_ok=True) -# return path - -# def getbasetemp(self) -> Path: -# return Path("tmp") - -# return DebugTempPathFactory() From d9741251504596cc3b390351a366f930a74f9c54 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 28 Oct 2025 17:18:44 +0000 Subject: [PATCH 22/27] Ensure test order: create_empty after teapod_roundtrip --- pyproject.toml | 1 - .../integration/test_segy_roundtrip_teapot.py | 28 +-- ...create_empty.py => test_z_create_empty.py} | 183 ++++++++++-------- tests/integration/testing_helpers.py | 28 +++ uv.lock | 14 -- 5 files changed, 129 insertions(+), 125 deletions(-) rename tests/integration/{test_create_empty.py => test_z_create_empty.py} (89%) diff --git a/pyproject.toml b/pyproject.toml index e7ceec4d0..518eb00d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ dev = [ 
"pre-commit-hooks>=6.0.0", "pytest>=8.4.2", "pytest-dependency>=0.6.0", - "pytest-order>=1.3.0", "typeguard>=4.4.4", "xdoctest[colors]>=1.3.0", "Pygments>=2.19.2" diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index 1ae3b843e..742b97419 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -9,20 +9,17 @@ import numpy as np import numpy.testing as npt import pytest -from segy.schema import HeaderField -from segy.schema import ScalarType -from segy.standards import get_segy_standard +from tests.integration.testing_helpers import UNITS_METER +from tests.integration.testing_helpers import UNITS_NONE +from tests.integration.testing_helpers import UNITS_SECOND from tests.integration.testing_helpers import get_inline_header_values +from tests.integration.testing_helpers import get_teapot_segy_spec from tests.integration.testing_helpers import get_values from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio import mdio_to_segy from mdio.api.io import open_mdio -from mdio.builder.schemas.v1.units import LengthUnitEnum -from mdio.builder.schemas.v1.units import LengthUnitModel -from mdio.builder.schemas.v1.units import TimeUnitEnum -from mdio.builder.schemas.v1.units import TimeUnitModel from mdio.builder.template_registry import TemplateRegistry from mdio.converters.segy import segy_to_mdio from mdio.segy.file import SegyFileWrapper @@ -50,17 +47,6 @@ def teapot_segy_spec() -> SegySpec: return get_teapot_segy_spec() -def get_teapot_segy_spec() -> SegySpec: - """Return the customized SEG-Y specification for the teapot dome dataset.""" - teapot_fields = [ - HeaderField(name="inline", byte=17, format=ScalarType.INT32), - HeaderField(name="crossline", byte=13, format=ScalarType.INT32), - HeaderField(name="cdp_x", byte=81, format=ScalarType.INT32), - HeaderField(name="cdp_y", byte=85, 
format=ScalarType.INT32), - ] - return get_segy_standard(1.0).customize(trace_header_fields=teapot_fields) - - def text_header_teapot_dome() -> str: """Return the teapot dome expected text header.""" header_rows = [ @@ -157,12 +143,6 @@ def raw_binary_header_teapot_dome() -> str: ) -UNITS_NONE = None -UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) -UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) - - -@pytest.mark.order(1) class TestTeapotRoundtrip: """Tests for Teapot Dome data ingestion and export.""" diff --git a/tests/integration/test_create_empty.py b/tests/integration/test_z_create_empty.py similarity index 89% rename from tests/integration/test_create_empty.py rename to tests/integration/test_z_create_empty.py index 70a46294a..f60317190 100644 --- a/tests/integration/test_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -1,4 +1,11 @@ -"""Test for create_empty_mdio function.""" +"""Test for create_empty_mdio function. + +This set of tests has to run after the segy_roundtrip_teapot tests have run because +the teapot dataset is used as the input for the create_empty_like test. + +NOTE: The only reliable way to ensure the test order (including the case when the +test are run in parallel) is to use the alphabetical order of the test names. 
+""" from __future__ import annotations @@ -22,9 +29,11 @@ from xarray import Dataset as xr_Dataset -from tests.integration.test_segy_roundtrip_teapot import get_teapot_segy_spec -from tests.integration.testing_helpers import get_values +from tests.integration.testing_helpers import UNITS_NONE, UNITS_SECOND, UNITS_METER, UNITS_FOOT, UNITS_METER_PER_SECOND, UNITS_FEET_PER_SECOND +from tests.integration.testing_helpers import get_teapot_segy_spec from tests.integration.testing_helpers import validate_xr_variable +from tests.integration.testing_helpers import get_values + from mdio import __version__ from mdio.api.create import create_empty @@ -37,14 +46,6 @@ from mdio.converters.mdio import mdio_to_segy from mdio.core import Dimension -UNITS_NONE = None -UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) -UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) -UNITS_METER_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND) -UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) -UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND) - - class PostStack3DVelocityTemplate(Seismic3DPostStackTemplate): """Custom template that uses 'velocity' as the default variable name instead of 'amplitude'.""" @@ -80,13 +81,72 @@ def _name(self) -> str: domain_suffix = self._data_domain.capitalize() return f"PostStack3DVelocity{domain_suffix}" - -@pytest.mark.order(1000) class TestCreateEmptyMdio: """Tests for create_empty_mdio function.""" @classmethod - def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool, is_velocity: bool) -> None: + def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: + """Create a temporary empty MDIO file for testing.""" + # Create the grid with the specified dimensions + dims = [ + Dimension(name="inline", coords=range(1, 346, 1)), # 100-300 with step 1 + Dimension(name="crossline", coords=range(1, 189, 1)), # 1000-1600 with step 2 + 
Dimension(name="time", coords=range(0, 3002, 2)), # 0-3 seconds 4ms sample rate + ] + + # If later on, we want to export to SEG-Y, we need to provide the trace header spec. + # The HeaderSpec can be either standard or customized. + headers = get_teapot_segy_spec().trace.header if create_headers else None + # Create an empty MDIO v1 metric post-stack 3D time velocity dataset + create_empty( + mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), + dimensions=dims, + output_path=output_path, + headers=headers, + overwrite=overwrite, + ) + + @classmethod + def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: + """Validate the dataset metadata.""" + if is_velocity: + assert ds.name == "PostStack3DVelocityTime" + else: + assert ds.name == "PostStack3DTime" + + # Check basic metadata attributes + expected_attrs = { + "apiVersion": __version__, + "name": ds.name, + } + actual_attrs_json = ds.attrs + + # Compare one by one due to ever changing createdOn + for key, value in expected_attrs.items(): + assert key in actual_attrs_json + if key == "createdOn": + assert actual_attrs_json[key] is not None + else: + assert actual_attrs_json[key] == value + + # Check that createdOn exists + assert "createdOn" in actual_attrs_json + assert actual_attrs_json["createdOn"] is not None + + # Validate template attributes + attributes = ds.attrs["attributes"] + assert attributes is not None + assert len(attributes) == 3 + # Validate all attributes provided by the abstract template + if is_velocity: + assert attributes["defaultVariableName"] == "velocity" + else: + assert attributes["defaultVariableName"] == "amplitude" + assert attributes["surveyType"] == "3D" + assert attributes["gatherType"] == "stacked" + + @classmethod + def validate_teapod_dataset_variables(cls, ds: xr_Dataset, header_dtype: np.dtype | None, is_velocity: bool) -> None: """Validate an empty MDIO dataset structure and content.""" # Check that the dataset has the 
expected shape assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} @@ -102,10 +162,10 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool, is_velo validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) validate_xr_variable(ds, "cdp_y", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) - if has_headers: + if header_dtype is not None: # Validate the headers (should be empty for empty dataset) # Infer the dtype from segy_spec and ignore endianness - header_dtype = get_teapot_segy_spec().trace.header.dtype.newbyteorder("native") + header_dtype = header_dtype.newbyteorder("native") validate_xr_variable(ds, "headers", {"inline": 345, "crossline": 188}, UNITS_NONE, header_dtype) validate_xr_variable(ds, "segy_file_header", dims={}, units=UNITS_NONE, data_type=np.dtype("U1")) else: @@ -127,28 +187,6 @@ def _validate_empty_mdio_dataset(cls, ds: xr_Dataset, has_headers: bool, is_velo ds, "amplitude", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_NONE, np.float32 ) - @classmethod - def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: - """Create a temporary empty MDIO file for testing.""" - # Create the grid with the specified dimensions - dims = [ - Dimension(name="inline", coords=range(1, 346, 1)), # 100-300 with step 1 - Dimension(name="crossline", coords=range(1, 189, 1)), # 1000-1600 with step 2 - Dimension(name="time", coords=range(0, 3002, 2)), # 0-3 seconds 4ms sample rate - ] - - # If later on, we want to export to SEG-Y, we need to provide the trace header spec. - # The HeaderSpec can be either standard or customized. 
- headers = get_teapot_segy_spec().trace.header if create_headers else None - # Create an empty MDIO v1 metric post-stack 3D time velocity dataset - create_empty( - mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), - dimensions=dims, - output_path=output_path, - headers=headers, - overwrite=overwrite, - ) - @pytest.fixture(scope="class") def mdio_with_headers(self, empty_mdio_dir: Path) -> Path: """Create a temporary empty MDIO file for testing. @@ -171,56 +209,22 @@ def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: self._create_empty_mdio(create_headers=False, output_path=empty_mdio) return empty_mdio - def validate_dataset_metadata(self, ds: xr_Dataset, is_velocity: bool) -> None: - """Validate the dataset metadata.""" - if is_velocity: - assert ds.name == "PostStack3DVelocityTime" - else: - assert ds.name == "PostStack3DTime" - - # Check basic metadata attributes - expected_attrs = { - "apiVersion": __version__, - "name": ds.name, - } - actual_attrs_json = ds.attrs - - # Compare one by one due to ever changing createdOn - for key, value in expected_attrs.items(): - assert key in actual_attrs_json - if key == "createdOn": - assert actual_attrs_json[key] is not None - else: - assert actual_attrs_json[key] == value - - # Check that createdOn exists - assert "createdOn" in actual_attrs_json - assert actual_attrs_json["createdOn"] is not None - - # Validate template attributes - attributes = ds.attrs["attributes"] - assert attributes is not None - assert len(attributes) == 3 - # Validate all attributes provided by the abstract template - if is_velocity: - assert attributes["defaultVariableName"] == "velocity" - else: - assert attributes["defaultVariableName"] == "amplitude" - assert attributes["surveyType"] == "3D" - assert attributes["gatherType"] == "stacked" def test_dataset_metadata(self, mdio_with_headers: Path) -> None: """Test dataset metadata for empty MDIO file.""" ds = open_mdio(mdio_with_headers) - 
self.validate_dataset_metadata(ds, is_velocity=True) + self.validate_teapod_dataset_metadata(ds, is_velocity=True) + def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: """Test grid validation for empty MDIO file.""" + ds = open_mdio(mdio_with_headers) - self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=True) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) ds = open_mdio(mdio_no_headers) - self._validate_empty_mdio_dataset(ds, has_headers=False, is_velocity=True) + self.validate_teapod_dataset_variables(ds, header_dtype=None, is_velocity=True) def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: """Test overwrite parameter behavior in create_empty_mdio.""" @@ -246,7 +250,9 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) - self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=True) + self.validate_teapod_dataset_metadata(ds, is_velocity=True) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" @@ -375,16 +381,21 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: output_path=mdio_with_headers.parent / "populated_empty.sgy", ) - @pytest.mark.order(1001) - @pytest.mark.dependency - def test_create_empty_like(self, teapot_mdio_tmp: Path, mdio_with_headers: Path) -> None: - """Create an empty MDIO file like the input file.""" - _ = mdio_with_headers + + def test_create_empty_like(self, teapot_mdio_tmp: Path) -> None: + """Create an empty MDIO file like the input MDIO file. 
+ + This test has to run after the segy_roundtrip_teapot tests have run because + its uses 'teapot_mdio_tmp' created by the segy_roundtrip_teapot tests as the input. + """ + ds = create_empty_like( input_path=teapot_mdio_tmp, + # TODO: write to a file output_path=None, # We don't want to write to disk for now keep_coordinates=True, overwrite=True, ) - self.validate_dataset_metadata(ds, is_velocity=False) - self._validate_empty_mdio_dataset(ds, has_headers=True, is_velocity=False) + self.validate_teapod_dataset_metadata(ds, is_velocity=False) + header_dtype = get_teapot_segy_spec().trace.header.dtype + self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=False) diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index 6cc77e8fb..f33e4747a 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -4,8 +4,36 @@ import numpy as np import xarray as xr +from segy.schema import HeaderField +from segy.schema import ScalarType +from segy.schema.segy import SegySpec +from segy.standards import get_segy_standard from mdio.builder.schemas.v1.units import AllUnitModel +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.schemas.v1.units import SpeedUnitEnum +from mdio.builder.schemas.v1.units import SpeedUnitModel +from mdio.builder.schemas.v1.units import TimeUnitEnum +from mdio.builder.schemas.v1.units import TimeUnitModel + +UNITS_NONE = None +UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) +UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) +UNITS_METER_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND) +UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) +UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND) + + +def get_teapot_segy_spec() -> SegySpec: + """Return the customized SEG-Y specification for the teapot dome dataset.""" + 
teapot_fields = [ + HeaderField(name="inline", byte=17, format=ScalarType.INT32), + HeaderField(name="crossline", byte=13, format=ScalarType.INT32), + HeaderField(name="cdp_x", byte=81, format=ScalarType.INT32), + HeaderField(name="cdp_y", byte=85, format=ScalarType.INT32), + ] + return get_segy_standard(1.0).customize(trace_header_fields=teapot_fields) def get_values(arr: xr.DataArray) -> np.ndarray: diff --git a/uv.lock b/uv.lock index 6e214b48d..894419f5c 100644 --- a/uv.lock +++ b/uv.lock @@ -1929,7 +1929,6 @@ dev = [ { name = "pygments" }, { name = "pytest" }, { name = "pytest-dependency" }, - { name = "pytest-order" }, { name = "ruff" }, { name = "typeguard" }, { name = "xdoctest", extra = ["colors"] }, @@ -1982,7 +1981,6 @@ dev = [ { name = "pygments", specifier = ">=2.19.2" }, { name = "pytest", specifier = ">=8.4.2" }, { name = "pytest-dependency", specifier = ">=0.6.0" }, - { name = "pytest-order", specifier = ">=1.3.0" }, { name = "ruff", specifier = ">=0.14.0" }, { name = "typeguard", specifier = ">=4.4.4" }, { name = "xdoctest", extras = ["colors"], specifier = ">=1.3.0" }, @@ -2823,18 +2821,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/7e/3b/317cc04e77d707d338540ca67b619df8f247f3f4c9f40e67bf5ea503ad94/pytest-dependency-0.6.0.tar.gz", hash = "sha256:934b0e6a39d95995062c193f7eaeed8a8ffa06ff1bcef4b62b0dc74a708bacc1", size = 19499, upload-time = "2023-12-31T20:38:54.991Z" } -[[package]] -name = "pytest-order" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pytest" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1d/66/02ae17461b14a52ce5a29ae2900156b9110d1de34721ccc16ccd79419876/pytest_order-1.3.0.tar.gz", hash = "sha256:51608fec3d3ee9c0adaea94daa124a5c4c1d2bb99b00269f098f414307f23dde", size = 47544, upload-time = "2024-08-22T12:29:54.512Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/1b/73/59b038d1aafca89f8e9936eaa8ffa6bb6138d00459d13a32ce070be4f280/pytest_order-1.3.0-py3-none-any.whl", hash = "sha256:2cd562a21380345dd8d5774aa5fd38b7849b6ee7397ca5f6999bbe6e89f07f6e", size = 14609, upload-time = "2024-08-22T12:29:53.156Z" }, -] - [[package]] name = "python-dateutil" version = "2.9.0.post0" From 2d6e250c282fe0d8c176cebd967c7e0ef3871c55 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 28 Oct 2025 19:12:06 +0000 Subject: [PATCH 23/27] Fir createdOn in create_empty_like --- src/mdio/api/create.py | 2 +- tests/integration/test_z_create_empty.py | 30 +++++++++++++----------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py index 1e0ee67a0..1282dbf60 100644 --- a/src/mdio/api/create.py +++ b/src/mdio/api/create.py @@ -126,7 +126,7 @@ def create_empty_like( # noqa PLR0913 # Keep the name (which is the same as the used template name) and the original API version # ds_output.attrs["name"] # ds_output.attrs["apiVersion"] - ds_output.attrs["createdOn"] = datetime.now(UTC) + ds_output.attrs["createdOn"] = str(datetime.now(UTC)) # Coordinates if not keep_coordinates: diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py index f60317190..20b3c7665 100644 --- a/tests/integration/test_z_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -3,7 +3,7 @@ This set of tests has to run after the segy_roundtrip_teapot tests have run because the teapot dataset is used as the input for the create_empty_like test. -NOTE: The only reliable way to ensure the test order (including the case when the +NOTE: The only reliable way to ensure the test order (including the case when the test are run in parallel) is to use the alphabetical order of the test names. 
""" @@ -29,11 +29,15 @@ from xarray import Dataset as xr_Dataset -from tests.integration.testing_helpers import UNITS_NONE, UNITS_SECOND, UNITS_METER, UNITS_FOOT, UNITS_METER_PER_SECOND, UNITS_FEET_PER_SECOND +from tests.integration.testing_helpers import UNITS_FEET_PER_SECOND +from tests.integration.testing_helpers import UNITS_FOOT +from tests.integration.testing_helpers import UNITS_METER +from tests.integration.testing_helpers import UNITS_METER_PER_SECOND +from tests.integration.testing_helpers import UNITS_NONE +from tests.integration.testing_helpers import UNITS_SECOND from tests.integration.testing_helpers import get_teapot_segy_spec -from tests.integration.testing_helpers import validate_xr_variable from tests.integration.testing_helpers import get_values - +from tests.integration.testing_helpers import validate_xr_variable from mdio import __version__ from mdio.api.create import create_empty @@ -46,6 +50,7 @@ from mdio.converters.mdio import mdio_to_segy from mdio.core import Dimension + class PostStack3DVelocityTemplate(Seismic3DPostStackTemplate): """Custom template that uses 'velocity' as the default variable name instead of 'amplitude'.""" @@ -81,6 +86,7 @@ def _name(self) -> str: domain_suffix = self._data_domain.capitalize() return f"PostStack3DVelocity{domain_suffix}" + class TestCreateEmptyMdio: """Tests for create_empty_mdio function.""" @@ -146,7 +152,9 @@ def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> assert attributes["gatherType"] == "stacked" @classmethod - def validate_teapod_dataset_variables(cls, ds: xr_Dataset, header_dtype: np.dtype | None, is_velocity: bool) -> None: + def validate_teapod_dataset_variables( + cls, ds: xr_Dataset, header_dtype: np.dtype | None, is_velocity: bool + ) -> None: """Validate an empty MDIO dataset structure and content.""" # Check that the dataset has the expected shape assert ds.sizes == {"inline": 345, "crossline": 188, "time": 1501} @@ -209,16 +217,13 @@ def 
mdio_no_headers(self, empty_mdio_dir: Path) -> Path: self._create_empty_mdio(create_headers=False, output_path=empty_mdio) return empty_mdio - def test_dataset_metadata(self, mdio_with_headers: Path) -> None: """Test dataset metadata for empty MDIO file.""" ds = open_mdio(mdio_with_headers) self.validate_teapod_dataset_metadata(ds, is_velocity=True) - def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: """Test grid validation for empty MDIO file.""" - ds = open_mdio(mdio_with_headers) header_dtype = get_teapot_segy_spec().trace.header.dtype self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) @@ -381,18 +386,15 @@ def test_populate_empty_dataset(self, mdio_with_headers: Path) -> None: output_path=mdio_with_headers.parent / "populated_empty.sgy", ) - - def test_create_empty_like(self, teapot_mdio_tmp: Path) -> None: + def test_create_empty_like(self, teapot_mdio_tmp: Path, empty_mdio_dir: Path) -> None: """Create an empty MDIO file like the input MDIO file. - + This test has to run after the segy_roundtrip_teapot tests have run because its uses 'teapot_mdio_tmp' created by the segy_roundtrip_teapot tests as the input. 
""" - ds = create_empty_like( input_path=teapot_mdio_tmp, - # TODO: write to a file - output_path=None, # We don't want to write to disk for now + output_path=empty_mdio_dir / "create_empty_like.mdio", keep_coordinates=True, overwrite=True, ) From 163739697445e901e2895928feed8607598d8283 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 28 Oct 2025 19:25:31 +0000 Subject: [PATCH 24/27] Return xr_dataset from create_empty --- src/mdio/api/create.py | 19 ++++++++++++------- tests/integration/test_z_create_empty.py | 14 ++++++++++---- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py index 1282dbf60..532f56870 100644 --- a/src/mdio/api/create.py +++ b/src/mdio/api/create.py @@ -30,10 +30,10 @@ def create_empty( # noqa PLR0913 mdio_template: AbstractDatasetTemplate | str, dimensions: list[Dimension], - output_path: UPath | Path | str, + output_path: UPath | Path | str | None, headers: HeaderSpec | None = None, overwrite: bool = False, -) -> None: +) -> xr_Dataset: """A function that creates an empty MDIO v1 file with known dimensions. 
Args: @@ -73,19 +73,24 @@ def create_empty( # noqa PLR0913 # Populate coordinates using the grid # For empty datasets, we only populate dimension coordinates drop_vars_delayed = [] - dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) + xr_dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed) if headers: # Since the headers were provided, the user wants to export to SEG-Y # Add a dummy segy_file_header variable used to export to SEG-Y - dataset["segy_file_header"] = ((), "") + xr_dataset["segy_file_header"] = ((), "") # Create the Zarr store with the correct structure but with empty arrays - to_mdio(dataset, output_path=output_path, mode="w", compute=False) + if output_path is not None: + to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False) # Write the dimension coordinates and trace mask - meta_ds = dataset[drop_vars_delayed + ["trace_mask"]] - to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) + xr_dataset = xr_dataset[drop_vars_delayed + ["trace_mask"]] + + if output_path is not None: + to_mdio(xr_dataset, output_path=output_path, mode="r+", compute=True) + + return xr_dataset def create_empty_like( # noqa PLR0913 diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py index 20b3c7665..2db834ea2 100644 --- a/tests/integration/test_z_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -104,13 +104,14 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: # The HeaderSpec can be either standard or customized. 
headers = get_teapot_segy_spec().trace.header if create_headers else None # Create an empty MDIO v1 metric post-stack 3D time velocity dataset - create_empty( + xr_dataset = create_empty( mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), dimensions=dims, output_path=output_path, headers=headers, overwrite=overwrite, ) + return xr_dataset @classmethod def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: @@ -203,7 +204,8 @@ def mdio_with_headers(self, empty_mdio_dir: Path) -> Path: and shared across all test methods in the class. """ empty_mdio: Path = empty_mdio_dir / "mdio_with_headers.mdio" - self._create_empty_mdio(create_headers=True, output_path=empty_mdio) + xr_dataset = self._create_empty_mdio(create_headers=True, output_path=empty_mdio) + assert xr_dataset is not None return empty_mdio @pytest.fixture(scope="class") @@ -214,7 +216,8 @@ def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: and shared across all test methods in the class. 
""" empty_mdio: Path = empty_mdio_dir / "mdio_no_headers.mdio" - self._create_empty_mdio(create_headers=False, output_path=empty_mdio) + xr_dataset = self._create_empty_mdio(create_headers=False, output_path=empty_mdio) + assert xr_dataset is not None return empty_mdio def test_dataset_metadata(self, mdio_with_headers: Path) -> None: @@ -251,7 +254,8 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=False) # Third call: Create MDIO with overwrite=True - should succeed and overwrite garbage - self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=True) + xr_dataset = self._create_empty_mdio(create_headers=True, output_path=empty_mdio, overwrite=True) + assert xr_dataset is not None # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) @@ -398,6 +402,8 @@ def test_create_empty_like(self, teapot_mdio_tmp: Path, empty_mdio_dir: Path) -> keep_coordinates=True, overwrite=True, ) + assert ds is not None + self.validate_teapod_dataset_metadata(ds, is_velocity=False) header_dtype = get_teapot_segy_spec().trace.header.dtype self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=False) From cc1f6d7efd1113cc0874569f9abadc1776d42788 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 28 Oct 2025 19:27:29 +0000 Subject: [PATCH 25/27] Fix pre-commit --- src/mdio/api/create.py | 5 ++++- tests/integration/test_z_create_empty.py | 5 ++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py index 532f56870..fc0a09f6c 100644 --- a/src/mdio/api/create.py +++ b/src/mdio/api/create.py @@ -50,6 +50,9 @@ def create_empty( # noqa PLR0913 headers: SEG-Y v1.0 trace headers. Defaults to None. overwrite: Whether to overwrite the output file if it already exists. Defaults to False. + Returns: + The output MDIO dataset. 
+ Raises: FileExistsError: If the output location already exists and overwrite is False. """ @@ -86,7 +89,7 @@ def create_empty( # noqa PLR0913 # Write the dimension coordinates and trace mask xr_dataset = xr_dataset[drop_vars_delayed + ["trace_mask"]] - + if output_path is not None: to_mdio(xr_dataset, output_path=output_path, mode="r+", compute=True) diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py index 2db834ea2..ddef6a70d 100644 --- a/tests/integration/test_z_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -91,7 +91,7 @@ class TestCreateEmptyMdio: """Tests for create_empty_mdio function.""" @classmethod - def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> None: + def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: bool = True) -> xr_Dataset: """Create a temporary empty MDIO file for testing.""" # Create the grid with the specified dimensions dims = [ @@ -104,14 +104,13 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: # The HeaderSpec can be either standard or customized. 
headers = get_teapot_segy_spec().trace.header if create_headers else None # Create an empty MDIO v1 metric post-stack 3D time velocity dataset - xr_dataset = create_empty( + return create_empty( mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), dimensions=dims, output_path=output_path, headers=headers, overwrite=overwrite, ) - return xr_dataset @classmethod def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: From 1da04f98f54089cdcabe936f9d1b4b4bb0fedeb1 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 29 Oct 2025 14:29:05 +0000 Subject: [PATCH 26/27] Adjust to the breaking changes in upstream/main --- tests/integration/test_z_create_empty.py | 6 +++--- tests/integration/testing_helpers.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py index ddef6a70d..3006d7d62 100644 --- a/tests/integration/test_z_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -32,7 +32,7 @@ from tests.integration.testing_helpers import UNITS_FEET_PER_SECOND from tests.integration.testing_helpers import UNITS_FOOT from tests.integration.testing_helpers import UNITS_METER -from tests.integration.testing_helpers import UNITS_METER_PER_SECOND +from tests.integration.testing_helpers import UNITS_METERS_PER_SECOND from tests.integration.testing_helpers import UNITS_NONE from tests.integration.testing_helpers import UNITS_SECOND from tests.integration.testing_helpers import get_teapot_segy_spec @@ -67,7 +67,7 @@ def __init__(self, data_domain: str, is_metric: bool) -> None: "time": UNITS_SECOND, "cdp_x": UNITS_METER, "cdp_y": UNITS_METER, - "velocity": UNITS_METER_PER_SECOND, + "velocity": UNITS_METERS_PER_SECOND, } ) else: @@ -188,7 +188,7 @@ def validate_teapod_dataset_variables( # Validate the velocity or amplitude data (should be empty) if is_velocity: validate_xr_variable( - ds, "velocity", {"inline": 345, 
"crossline": 188, "time": 1501}, UNITS_METER_PER_SECOND, np.float32 + ds, "velocity", {"inline": 345, "crossline": 188, "time": 1501}, UNITS_METERS_PER_SECOND, np.float32 ) else: validate_xr_variable( diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index f33e4747a..18c262887 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -20,7 +20,7 @@ UNITS_NONE = None UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) -UNITS_METER_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND) +UNITS_METERS_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METERS_PER_SECOND) UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND) From 62e9be7bb2f97fa4093a9a30bad7e9b953da7ec1 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 29 Oct 2025 19:20:54 +0000 Subject: [PATCH 27/27] Address some of PR review comments --- src/mdio/api/create.py | 19 ++---- .../integration/test_segy_roundtrip_teapot.py | 8 ++- tests/integration/test_z_create_empty.py | 66 ++++++++----------- tests/integration/testing_helpers.py | 1 + 4 files changed, 40 insertions(+), 54 deletions(-) diff --git a/src/mdio/api/create.py b/src/mdio/api/create.py index fc0a09f6c..c17a01091 100644 --- a/src/mdio/api/create.py +++ b/src/mdio/api/create.py @@ -38,13 +38,6 @@ def create_empty( # noqa PLR0913 Args: mdio_template: The MDIO template or template name to use to define the dataset structure. - NOTE: If you want to have a unit-aware MDIO model, you need to add the units - to the template before calling this function. 
For example: - 'unit_aware_template = TemplateRegistry().get("PostStack3DTime")' - 'unit_aware_template.add_units({"time": UNITS_SECOND})' - 'unit_aware_template.add_units({"cdp_x": UNITS_METER})' - 'unit_aware_template.add_units({"cdp_y": UNITS_METER})' - 'create_empty(unit_aware_template, dimensions, output_path, headers, overwrite)' dimensions: The dimensions of the MDIO file. output_path: The universal path for the output MDIO v1 file. headers: SEG-Y v1.0 trace headers. Defaults to None. @@ -139,7 +132,7 @@ def create_empty_like( # noqa PLR0913 # Coordinates if not keep_coordinates: for coord_name in ds_output.coords: - ds_output[coord_name].attrs["unitsV1"] = None + ds_output[coord_name].attrs.pop("unitsV1", None) # MDIO attributes attr = ds_output.attrs["attributes"] @@ -157,17 +150,17 @@ def create_empty_like( # noqa PLR0913 # Data variable var_name = attr["defaultVariableName"] var = ds_output[var_name] - var.attrs["statsV1"] = None + var.attrs.pop("statsV1", None) if not keep_coordinates: - var.attrs["unitsV1"] = None + var.attrs.pop("unitsV1", None) # SEG-Y file header if "segy_file_header" in ds_output.variables: segy_file_header = ds_output["segy_file_header"] if segy_file_header is not None: - segy_file_header.attrs["textHeader"] = None - segy_file_header.attrs["binaryHeader"] = None - segy_file_header.attrs["rawBinaryHeader"] = None + segy_file_header.attrs.pop("textHeader", None) + segy_file_header.attrs.pop("binaryHeader", None) + segy_file_header.attrs.pop("rawBinaryHeader", None) if output_path is not None: to_mdio(ds_output, output_path=output_path, mode="w", compute=True) diff --git a/tests/integration/test_segy_roundtrip_teapot.py b/tests/integration/test_segy_roundtrip_teapot.py index 742b97419..86e401092 100644 --- a/tests/integration/test_segy_roundtrip_teapot.py +++ b/tests/integration/test_segy_roundtrip_teapot.py @@ -10,8 +10,8 @@ import numpy.testing as npt import pytest from tests.integration.testing_helpers import UNITS_METER +from 
tests.integration.testing_helpers import UNITS_MILLISECOND from tests.integration.testing_helpers import UNITS_NONE -from tests.integration.testing_helpers import UNITS_SECOND from tests.integration.testing_helpers import get_inline_header_values from tests.integration.testing_helpers import get_teapot_segy_spec from tests.integration.testing_helpers import get_values @@ -159,7 +159,7 @@ def test_teapot_import( NOTE: This test must be executed before the 'TestReader' and 'TestExport' tests. """ unit_aware_template = TemplateRegistry().get("PostStack3DTime") - unit_aware_template.add_units({"time": UNITS_SECOND}) + unit_aware_template.add_units({"time": UNITS_MILLISECOND}) unit_aware_template.add_units({"cdp_x": UNITS_METER}) unit_aware_template.add_units({"cdp_y": UNITS_METER}) segy_to_mdio( @@ -227,7 +227,9 @@ def test_grid(self, teapot_mdio_tmp: Path, teapot_segy_spec: SegySpec) -> None: validate_xr_variable( ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values ) - validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, False, range(0, 3002, 2), get_values) + validate_xr_variable( + ds, "time", {"time": 1501}, UNITS_MILLISECOND, np.int32, False, range(0, 3002, 2), get_values + ) # Validate the non-dimensional coordinate variables validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) diff --git a/tests/integration/test_z_create_empty.py b/tests/integration/test_z_create_empty.py index 3006d7d62..517b5005b 100644 --- a/tests/integration/test_z_create_empty.py +++ b/tests/integration/test_z_create_empty.py @@ -29,10 +29,9 @@ from xarray import Dataset as xr_Dataset -from tests.integration.testing_helpers import UNITS_FEET_PER_SECOND -from tests.integration.testing_helpers import UNITS_FOOT from tests.integration.testing_helpers import UNITS_METER from tests.integration.testing_helpers import UNITS_METERS_PER_SECOND +from tests.integration.testing_helpers import 
UNITS_MILLISECOND from tests.integration.testing_helpers import UNITS_NONE from tests.integration.testing_helpers import UNITS_SECOND from tests.integration.testing_helpers import get_teapot_segy_spec @@ -51,7 +50,7 @@ from mdio.core import Dimension -class PostStack3DVelocityTemplate(Seismic3DPostStackTemplate): +class PostStack3DVelocityMetricTemplate(Seismic3DPostStackTemplate): """Custom template that uses 'velocity' as the default variable name instead of 'amplitude'.""" @property @@ -59,26 +58,16 @@ def _default_variable_name(self) -> str: """Override the default variable name.""" return "velocity" - def __init__(self, data_domain: str, is_metric: bool) -> None: + def __init__(self, data_domain: str) -> None: super().__init__(data_domain) - if is_metric: - self._units.update( - { - "time": UNITS_SECOND, - "cdp_x": UNITS_METER, - "cdp_y": UNITS_METER, - "velocity": UNITS_METERS_PER_SECOND, - } - ) - else: - self._units.update( - { - "time": UNITS_SECOND, - "cdp_x": UNITS_FOOT, - "cdp_y": UNITS_FOOT, - "velocity": UNITS_FEET_PER_SECOND, - } - ) + self._units.update( + { + "time": UNITS_MILLISECOND, + "cdp_x": UNITS_METER, + "cdp_y": UNITS_METER, + "velocity": UNITS_METERS_PER_SECOND, + } + ) @property def _name(self) -> str: @@ -95,9 +84,9 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: """Create a temporary empty MDIO file for testing.""" # Create the grid with the specified dimensions dims = [ - Dimension(name="inline", coords=range(1, 346, 1)), # 100-300 with step 1 - Dimension(name="crossline", coords=range(1, 189, 1)), # 1000-1600 with step 2 - Dimension(name="time", coords=range(0, 3002, 2)), # 0-3 seconds 4ms sample rate + Dimension(name="inline", coords=range(1, 346, 1)), + Dimension(name="crossline", coords=range(1, 189, 1)), + Dimension(name="time", coords=range(0, 3002, 2)), ] # If later on, we want to export to SEG-Y, we need to provide the trace header spec. 
@@ -105,7 +94,7 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: headers = get_teapot_segy_spec().trace.header if create_headers else None # Create an empty MDIO v1 metric post-stack 3D time velocity dataset return create_empty( - mdio_template=PostStack3DVelocityTemplate(data_domain="time", is_metric=True), + mdio_template=PostStack3DVelocityMetricTemplate(data_domain="time"), dimensions=dims, output_path=output_path, headers=headers, @@ -113,7 +102,7 @@ def _create_empty_mdio(cls, create_headers: bool, output_path: Path, overwrite: ) @classmethod - def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: + def validate_teapot_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> None: """Validate the dataset metadata.""" if is_velocity: assert ds.name == "PostStack3DVelocityTime" @@ -137,7 +126,6 @@ def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> # Check that createdOn exists assert "createdOn" in actual_attrs_json - assert actual_attrs_json["createdOn"] is not None # Validate template attributes attributes = ds.attrs["attributes"] @@ -152,7 +140,7 @@ def validate_teapod_dataset_metadata(cls, ds: xr_Dataset, is_velocity: bool) -> assert attributes["gatherType"] == "stacked" @classmethod - def validate_teapod_dataset_variables( + def validate_teapot_dataset_variables( cls, ds: xr_Dataset, header_dtype: np.dtype | None, is_velocity: bool ) -> None: """Validate an empty MDIO dataset structure and content.""" @@ -164,7 +152,9 @@ def validate_teapod_dataset_variables( validate_xr_variable( ds, "crossline", {"crossline": 188}, UNITS_NONE, np.int32, False, range(1, 189), get_values ) - validate_xr_variable(ds, "time", {"time": 1501}, UNITS_SECOND, np.int32, False, range(0, 3002, 2), get_values) + validate_xr_variable( + ds, "time", {"time": 1501}, UNITS_MILLISECOND, np.int32, False, range(0, 3002, 2), get_values + ) # Validate the non-dimensional coordinate variables 
(should be empty for empty dataset) validate_xr_variable(ds, "cdp_x", {"inline": 345, "crossline": 188}, UNITS_METER, np.float64) @@ -183,7 +173,7 @@ def validate_teapod_dataset_variables( # Validate the trace mask (should be all True for empty dataset) validate_xr_variable(ds, "trace_mask", {"inline": 345, "crossline": 188}, UNITS_NONE, np.bool_) trace_mask = ds["trace_mask"].values - assert not np.any(trace_mask), "All traces should be marked as dead in empty dataset" + assert not np.any(trace_mask), "Expected all `False` values in `trace_mask` but found `True`." # Validate the velocity or amplitude data (should be empty) if is_velocity: @@ -222,16 +212,16 @@ def mdio_no_headers(self, empty_mdio_dir: Path) -> Path: def test_dataset_metadata(self, mdio_with_headers: Path) -> None: """Test dataset metadata for empty MDIO file.""" ds = open_mdio(mdio_with_headers) - self.validate_teapod_dataset_metadata(ds, is_velocity=True) + self.validate_teapot_dataset_metadata(ds, is_velocity=True) def test_variables(self, mdio_with_headers: Path, mdio_no_headers: Path) -> None: """Test grid validation for empty MDIO file.""" ds = open_mdio(mdio_with_headers) header_dtype = get_teapot_segy_spec().trace.header.dtype - self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) ds = open_mdio(mdio_no_headers) - self.validate_teapod_dataset_variables(ds, header_dtype=None, is_velocity=True) + self.validate_teapot_dataset_variables(ds, header_dtype=None, is_velocity=True) def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: """Test overwrite parameter behavior in create_empty_mdio.""" @@ -258,9 +248,9 @@ def test_overwrite_behavior(self, empty_mdio_dir: Path) -> None: # Validate that the MDIO file can be loaded correctly using the helper function ds = open_mdio(empty_mdio) - self.validate_teapod_dataset_metadata(ds, is_velocity=True) + 
self.validate_teapot_dataset_metadata(ds, is_velocity=True) header_dtype = get_teapot_segy_spec().trace.header.dtype - self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=True) # Verify the garbage data was overwritten (should not exist) assert not garbage_file.exists(), "Garbage file should have been overwritten" @@ -403,6 +393,6 @@ def test_create_empty_like(self, teapot_mdio_tmp: Path, empty_mdio_dir: Path) -> ) assert ds is not None - self.validate_teapod_dataset_metadata(ds, is_velocity=False) + self.validate_teapot_dataset_metadata(ds, is_velocity=False) header_dtype = get_teapot_segy_spec().trace.header.dtype - self.validate_teapod_dataset_variables(ds, header_dtype=header_dtype, is_velocity=False) + self.validate_teapot_dataset_variables(ds, header_dtype=header_dtype, is_velocity=False) diff --git a/tests/integration/testing_helpers.py b/tests/integration/testing_helpers.py index 18c262887..27a35d985 100644 --- a/tests/integration/testing_helpers.py +++ b/tests/integration/testing_helpers.py @@ -20,6 +20,7 @@ UNITS_NONE = None UNITS_METER = LengthUnitModel(length=LengthUnitEnum.METER) UNITS_SECOND = TimeUnitModel(time=TimeUnitEnum.SECOND) +UNITS_MILLISECOND = TimeUnitModel(time=TimeUnitEnum.MILLISECOND) UNITS_METERS_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.METERS_PER_SECOND) UNITS_FOOT = LengthUnitModel(length=LengthUnitEnum.FOOT) UNITS_FEET_PER_SECOND = SpeedUnitModel(speed=SpeedUnitEnum.FEET_PER_SECOND)