From 8bb2015f89169b9e3ef922c4b1a222f62b1b95f0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 25 May 2026 07:03:19 -0700 Subject: [PATCH 1/2] Make corpus determinism check toolchain-agnostic (#2299) COG and JPEG fixtures fall back to a semantic rasterio-read comparison instead of byte-md5; the rest still md5-compare. The conda-forge workflow drops its --ignore now that the test handles encoder drift on its own. --- .github/workflows/test-geotiff-corpus.yml | 17 +-- .../golden_corpus/test_corpus_determinism.py | 135 ++++++++++++++++-- 2 files changed, 127 insertions(+), 25 deletions(-) diff --git a/.github/workflows/test-geotiff-corpus.yml b/.github/workflows/test-geotiff-corpus.yml index 8451ce3a..91ae2496 100644 --- a/.github/workflows/test-geotiff-corpus.yml +++ b/.github/workflows/test-geotiff-corpus.yml @@ -73,17 +73,12 @@ jobs: # Target is scoped to `golden_corpus/` to match the issue (#2289); # broader geotiff integration tests stay on the `test.yml` job. # - # `test_corpus_determinism.py` is deselected here: it asserts md5 - # equality between committed fixtures and bytes regenerated by - # `generate.py`, which depends on the exact GDAL / libjpeg the - # corpus was produced against. Conda-forge ships different - # versions than the developer machine that originally built the - # fixtures (today: COG-with-overview and JPEG-YCbCr drift), so - # this check is fundamentally toolchain-coupled. Tracking the - # cleanup in a follow-up issue; the oracle and nodata tests -- - # which compare semantic output, not byte exactness -- still run. + # `test_corpus_determinism.py` is included here now that the + # COG / JPEG cells use a semantic comparison instead of byte-md5 + # (issue #2299). The remaining fixtures still md5-compare, which + # is what catches generator-side drift. if: github.event_name == 'pull_request' - run: pytest xrspatial/geotiff/tests/golden_corpus/ -m "not slow" --ignore=xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py + run: pytest xrspatial/geotiff/tests/golden_corpus/ -m "not slow" - name: Run geotiff golden_corpus tests (full) if: github.event_name != 'pull_request' - run: pytest xrspatial/geotiff/tests/golden_corpus/ --ignore=xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py + run: pytest xrspatial/geotiff/tests/golden_corpus/ diff --git a/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py b/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py index abb15a10..f7c2fac2 100644 --- a/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py +++ b/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py @@ -1,26 +1,36 @@ -"""Corpus determinism gate (issue #1930). +"""Corpus determinism gate (issue #1930, refined in #2299). -The golden corpus generator is built to be byte-deterministic: fixed -seeds, sorted iteration, and an explicit ``os.utime`` pass that pins -file mtimes to a constant epoch. This test guards that property in CI -so a regression in the generator (or a manually-edited fixture on -disk) fails the build instead of silently drifting. +The golden corpus generator is built to be reproducible: fixed seeds, +sorted iteration, and an explicit ``os.utime`` pass that pins file +mtimes to a constant epoch. This test guards that property in CI so a +regression in the generator (or a manually-edited fixture on disk) +fails the build instead of silently drifting. What this catches: * a generator-side change that flips RNG ordering, drops the mtime normalisation, or otherwise breaks reproducibility -- the - regenerated bytes diverge from the committed bytes; + regenerated output diverges from the committed fixture; * a fixture-on-disk drift where the manifest still says X but the committed ``.tif`` was edited (or stale) so it no longer matches what the manifest would produce. -The test is fast (regenerating the 30-fixture corpus measured ~0.3s -locally) and carries the ``fast`` tag in spirit: it is parameterised -over every manifest entry but each parameter is a single md5 -compare, so the whole module runs comfortably inside the PR fast -lane. We do not attach the ``slow`` pytest marker so -``pytest -m "not slow"`` keeps it in scope. +How the comparison runs (issue #2299): + +Most fixtures use a byte-level md5 check. That is the strictest signal +available and catches generator-side regressions immediately. + +Two fixture classes are intrinsically coupled to the GDAL / libjpeg +build used to produce them: COG (``cog: true``) and JPEG +(``compression: jpeg``). The COG driver's overview pyramid layout and +libjpeg's encoder output both change across versions even when the +input pixels are identical, so a byte-md5 check against a fixture +committed by one toolchain will fail under another (e.g. a developer +laptop vs. the conda-forge CI lane). For those fixtures the test falls +back to a semantic comparison: open both files with rasterio and +verify dtype, shape, transform, CRS, nodata, and -- for lossless cells +-- pixel arrays. JPEG cells carry ``tolerance.lossy: true`` in the +manifest and skip the pixel comparison. Fixtures the manifest declares but that are not committed on disk (today: ``example_tiled_uint16_deflate_pred2``, kept as a @@ -32,6 +42,7 @@ import hashlib import pathlib +import numpy as np import pytest # rasterio / pyyaml are runtime deps of the generator. importorskip @@ -40,6 +51,8 @@ pytest.importorskip("yaml") pytest.importorskip("rasterio") +import rasterio # noqa: E402 + from xrspatial.geotiff.tests.golden_corpus import generate # noqa: E402 FIXTURES_DIR = ( @@ -58,6 +71,89 @@ def _md5(path: pathlib.Path) -> str: return h.hexdigest() +def _is_encoder_sensitive(entry: dict) -> bool: + """Fixtures whose on-disk bytes depend on the encoder build. + + The COG driver and libjpeg both produce output that varies across + versions even when the input pixels are identical. Byte-md5 across + different builds is therefore unstable for those cells, while a + semantic read still round-trips correctly. + """ + return bool(entry.get("cog")) or entry.get("compression") == "jpeg" + + +def _nodata_equal(a, b) -> bool: + """NaN-aware nodata comparison. + + Both None counts as equal. NaN sentinels compare equal even though + ``float('nan') != float('nan')`` in plain Python. + """ + if a is None and b is None: + return True + if a is None or b is None: + return False + try: + af = float(a) + bf = float(b) + except (TypeError, ValueError): + return a == b + if np.isnan(af) and np.isnan(bf): + return True + return af == bf + + +def _assert_semantic_equal( + committed: pathlib.Path, + regenerated: pathlib.Path, + entry: dict, +) -> None: + """Compare two TIFFs by their rasterio read, not by file bytes. + + Used for ``cog`` and ``jpeg`` fixtures where the on-disk encoding + is toolchain-coupled but the readable content is stable. + Lossy cells (``tolerance.lossy: true`` in the manifest, today the + JPEG-YCbCr entry) skip pixel equality and check only shape, dtype, + georeferencing, and nodata. + """ + lossy = bool(entry.get("tolerance", {}).get("lossy", False)) + with rasterio.open(committed) as ref, rasterio.open(regenerated) as cand: + assert ref.count == cand.count, ( + f"band count differs: committed={ref.count}, " + f"regenerated={cand.count}" + ) + assert ref.dtypes == cand.dtypes, ( + f"dtypes differ: committed={ref.dtypes}, " + f"regenerated={cand.dtypes}" + ) + assert (ref.width, ref.height) == (cand.width, cand.height), ( + f"shape differs: committed={(ref.width, ref.height)}, " + f"regenerated={(cand.width, cand.height)}" + ) + assert tuple(ref.transform) == tuple(cand.transform), ( + f"transform differs: committed={tuple(ref.transform)}, " + f"regenerated={tuple(cand.transform)}" + ) + assert ref.crs == cand.crs, ( + f"CRS differs: committed={ref.crs!r}, regenerated={cand.crs!r}" + ) + assert _nodata_equal(ref.nodata, cand.nodata), ( + f"nodata differs: committed={ref.nodata!r}, " + f"regenerated={cand.nodata!r}" + ) + assert ref.overviews(1) == cand.overviews(1), ( + f"overview decimation factors differ: " + f"committed={ref.overviews(1)}, regenerated={cand.overviews(1)}" + ) + if lossy: + return + ref_pixels = ref.read() + cand_pixels = cand.read() + assert np.array_equal(ref_pixels, cand_pixels, equal_nan=True), ( + f"pixel arrays differ for {entry['id']!r}; the generator output " + f"no longer round-trips to the committed fixture's pixels" + ) + + def _load_entries() -> list[dict]: """Return validated manifest entries (defaults merged), sorted by id.""" return sorted(generate.validate(generate.load_manifest()), key=lambda e: e["id"]) @@ -67,6 +163,7 @@ def _load_entries() -> list[dict]: # share one manifest load. Each call to ``validate()`` re-walks every # entry, so collapsing the two callers cuts validation work in half. _ENTRIES = _load_entries() +_ENTRY_BY_ID = {e["id"]: e for e in _ENTRIES} _MANIFEST_IDS = [e["id"] for e in _ENTRIES] _EXTERNAL_OVR_IDS = [e["id"] for e in _ENTRIES if e.get("external_overview")] @@ -88,7 +185,13 @@ def test_fixture_bytes_are_deterministic( fixture_id: str, regenerated_dir: pathlib.Path ) -> None: """The committed ``.tif`` for each manifest id matches what the - generator would produce today, byte for byte. + generator would produce today. + + For most fixtures the check is byte-md5: the strictest guard + against generator drift. For ``cog`` and ``jpeg`` fixtures the + GDAL / libjpeg encoder output is toolchain-coupled, so the check + falls back to a semantic comparison via rasterio (see + ``_assert_semantic_equal``). Issue #2299 has the rationale. Skip rather than fail when the committed file is missing -- that means the fixture is declared but intentionally not shipped @@ -106,6 +209,10 @@ def test_fixture_bytes_are_deterministic( f"generator did not produce {fixture_id!r}; check the generator " f"and manifest stayed in sync" ) + entry = _ENTRY_BY_ID[fixture_id] + if _is_encoder_sensitive(entry): + _assert_semantic_equal(committed, regenerated, entry) + return committed_md5 = _md5(committed) regenerated_md5 = _md5(regenerated) assert committed_md5 == regenerated_md5, ( From f5aa21419da4b87c13f3060784ff593bae9fd11f Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 25 May 2026 07:07:31 -0700 Subject: [PATCH 2/2] Address review nits: overview pixels, lossy mean check, drift tests (#2299) - Lossless cells now also compare every overview level's pixel array, closing the gap where the COG fixture's pyramid could drift unnoticed. - Lossy (JPEG) cells gain a coarse per-band mean tolerance so a content regression in the JPEG branch is still detected. - Add direct negative-path tests that doctor a fixture and assert _assert_semantic_equal rejects it. - Rename the regenerated_dir tmp prefix to reflect the test's scope. --- .../golden_corpus/test_corpus_determinism.py | 169 ++++++++++++++++-- 1 file changed, 155 insertions(+), 14 deletions(-) diff --git a/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py b/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py index f7c2fac2..c419bb0d 100644 --- a/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py +++ b/xrspatial/geotiff/tests/golden_corpus/test_corpus_determinism.py @@ -102,6 +102,15 @@ def _nodata_equal(a, b) -> bool: return af == bf +# Lossy cells (today: JPEG-YCbCr) can't compare pixels bit-exactly across +# libjpeg versions, but a per-band mean drift much beyond a few intensity +# units points at a real generator regression (wrong input array, swapped +# band order before encode) rather than codec noise. 4.0 on a 0-255 scale +# leaves several stops of headroom for libjpeg/YCbCr churn while still +# catching the kind of bug the rest of the test is meant to flag. +_LOSSY_PIXEL_MEAN_TOL = 4.0 + + def _assert_semantic_equal( committed: pathlib.Path, regenerated: pathlib.Path, @@ -111,11 +120,14 @@ def _assert_semantic_equal( Used for ``cog`` and ``jpeg`` fixtures where the on-disk encoding is toolchain-coupled but the readable content is stable. - Lossy cells (``tolerance.lossy: true`` in the manifest, today the - JPEG-YCbCr entry) skip pixel equality and check only shape, dtype, - georeferencing, and nodata. + Lossless cells assert bit-exact pixels at the base IFD and at + every overview level the file declares. Lossy cells + (``tolerance.lossy: true`` in the manifest, today the JPEG-YCbCr + entry) drop to a coarse per-band mean tolerance instead of a + bit-exact compare. """ lossy = bool(entry.get("tolerance", {}).get("lossy", False)) + fid = entry["id"] with rasterio.open(committed) as ref, rasterio.open(regenerated) as cand: assert ref.count == cand.count, ( f"band count differs: committed={ref.count}, " @@ -140,19 +152,85 @@ def _assert_semantic_equal( f"nodata differs: committed={ref.nodata!r}, " f"regenerated={cand.nodata!r}" ) - assert ref.overviews(1) == cand.overviews(1), ( + ref_overviews = ref.overviews(1) + assert ref_overviews == cand.overviews(1), ( f"overview decimation factors differ: " - f"committed={ref.overviews(1)}, regenerated={cand.overviews(1)}" - ) - if lossy: - return - ref_pixels = ref.read() - cand_pixels = cand.read() - assert np.array_equal(ref_pixels, cand_pixels, equal_nan=True), ( - f"pixel arrays differ for {entry['id']!r}; the generator output " - f"no longer round-trips to the committed fixture's pixels" + f"committed={ref_overviews}, regenerated={cand.overviews(1)}" ) + if lossy: + _assert_pixels_close_lossy(committed, regenerated, fid) + return + _assert_pixels_exact(committed, regenerated, fid) + # Overview pixels are part of the determinism contract for fixtures + # that ship them (the COG cell today). rasterio's OVERVIEW_LEVEL + # is 0-indexed against the overview chain, hence range(len(...)). + for level in range(len(ref_overviews)): + _assert_overview_pixels_exact(committed, regenerated, level, fid) + + +def _read_all(path: pathlib.Path, *, overview_level: int | None = None) -> np.ndarray: + """Open ``path`` with rasterio and return ``src.read()`` for the + requested IFD. ``overview_level=None`` reads the base IFD. + """ + if overview_level is None: + with rasterio.open(path) as src: + return src.read() + with rasterio.open(path, OVERVIEW_LEVEL=overview_level) as src: + return src.read() + + +def _assert_pixels_exact( + committed: pathlib.Path, regenerated: pathlib.Path, fid: str, +) -> None: + ref_pixels = _read_all(committed) + cand_pixels = _read_all(regenerated) + assert np.array_equal(ref_pixels, cand_pixels, equal_nan=True), ( + f"pixel arrays differ for {fid!r}; the generator output " + f"no longer round-trips to the committed fixture's pixels" + ) + + +def _assert_overview_pixels_exact( + committed: pathlib.Path, + regenerated: pathlib.Path, + overview_level: int, + fid: str, +) -> None: + ref_pixels = _read_all(committed, overview_level=overview_level) + cand_pixels = _read_all(regenerated, overview_level=overview_level) + assert np.array_equal(ref_pixels, cand_pixels, equal_nan=True), ( + f"overview level {overview_level} pixels differ for {fid!r}; " + f"the generator's overview pyramid no longer matches the " + f"committed fixture" + ) + + +def _assert_pixels_close_lossy( + committed: pathlib.Path, regenerated: pathlib.Path, fid: str, +) -> None: + """Coarse per-band mean comparison for lossy (JPEG) cells. + + Bit-exact comparison would re-introduce the libjpeg coupling this + PR removed, but the per-band mean is stable enough across libjpeg + versions to catch a real content regression (a swapped input + array, a band-permutation bug) while tolerating ordinary codec + drift. + """ + ref_pixels = _read_all(committed).astype(np.float64) + cand_pixels = _read_all(regenerated).astype(np.float64) + # rasterio always returns (bands, H, W), so axis=(1, 2) collapses + # to one mean per band. + ref_means = ref_pixels.mean(axis=(1, 2)) + cand_means = cand_pixels.mean(axis=(1, 2)) + diff = np.abs(ref_means - cand_means) + assert np.all(diff <= _LOSSY_PIXEL_MEAN_TOL), ( + f"per-band mean drift exceeds {_LOSSY_PIXEL_MEAN_TOL} for {fid!r}: " + f"committed_means={ref_means.tolist()}, " + f"regenerated_means={cand_means.tolist()}, " + f"abs_diff={diff.tolist()}" + ) + def _load_entries() -> list[dict]: """Return validated manifest entries (defaults merged), sorted by id.""" @@ -175,7 +253,7 @@ def regenerated_dir(tmp_path_factory: pytest.TempPathFactory) -> pathlib.Path: Module-scoped so the (few-second) write cost is paid once per test session rather than per parametrised case. """ - out = tmp_path_factory.mktemp("regen_corpus_1930") + out = tmp_path_factory.mktemp("regen_corpus_determinism") generate.generate(output_dir=out) return out @@ -257,6 +335,69 @@ def test_external_overview_sidecar_is_deterministic( ) +def _write_doctored_copy( + src: pathlib.Path, dst: pathlib.Path, *, delta: int = 1 +) -> None: + """Copy ``src`` to ``dst`` and flip one pixel by ``delta``. + + Used by the negative-path tests below: the resulting file has the + same georeferencing and overview chain as the source but differs + in pixel content, so a working semantic check must reject it. + """ + with rasterio.open(src) as r: + profile = r.profile + data = r.read() + overview_factors = r.overviews(1) + data = data.copy() + data[0, 0, 0] = (int(data[0, 0, 0]) + delta) & np.iinfo(data.dtype).max + with rasterio.open(dst, "w", **profile) as w: + w.write(data) + if overview_factors: + # Match the source's overview chain so the decimation check + # passes and the comparison falls through to pixel reads. + w.build_overviews(overview_factors) + + +def test_semantic_equal_rejects_lossless_pixel_drift(tmp_path) -> None: + """A doctored lossless fixture with one flipped pixel must fail + ``_assert_semantic_equal``. Locks the drift-detection path that the + PR refactor depends on. + """ + src = FIXTURES_DIR / "cog_internal_overview_uint16.tif" + if not src.exists(): + pytest.skip("cog fixture not committed; cannot exercise drift path") + doctored = tmp_path / "cog_doctored_2299.tif" + _write_doctored_copy(src, doctored) + entry = _ENTRY_BY_ID["cog_internal_overview_uint16"] + with pytest.raises(AssertionError, match=r"pixels? .* differ"): + _assert_semantic_equal(src, doctored, entry) + + +def test_semantic_equal_rejects_lossy_mean_drift(tmp_path) -> None: + """A doctored lossy fixture with a large constant offset must fail + the per-band mean check. Catches the case where the JPEG path + would otherwise silently accept anything since pixel equality is + skipped. + """ + src = FIXTURES_DIR / "compression_jpeg_uint8_ycbcr.tif" + if not src.exists(): + pytest.skip("jpeg fixture not committed; cannot exercise drift path") + # Read the source, add a constant offset well past the mean + # tolerance, then re-encode through the same profile so the + # resulting file is still a valid JPEG-YCbCr TIFF. + with rasterio.open(src) as r: + profile = r.profile + data = r.read() + doctored = tmp_path / "jpeg_doctored_2299.tif" + offset = int(_LOSSY_PIXEL_MEAN_TOL * 4) + 1 + shifted = np.clip(data.astype(np.int32) + offset, 0, 255).astype(data.dtype) + with rasterio.open(doctored, "w", **profile) as w: + w.write(shifted) + entry = _ENTRY_BY_ID["compression_jpeg_uint8_ycbcr"] + with pytest.raises(AssertionError, match="per-band mean drift"): + _assert_semantic_equal(src, doctored, entry) + + def test_no_orphan_fixtures_on_disk() -> None: """Every committed ``.tif`` (and ``.tif.ovr`` sidecar) corresponds to a manifest entry. Catches stale fixtures left behind after a