From 56a464e8281623dc2f76b1935f204fe6c73460a5 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 14 Mar 2026 23:37:53 +0000 Subject: [PATCH 01/12] Test with DECODE_TO_STRINGS off: WIP problem with cube contents. --- lib/iris/fileformats/cf.py | 2 + .../integration/netcdf/test_stringdata.py | 72 +++++++++++++++---- .../netcdf/test_bytecoding_datasets.py | 5 +- 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 247914bf61..9c093feea5 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -1344,6 +1344,8 @@ def __init__(self, file_source, warn=False, monotonic=False): self._with_ugrid = False # Read the variables in the dataset only once to reduce runtime. + # Turn off *any* automatic decoding in the underlying netCDF4 dataset + self._dataset._contained_instance.set_auto_chartostring(False) variables = self._dataset.variables self._translate(variables) self._build_cf_groups(variables) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 585388c5e4..800ca3c502 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -19,7 +19,11 @@ import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube -from iris.fileformats.netcdf import SUPPORTED_ENCODINGS, _thread_safe_nc +from iris.fileformats.netcdf import ( + DECODE_TO_STRINGS_ON_READ, + SUPPORTED_ENCODINGS, + _thread_safe_nc, +) @pytest.fixture(scope="module") @@ -87,7 +91,9 @@ class SamplefileDetails: filepath: Path datavar_data: ArrayLike + datavar_bytes: ArrayLike stringcoord_data: ArrayLike + stringcoord_bytes: ArrayLike numericcoord_data: ArrayLike @@ -171,7 +177,9 @@ def make_testfile( return SamplefileDetails( filepath=testfile_path, datavar_data=datavar_strings, + datavar_bytes=datavar_bytearray, stringcoord_data=coordvar_strings, + stringcoord_bytes=coordvar_bytearray, numericcoord_data=numeric_values, ) @@ -237,29 +245,63 @@ def readtest_data( # ncdump(str(tempfile_path)) return testdata - def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails): - testfile_path, datavar_strings, coordvar_strings, numeric_data = ( + @pytest.fixture(params=["strings", "bytes"]) + def readmode(self, request): + return request.param + + def test_valid_encodings( + self, encoding, readtest_data: SamplefileDetails, readmode + ): + ( + testfile_path, + datavar_strings, + datavar_bytes, + coordvar_strings, + coordvar_bytes, + numeric_data, + ) = ( readtest_data.filepath, readtest_data.datavar_data, + readtest_data.datavar_bytes, readtest_data.stringcoord_data, + readtest_data.stringcoord_bytes, readtest_data.numericcoord_data, ) - cube = iris.load_cube(testfile_path) - assert load_problems_list() == [] - assert cube.shape == (N_XDIM,) + as_strings = readmode == "strings" + if as_strings: + # Regular load + cube = iris.load_cube(testfile_path) + expected_shape: tuple = (N_XDIM,) + else: + # Special NON-decoded read + with DECODE_TO_STRINGS_ON_READ.context(False): + cube = iris.load_cube(testfile_path) + expected_shape = (N_XDIM, N_CHARS_DIM) - if encoding == "utf-32": - expected_string_width = (N_CHARS_DIM // 4) - 1 - elif encoding == "utf-16": - expected_string_width = (N_CHARS_DIM) // 2 - 1 + assert load_problems_list() == [] + assert cube.shape == expected_shape + + if as_strings: + if encoding == "utf-32": + expected_string_width = (N_CHARS_DIM // 4) - 1 + elif encoding == "utf-16": + expected_string_width = (N_CHARS_DIM) // 2 - 1 + else: + expected_string_width = N_CHARS_DIM + expected_dtype = f" Date: Tue, 26 May 2026 17:32:46 +0100 Subject: [PATCH 02/12] Resolve load problems with non-string read loading by skipping problem testcases. --- lib/iris/tests/integration/netcdf/test_stringdata.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 800ca3c502..dd89e8c239 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -250,7 +250,7 @@ def readmode(self, request): return request.param def test_valid_encodings( - self, encoding, readtest_data: SamplefileDetails, readmode + self, encoding, readtest_data: SamplefileDetails, readmode, use_separate_dims ): ( testfile_path, @@ -267,6 +267,14 @@ def test_valid_encodings( readtest_data.stringcoord_bytes, readtest_data.numericcoord_data, ) + + if readmode == "bytes" and use_separate_dims == True: + msg = ( + "Unsupported load combination : character coordinates with a non-cube " + "string dimension can't attach to the cube, when read as bytes." + ) + pytest.skip(msg) + as_strings = readmode == "strings" if as_strings: # Regular load From a7e928a327846c5229cc9b4c001b2256b6c6834f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 27 May 2026 10:42:22 +0100 Subject: [PATCH 03/12] Make cf chartostring disable work for non-owned regular datasets. --- lib/iris/fileformats/cf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 9c093feea5..01440450f9 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -1345,7 +1345,11 @@ def __init__(self, file_source, warn=False, monotonic=False): # Read the variables in the dataset only once to reduce runtime. # Turn off *any* automatic decoding in the underlying netCDF4 dataset - self._dataset._contained_instance.set_auto_chartostring(False) + ds = self._dataset + if isinstance(ds, _thread_safe_nc.DatasetWrapper): + ds._contained_instance.set_auto_chartostring(False) + else: + ds.set_auto_chartostring(False) variables = self._dataset.variables self._translate(variables) self._build_cf_groups(variables) From e6bdb53442e591a1d0d59c7ebbbb277e2fd13cc0 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 27 May 2026 17:35:31 +0100 Subject: [PATCH 04/12] Checks for alternate encoding name. --- .../integration/netcdf/test_stringdata.py | 17 ++++--- .../netcdf/test_bytecoding_datasets.py | 45 ++++++++++++------- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index dd89e8c239..eae8006834 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -42,7 +42,8 @@ def all_lazy_auxcoords(): PERSIST_TESTFILES: str | None = None NO_ENCODING_STR = "" -TEST_ENCODINGS = [NO_ENCODING_STR] + SUPPORTED_ENCODINGS +ALIAS_UTF8_STR = "UTF8" # an alternative acceptable form (should be written as-is) +TEST_ENCODINGS = [NO_ENCODING_STR, ALIAS_UTF8_STR] + SUPPORTED_ENCODINGS # @@ -223,7 +224,7 @@ def readtest_path( else: filetag = encoding dimtag = "diffdims" if use_separate_dims else "samedims" - tempfile_path = tmp_path / f"sample_read_{filetag}_{dimtag}.nc" + tempfile_path = tmp_path / f"sample_stringdata_read_{filetag}_{dimtag}.nc" return tempfile_path @pytest.fixture @@ -408,7 +409,7 @@ def write_bytes(self, request): return request.param == "dataAsBytes" @pytest.fixture - def writetest_path(self, encoding, write_bytes, tmp_path): + def writetest_path(self, encoding, write_bytes, lazy_data, tmp_path): """Create a suitable test cube, with either string or byte content.""" if PERSIST_TESTFILES: tmp_path = Path(PERSIST_TESTFILES).expanduser() @@ -417,7 +418,10 @@ def writetest_path(self, encoding, write_bytes, tmp_path): else: filetag = encoding datatag = "writebytes" if write_bytes else "writestrings" - tempfile_path = tmp_path / f"sample_write_{filetag}_{datatag}.nc" + lazytag = "alllazy" if lazy_data else "smallreal" + tempfile_path = ( + tmp_path / f"sample_stringdata_write_{filetag}_{datatag}_{lazytag}.nc" + ) return tempfile_path @pytest.fixture @@ -440,11 +444,6 @@ def writetest_data(self, writetest_path, encoding, write_bytes, lazy_data): def test_valid_encodings(self, encoding, writetest_data, write_bytes): cube_info = writetest_data cube, path = cube_info.cube, cube_info.save_path - # TODO: not testing the "byte read/write" yet - # Make a quick check for cube equality : but the presentation depends on the read mode - # with DECODE_TO_STRINGS_ON_READ.context(not write_bytes): - # read_cube = iris.load_cube(path) - # assert read_cube == cube # N.B. file content should not depend on whether bytes or strings were written vararray, coordarray = cube_info.datavar_data, cube_info.stringcoord_data diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 29b4889e4d..f153033fa9 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -28,7 +28,10 @@ from iris.tests.stock.netcdf import ncgen_from_cdl from iris.warnings import IrisCfLoadWarning, IrisCfSaveWarning -encoding_options = [None] + SUPPORTED_ENCODINGS +# Note: for test options, include "no encoding" and an alias name +ENCODING_NONE = None +ENCODING_UTF8_ALIAS = "UTF8" +encoding_options = [ENCODING_NONE, ENCODING_UTF8_ALIAS] + SUPPORTED_ENCODINGS samples_3_ascii = np.array( ["one", "", "seven"], # N.B. include empty! @@ -45,9 +48,17 @@ def encoding(request): return request.param +# TODO: remove (debug) +# PERSIST_TESTFILES: str | None = "~/chararray_testfiles" +PERSIST_TESTFILES: str | None = None + + @pytest.fixture(scope="module") def tempdir(tmp_path_factory): - path = tmp_path_factory.mktemp("netcdf") + if PERSIST_TESTFILES: + path = Path(PERSIST_TESTFILES).expanduser() + else: + path = tmp_path_factory.mktemp("netcdf") return path @@ -134,7 +145,7 @@ class TestWriteStrings: def test_encodings(self, encoding, tempdir): # Create a dataset with the variable - path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc" + path = tempdir / f"test_bytecoded_writestrings_encoding_{encoding!s}.nc" if encoding in [None, "ascii"]: writedata = samples_3_ascii @@ -164,7 +175,7 @@ def test_encodings(self, encoding, tempdir): def test_scalar(self, tempdir): # Like 'test_write_strings', but the variable has *only* the string dimension. - path = tempdir / "test_writestrings_scalar.nc" + path = tempdir / "test_bytecoded_writestrings_scalar.nc" strlen = 5 ds_encoded = make_encoded_dataset(path, strlen=strlen) @@ -180,7 +191,7 @@ def test_scalar(self, tempdir): def test_multidim(self, tempdir): # Like 'test_write_strings', but the variable has additional dimensions. - path = tempdir / "test_writestrings_multidim.nc" + path = tempdir / "test_bytecoded_writestrings_multidim.nc" strlen = 5 ds_encoded = make_encoded_dataset(path, strlen=strlen) @@ -209,7 +220,7 @@ def test_multidim(self, tempdir): @pytest.mark.parametrize("encoding", [None, "ascii"]) def test_write_encoding_failure(self, tempdir, encoding): - path = tempdir / f"test_writestrings_encoding_{encoding}_fail.nc" + path = tempdir / f"test_bytecoded_writestrings_encoding_{encoding}_fail.nc" ds = make_encoded_dataset(path, strlen=5, encoding=encoding) v = ds.variables["vxs"] encoding_name = encoding @@ -223,7 +234,7 @@ def test_write_encoding_failure(self, tempdir, encoding): v[:] = samples_3_nonascii def test_write_badencoding_ignore(self, tempdir): - path = tempdir / "test_writestrings_badencoding_ignore.nc" + path = tempdir / "test_bytecoded_writestrings_badencoding_ignore.nc" ds = make_encoded_dataset(path, strlen=5, encoding="unknown") v = ds.variables["vxs"] msg = ( @@ -235,7 +246,7 @@ def test_write_badencoding_ignore(self, tempdir): def test_overlength(self, tempdir): # Check expected behaviour with over-length data - path = tempdir / "test_writestrings_overlength.nc" + path = tempdir / "test_bytecoded_writestrings_overlength.nc" strlen = 6 ds = make_encoded_dataset(path, strlen=strlen, encoding="utf8") v = ds.variables["vxs"] @@ -248,7 +259,7 @@ def test_overlength(self, tempdir): def test_overlength_splitcoding(self, tempdir): # Check expected behaviour when non-ascii multibyte coding gets truncated - path = tempdir / "test_writestrings_overlength_splitcoding.nc" + path = tempdir / "test_bytecoded_writestrings_overlength_splitcoding.nc" strlen = 5 ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8") v = ds.variables["vxs"] @@ -291,7 +302,7 @@ def test_write_chars(self, tempdir, write_form): strlen = strings_maxbytes(write_strings, encoding) write_bytes = make_bytearray(write_strings, strlen, encoding=encoding) # NOTE: 'flexi' form util decides the width needs to be 7 !! - path = tempdir / f"test_writechars_{write_form}.nc" + path = tempdir / f"test_bytecoded_writechars_{write_form}.nc" ds = make_encoded_dataset(path, encoding=encoding, strlen=strlen) v = ds.variables["vxs"] @@ -331,7 +342,7 @@ def undecoded_testvar(self, ds_encoded, varname: str): def test_encodings(self, encoding, tempdir, readmode): # Create a dataset with the variable - path = tempdir / f"test_read_encodings_{encoding!s}_{readmode}.nc" + path = tempdir / f"test_bytecoded_read_encodings_{encoding!s}_{readmode}.nc" if encoding in [None, "ascii"]: write_strings = samples_3_ascii @@ -352,10 +363,10 @@ def test_encodings(self, encoding, tempdir, readmode): # Test "normal" read --> string array result = v[:] expected = write_strings - if encoding in ("utf-8", "utf-16"): + if encoding in ("utf-8", ENCODING_UTF8_ALIAS, "utf-16"): # In these cases, with the given non-ascii sample data, the # "default minimum string length" is overestimated. - if encoding == "utf-8": + if encoding in ["utf-8", ENCODING_UTF8_ALIAS]: assert strlen == 7 assert result.dtype == "U7" # correct the result dtype to pass the write_strings comparison below @@ -378,7 +389,7 @@ def test_encodings(self, encoding, tempdir, readmode): def test_scalar(self, tempdir, readmode): # Like 'test_write_strings', but the variable has *only* the string dimension. - path = tempdir / f"test_read_scalar_{readmode}.nc" + path = tempdir / f"test_bytecoded_read_scalar_{readmode}.nc" strlen = 5 ds_encoded = make_encoded_dataset(path, strlen=strlen) @@ -404,7 +415,7 @@ def test_scalar(self, tempdir, readmode): def test_multidim(self, tempdir, readmode): # Like 'test_write_strings', but the variable has additional dimensions. - path = tempdir / f"test_read_multidim_{readmode}.nc" + path = tempdir / f"test_bytecoded_read_multidim_{readmode}.nc" strlen = 5 ds_encoded = make_encoded_dataset(path, strlen=strlen) @@ -440,7 +451,7 @@ def test_multidim(self, tempdir, readmode): check_array_matching(result, expected) def test_read_encoding_failure(self, tempdir, readmode): - path = tempdir / f"test_read_encoding_failure_{readmode}.nc" + path = tempdir / f"test_bytecoded_read_encoding_failure_{readmode}.nc" strlen = 10 ds_encoded = make_encoded_dataset(path, strlen=strlen, encoding="ascii") v = ds_encoded.variables["vxs"] @@ -463,7 +474,7 @@ def test_read_encoding_failure(self, tempdir, readmode): assert np.all(result == test_utf8_bytes) def test_read_badencoding_ignore(self, tempdir): - path = tempdir / f"test_read_badencoding_ignore.nc" + path = tempdir / f"test_bytecoded_read_badencoding_ignore.nc" strlen = 10 ds = make_encoded_dataset(path, strlen=strlen, encoding="unknown") v = ds.variables["vxs"] From fa9c66df7606e9fe4749d0e9e580762dfffbcfe6 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 28 May 2026 14:42:17 +0100 Subject: [PATCH 05/12] Fix bug where nc save was removing _Encoding attributes from saved cubes. --- lib/iris/fileformats/netcdf/saver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 08f577c499..bde1cbd1fe 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1719,20 +1719,23 @@ def add_names_attrs(): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) + # Take a copy so we can remove things + element_attrs = element.attributes.copy() + # Note: when writing UGRID, "element" can be a Mesh which has no "dtype", # and for dataless cubes it will have a 'None' dtype. if getattr(element, "dtype", None) is not None: # Most attributes are dealt with later. But _Encoding needs to be defined # *before* we can write to a character variable. - if element.dtype.kind in "SU" and "_Encoding" in element.attributes: - encoding = element.attributes.pop("_Encoding") + if element.dtype.kind in "SU" and "_Encoding" in element_attrs: + encoding = element_attrs.pop("_Encoding") _setncattr(cf_var, "_Encoding", encoding) if not isinstance(element, Cube): # Add any other custom coordinate attributes. # N.B. not Cube, which has specific handling in _create_cf_data_variable - for name in sorted(element.attributes): - value = element.attributes[name] + for name in sorted(element_attrs): + value = element_attrs[name] if name == "STASH": # Adopting provisional Metadata Conventions for representing MO From 3def8f1056172a8b3a72020446b7b5bd4aa8fcdb Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 28 May 2026 14:48:58 +0100 Subject: [PATCH 06/12] Fix bug: stop cube equality failing with char/string data. --- lib/iris/cube.py | 21 +++++++++++++-------- lib/iris/tests/unit/cube/test_Cube.py | 10 ++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/lib/iris/cube.py b/lib/iris/cube.py index 44be3a63d7..e8f1107773 100644 --- a/lib/iris/cube.py +++ b/lib/iris/cube.py @@ -4468,15 +4468,20 @@ def __eq__(self, other): # Having checked everything else, check approximate data equality. if result and not dataless_equality: - # TODO: why do we use allclose() here, but strict equality in - # _DimensionalMetadata (via util.array_equal())? - result = bool( - np.allclose( - self.core_data(), - other.core_data(), - equal_nan=True, + if self.dtype.kind in "if": + # numbers + # TODO: why do we use allclose() here, but strict equality in + # _DimensionalMetadata (via util.array_equal())? + result = bool( + np.allclose( + self.core_data(), + other.core_data(), + equal_nan=True, + ) ) - ) + else: + # non-numeric: use exact equality + result = bool(np.all(self.core_data() == other.core_data())) return result # Must supply __ne__, Python does not defer to __eq__ for negative equality diff --git a/lib/iris/tests/unit/cube/test_Cube.py b/lib/iris/tests/unit/cube/test_Cube.py index d91c7e81c0..756809e128 100644 --- a/lib/iris/tests/unit/cube/test_Cube.py +++ b/lib/iris/tests/unit/cube/test_Cube.py @@ -3620,6 +3620,16 @@ def test_data_bool_not_eq(self): cube2 = Cube([True, True]) assert cube1 != cube2 + def test_data_string_eq(self): + cube1 = Cube(["a", "b", "c"]) + cube2 = Cube(["a", "b", "c"]) + assert cube1 == cube2 + + def test_data_string_not_eq(self): + cube1 = Cube(["a", "b", "c"]) + cube2 = Cube(["a", "b", "d"]) + assert cube1 != cube2 + class Test__eq__meta: def test_ancillary_fail(self): From 713f6316b4d49e65940dffa4a0a8b984c161ab58 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 28 May 2026 15:32:36 +0100 Subject: [PATCH 07/12] Remove ban on string data for cubes: Extracting a scalar string cube then works. --- lib/iris/cube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/iris/cube.py b/lib/iris/cube.py index e8f1107773..038950bc7a 100644 --- a/lib/iris/cube.py +++ b/lib/iris/cube.py @@ -1280,9 +1280,9 @@ def __init__( ... (longitude, 1)]) """ - # Temporary error while we transition the API. - if isinstance(data, str): - raise TypeError("Invalid data type: {!r}.".format(data)) + # # Temporary error while we transition the API. + # if isinstance(data, str): + # raise TypeError("Invalid data type: {!r}.".format(data)) # Configure the metadata manager. self._metadata_manager = metadata_manager_factory(CubeMetadata) From 4e956e02df63b958a60bd52eb2742d79a382a944 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 28 May 2026 16:22:35 +0100 Subject: [PATCH 08/12] Tests for saving scalar strings, and a mix of encodings and lengths. --- .../integration/netcdf/test_stringdata.py | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index eae8006834..0d3ac5bda8 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -19,6 +19,7 @@ import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube +import iris.exceptions from iris.fileformats.netcdf import ( DECODE_TO_STRINGS_ON_READ, SUPPORTED_ENCODINGS, @@ -46,6 +47,13 @@ def all_lazy_auxcoords(): TEST_ENCODINGS = [NO_ENCODING_STR, ALIAS_UTF8_STR] + SUPPORTED_ENCODINGS +# Common fixture to save with split-attrs ONLY in these tests +@pytest.fixture(scope="module", autouse=True) +def all_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + # # Routines to convert between byte and string arrays. # Independently defined here, to avoid relying on any code we are testing. @@ -476,3 +484,119 @@ def test_valid_encodings(self, encoding, writetest_data, write_bytes): ) assert np.all(data_main == vararray) assert np.all(data_co == coordarray) + + +class TestStringCubeBehaviour: + def test_create(self): + cube = Cube(["this", "that", "cliché"]) + assert isinstance(cube.core_data(), np.ndarray) + assert cube.shape == (3,) + assert cube.dtype == np.dtype("U6") + + def test_scalar_extract(self): + cube = Cube(["one", "two", "thirteen"]) + cube = cube[0] + assert isinstance(cube.core_data(), np.ndarray) + assert cube.shape == () + assert cube.dtype == np.dtype("U3") + + def test_scalar_create(self): + cube = Cube("éclair") + assert isinstance(cube.core_data(), np.ndarray) + assert cube.shape == () + assert cube.dtype == np.dtype("U6") + + +class TestWriteReadMixedEncodings: + """Check saving of different types of string data, in cubes. + + Checks that encodings are preserved through save/load. + Checks that scalar cubes save. + Checks that multiple cubes with different encodings save correctly. + """ + + def test_mixed(self, tmp_path): + # Save a mixture of string + numeric cubes, 1-D and scalar + # Ensure that they save, and read back correctly. + c1 = Cube(["test-string"], var_name="c1") + c2 = Cube(["test=éclair"], var_name="c2", attributes={"_Encoding": "utf16"}) + c3 = Cube(4.5, var_name="c3") + c4 = Cube(np.array("q"), var_name="c4") # a SCALAR character-type cube + cubes = [c1, c2, c3, c4] + originals = [c.copy() for c in cubes] + + # Check they save OK + filepath = tmp_path / "tst.nc" + iris.save(cubes, filepath) + + # Check they also read back the same (except for Conventions attribute) + results = iris.load_cubes(filepath, ["c1", "c2", "c3", "c4"]) + for cube in results: + cube.attributes.pop("Conventions", None) + assert all(orig == result for orig, result in zip(originals, results)) + + +class TestWriteReadScalarStringCubes: + """Check how scalar string-typed cubes are saved. + NB all these gain a string dimension, even when only a single byte character, + so they are not actually "scalar" in the file. + """ + + def test_save_scalar_ascii__ok(self, tmp_path): + # We can save a scalar cube containing a *single ascii character* + scalar_char_cube = Cube( + np.array("x"), + var_name="c1", + attributes={"_Encoding": "utf8"}, # NB no encoding is *needed* here. + ) + assert scalar_char_cube.shape == () + filepath = tmp_path / "tst.nc" + iris.save(scalar_char_cube, filepath) + + # Check dims in file + ds = _thread_safe_nc.DatasetWrapper(filepath) + assert ds.variables["c1"].dimensions == ("string1",) + assert ds.dimensions["string1"].size == 1 + ds.close() + + # check read-back result + result = iris.load_cube(filepath) + result.attributes.pop("Conventions", None) + assert result == scalar_char_cube + + def test_save_scalar_unicode__fail(self, tmp_path): + # You *can't* save a scalar cube containing a non-ascii character + # *without an explicitly lengthened dtype*, + # because it doesn't convert to a single "char". + scalar_char_bad = Cube( + np.array("ü"), var_name="c1", attributes={"_Encoding": "utf8"} + ) + assert scalar_char_bad.shape == () + filepath = tmp_path / "tst.nc" + msg = ( + "String 'ü' written .* is 2 bytes long, " + "which exceeds the string dimension length" + ) + with pytest.raises(iris.exceptions.TranslationError, match=msg): + iris.save(scalar_char_bad, filepath) + + def test_save_single_unicode__okay(self, tmp_path): + # You *can* save a scalar cube containing a non-ascii character, + # *if* the dtype is extended to allow for multiple encoded bytes. + scalar_char_cube = Cube( + np.array("ü", dtype="U2"), var_name="c1", attributes={"_Encoding": "utf8"} + ) + assert scalar_char_cube.shape == () + filepath = tmp_path / "tst.nc" + iris.save(scalar_char_cube, filepath) + + # Check dims in file + ds = _thread_safe_nc.DatasetWrapper(filepath) + assert ds.variables["c1"].dimensions == ("string2",) + assert ds.dimensions["string2"].size == 2 + ds.close() + + # check read-back result + result = iris.load_cube(filepath) + result.attributes.pop("Conventions", None) + assert result == scalar_char_cube From 2c276a8b78bf1c0b1042c728ad40b334c982e431 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 29 May 2026 12:05:40 +0100 Subject: [PATCH 09/12] Tests for some specific save/load usecases. --- lib/iris/fileformats/netcdf/saver.py | 77 +++++++------ .../integration/netcdf/test_stringdata.py | 102 +++++++++++++++++- 2 files changed, 141 insertions(+), 38 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bde1cbd1fe..49fbbb1221 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1833,8 +1833,8 @@ def _create_generic_cf_array_var( if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if not is_dataless and np.issubdtype(data.dtype, np.str_): - # Deal with string-type variables. + if not is_dataless and data.dtype.kind == "U": + # Deal with unicode-string-type variables. # Typically CF label variables, but also possibly ancil-vars ? # NOTE: all we are doing here is to calculate the byte dimension length, @@ -1843,37 +1843,26 @@ def _create_generic_cf_array_var( # being a _bytecoding_datasets.EncodedVariable. string_dimension_depth = data.dtype.itemsize - if data.dtype.kind == "U": - # String content (U) instead of bytes (S). - # For numpy strings, itemsize is **always** a multiple of 4 - if string_dimension_depth % 4 != 0: - msg = ( - "Unexpected numpy string 'itemsize' for element " - f"{cube_or_mesh.name()}: " - f"'dtype.itemsize = {string_dimension_depth}, expected " - "a multiple of four (always)." - ) - raise ValueError(msg) - nchars = string_dimension_depth // 4 - - encoding_attr = element.attributes.get("_Encoding", "ascii") - # Look this up + return a supported encoding name - # NB implements defaults and raises a warning if given not recognised. - encoding = bytecoding_datasets._identify_encoding( - encoding=encoding_attr, var_name=cf_name, writing=True + # String content (U) instead of bytes (S). + # For numpy strings, itemsize is **always** a multiple of 4 + if string_dimension_depth % 4 != 0: + msg = ( + "Unexpected numpy string 'itemsize' for element " + f"{cube_or_mesh.name()}: " + f"'dtype.itemsize = {string_dimension_depth}, expected " + "a multiple of four (always)." ) - width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding] - string_dimension_depth = width_fns.nchars_2_nbytes(nchars) - else: - if data.dtype.kind != "S" or data.dtype.itemsize != 1: - # Some type of data we don't "understand". - # NB this includes "Sxx" types other than "S1" : It seems that - # netCDF4 saves Sxx as variable-length strings. But we don't support that type in Iris. - msg = ( - f"Variable {cf_name!r} has unexpected string/character dtype, " - f"{data.dtype} -- should be either 'S' or 'U' type." - ) - raise ValueError(msg) + raise ValueError(msg) + nchars = string_dimension_depth // 4 + + encoding_attr = element.attributes.get("_Encoding", "ascii") + # Look this up + return a supported encoding name + # NB implements defaults and raises a warning if given not recognised. + encoding = bytecoding_datasets._identify_encoding( + encoding=encoding_attr, var_name=cf_name, writing=True + ) + width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding] + string_dimension_depth = width_fns.nchars_2_nbytes(nchars) string_dimension_name = "string%d" % string_dimension_depth @@ -1893,12 +1882,34 @@ def _create_generic_cf_array_var( # Create the label coordinate variable. cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims) else: - # A normal (numeric) variable. + # A non-string variable. # ensure a valid datatype for the file format. if is_dataless: dtype = self._DATALESS_DTYPE fill_value = self._DATALESS_FILLVALUE else: + # Normal non-string data. + # NOTE: this includes byte-arrays (S1 only) : however these must + # use an actual cube dimension for the 'string dimension', which + # seriously limits the utility of DECODE_TO_STRINGS_ON_READ. + # TODO: also support netCDF variable-length strings ("string" type). + # Currently hit a **write error here**, being numpy object dtype ("O"). + if ( + data.dtype.kind not in "iufSU" + or data.dtype.kind == "S" + and data.dtype.itemsize != 1 + ): + # This is a type of data we don't "understand". + # NB this includes "Sxx" types other than "S1" : It seems that + # netCDF4 saves Sxx as variable-length strings. + # But we don't support that type in Iris. + msg = ( + f"Variable {cf_name!r} has unexpected dtype, {data.dtype!r}." + f"Data content arrays must be numeric, or contain " + "single-bytes (dtype 'S1'), or unicode strings (dtype 'U')." + ) + raise ValueError(msg) + element_type = type(element).__name__ data = self._ensure_valid_dtype(data, element_type, element) if not packing_controls: diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 0d3ac5bda8..71a02fc093 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -109,7 +109,10 @@ class SamplefileDetails: def make_testfile( testfile_path: Path, encoding_str: str, - coords_on_separate_dim: bool, + coords_on_separate_dim: bool = False, + # If set, determines the "_Encoding" attrs content, including None --> no attr. + # Otherwise, they follow 'encoding_str', including NO_ENCODING_STR --> no attr. + encoding_attr: str | None = "", ) -> SamplefileDetails: """Create a test netcdf file. @@ -120,6 +123,9 @@ def make_testfile( else: encoding = encoding_str + if encoding_attr == "": + encoding_attr = encoding + data_is_ascii = encoding in (None, "ascii") numeric_values = np.arange(3.0) @@ -156,8 +162,8 @@ def make_testfile( ) v_co[:] = coordvar_bytearray - if encoding is not None: - v_co._Encoding = encoding + if encoding_attr is not None: + v_co._Encoding = encoding_attr v_numeric = ds.createVariable( "v_numeric", @@ -176,8 +182,8 @@ def make_testfile( ) v_datavar[:] = datavar_bytearray - if encoding is not None: - v_datavar._Encoding = encoding + if encoding_attr is not None: + v_datavar._Encoding = encoding_attr v_datavar.coordinates = "v_co v_numeric" finally: @@ -600,3 +606,89 @@ def test_save_single_unicode__okay(self, tmp_path): result = iris.load_cube(filepath) result.attributes.pop("Conventions", None) assert result == scalar_char_cube + + +class TestReadParticularCases: + @pytest.mark.parametrize("data_encoding", ["utf8", "utf16", "utf32"]) + def test_read_no_encoding(self, tmp_path, data_encoding): + # Check that we can read UTF-8 encoded data, even with no _Encoding attribute. + # This is a common case in the wild, and now accepted by CF as a default. + # However, other encodings will FAIL to decode. + filepath = tmp_path / "utf8_no_encoding.nc" + testdata = make_testfile( + testfile_path=filepath, + encoding_str=data_encoding, + encoding_attr=None, + ) + cube = iris.load_cube(filepath) + assert "_Encoding" not in cube.attributes + + if data_encoding == "utf8": + assert np.all(cube.data == testdata.datavar_data) + else: + msg = "Character data .* could not be decoded with the 'utf-8' encoding" + with pytest.raises(ValueError, match=msg): + cube.data + + def test_read_wrong_encoding__fail(self, tmp_path): + filepath = tmp_path / "missing_encoding.nc" + testdata = make_testfile( + testfile_path=filepath, + encoding_str="utf-16", + encoding_attr="utf-8", + ) + cube = iris.load_cube(filepath) + # NOTE: error only occurs when you attempt to fetch + translate the content. + msg = "Character data .* could not be decoded with the 'utf-8' encoding." + with pytest.raises(ValueError, match=msg): + data = cube.data + + +class TestWriteParticularCases: + def test_write_unicode_no_encoding__fail(self, tmp_path): + cube = Cube(np.array("éclair")) + filepath = tmp_path / "write_unicode_no_encoding.nc" + msg = ( + "String data written to netcdf character variable 'unknown' " + "could not be represented in encoding 'ascii'" + ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) + + def test_write_encoded_overlength__fail(self, tmp_path): + cube = Cube(np.array("éclair"), attributes={"_Encoding": "utf8"}) + filepath = tmp_path / "write_encoded_overlength.nc" + msg = ( + "String 'éclair' written into netcdf variable 'unknown' " + "with encoding 'utf-8' is 7 bytes long, which exceeds the " + "string dimension length, 6. " + r"This can be fixed by converting the data to a \"wider\" string dtype, " + r"e.g. cube.data = cube.data.astype\(\"U7\"\)" + ) + with pytest.raises(iris.exceptions.TranslationError, match=msg): + iris.save(cube, filepath) + + def test_write_multibytes__fail(self, tmp_path): + encoded_bytes = "éclair".encode("utf8") + byte_array = np.array(encoded_bytes) + cube = Cube(byte_array, attributes={"_Encoding": "utf8"}) + filepath = tmp_path / "write_multibyte_Sxx.nc" + msg = ( + r"Variable 'unknown' has unexpected dtype, dtype\('S7'\)." + "Data content arrays must be numeric, or contain single-bytes " + r"\(dtype 'S1'\), or unicode strings \(dtype 'U'\)." + ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) + + def test_write_stringobjects__fail(self, tmp_path): + string_array = np.array(["one", "four"], dtype="O") + cube = Cube(string_array) + filepath = tmp_path / "write_stringobjects.nc" + msg = ( + r"Variable 'unknown' has unexpected dtype, dtype\('O'\)." + "Data content arrays must be numeric, or contain single-bytes " + r"\(dtype 'S1'\), or unicode strings \(dtype 'U'\)." + ) + with pytest.raises(ValueError, match=msg): + iris.save(cube, filepath) From 6c4be387e87ae20c96740cd4109ffcdc8f664450 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 29 May 2026 16:05:48 +0100 Subject: [PATCH 10/12] Test save/load of a 'bad' unicode sequence as a byte array. --- .../integration/netcdf/test_stringdata.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 71a02fc093..cf5ad42187 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -692,3 +692,23 @@ def test_write_stringobjects__fail(self, tmp_path): ) with pytest.raises(ValueError, match=msg): iris.save(cube, filepath) + + +class TestSaveloadBadUnicodeAsBytes: + def test_save_load_bad_unicode(self, tmp_path): + filepath = tmp_path / "bad_unicode_utf8.nc" + test_string = "marré" + bytes_array = test_string.encode("utf8") + s1_array = np.array([bytes_array[i : i + 1] for i in range(len(bytes_array))]) + s1_array_bad_utf8 = s1_array[:-1] # invalid without the last byte + cube = Cube(s1_array_bad_utf8, attributes={"_Encoding": "utf8"}) + iris.save(cube, filepath) + # First check for error when reading back *normally* + msg = "could not be decoded with the 'utf-8' encoding" + with pytest.raises(ValueError, match=msg): + iris.load(filepath) + # .. but OK in byte-reading mode + with iris.fileformats.netcdf.DECODE_TO_STRINGS_ON_READ.context(False): + readback_cube = iris.load_cube(filepath) + assert readback_cube.dtype == "S1" + assert np.all(readback_cube.data == s1_array_bad_utf8) From 5d5bedc79a6120d52b07efc3a9bf6ed8bcfd89be Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 29 May 2026 16:29:56 +0100 Subject: [PATCH 11/12] Remove various obsolete comments, and debug code. --- lib/iris/cube.py | 4 ---- lib/iris/fileformats/_nc_load_rules/helpers.py | 4 ---- .../fileformats/netcdf/_bytecoding_datasets.py | 1 - lib/iris/fileformats/netcdf/_thread_safe_nc.py | 2 +- .../tests/integration/netcdf/test_stringdata.py | 14 -------------- .../fileformats/netcdf/test_bytecoding_datasets.py | 10 +--------- 6 files changed, 2 insertions(+), 33 deletions(-) diff --git a/lib/iris/cube.py b/lib/iris/cube.py index 038950bc7a..0bed1c2d10 100644 --- a/lib/iris/cube.py +++ b/lib/iris/cube.py @@ -1280,10 +1280,6 @@ def __init__( ... (longitude, 1)]) """ - # # Temporary error while we transition the API. - # if isinstance(data, str): - # raise TypeError("Invalid data type: {!r}.".format(data)) - # Configure the metadata manager. self._metadata_manager = metadata_manager_factory(CubeMetadata) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 0815be335d..7c4810ffe7 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1637,11 +1637,7 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. coord_dims = cf_coord_var.dimensions - # if cf._is_str_dtype(cf_coord_var): - # coord_dims = coord_dims[:-1] datavar_dims = engine.cf_var.dimensions - # if cf._is_str_dtype(engine.cf_var): - # datavar_dims = datavar_dims[:-1] common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index da36c10c98..cab4eb9421 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -116,7 +116,6 @@ def encode_stringarray_as_bytearray( raise ValueError(msg) from err n_bytes = len(bytes) - # TODO: may want to issue warning or error if we overflow the length? if n_bytes > string_dimension_length: from iris.exceptions import TranslationError diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 8bc1af7a7c..486ad518fc 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -160,7 +160,7 @@ class GroupWrapper(_ThreadSafeWrapper): # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] # Class to use when creating variable wrappers (default=VariableWrapper). - # - needed to support _byte_encoded_data.EncodedDataset. + # - needed to support _bytecoding_datasets.EncodedDataset. VAR_WRAPPER_CLS = VariableWrapper GRP_WRAPPER_CLS: typing.Any | None = None # self-reference : fill in later diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index cf5ad42187..8cc7d9a7eb 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -38,9 +38,6 @@ def all_lazy_auxcoords(): N_XDIM = 3 N_CHARS_DIM = 64 -# TODO: remove (debug) -# PERSIST_TESTFILES: str | None = "~/chararray_testfiles" -PERSIST_TESTFILES: str | None = None NO_ENCODING_STR = "" ALIAS_UTF8_STR = "UTF8" # an alternative acceptable form (should be written as-is) @@ -228,11 +225,6 @@ def readtest_path( use_separate_dims, ) -> Iterable[SamplefileDetails]: """Create a suitable valid testfile, and return expected string content.""" - match PERSIST_TESTFILES: - case str(): - tmp_path = Path(PERSIST_TESTFILES).expanduser() - case _: - pass if encoding == "": filetag = "noencoding" else: @@ -254,10 +246,6 @@ def readtest_data( encoding_str=encoding, coords_on_separate_dim=use_separate_dims, ) - - # # TODO: temporary for debug -- TO REMOVE - # from iris.tests.integration.netcdf.test_chararrays import ncdump - # ncdump(str(tempfile_path)) return testdata @pytest.fixture(params=["strings", "bytes"]) @@ -425,8 +413,6 @@ def write_bytes(self, request): @pytest.fixture def writetest_path(self, encoding, write_bytes, lazy_data, tmp_path): """Create a suitable test cube, with either string or byte content.""" - if PERSIST_TESTFILES: - tmp_path = Path(PERSIST_TESTFILES).expanduser() if encoding == "": filetag = "noencoding" else: diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index f153033fa9..8432a0831f 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -48,17 +48,9 @@ def encoding(request): return request.param -# TODO: remove (debug) -# PERSIST_TESTFILES: str | None = "~/chararray_testfiles" -PERSIST_TESTFILES: str | None = None - - @pytest.fixture(scope="module") def tempdir(tmp_path_factory): - if PERSIST_TESTFILES: - path = Path(PERSIST_TESTFILES).expanduser() - else: - path = tmp_path_factory.mktemp("netcdf") + path = tmp_path_factory.mktemp("netcdf") return path From cbbaf5ab96c5b6c71278a62045971d77fb3b1faa Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 8 Jun 2026 17:36:22 +0100 Subject: [PATCH 12/12] Review changes. --- lib/iris/fileformats/netcdf/saver.py | 6 ++---- lib/iris/tests/integration/netcdf/test_stringdata.py | 3 ++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 49fbbb1221..4938f481c4 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1894,10 +1894,8 @@ def _create_generic_cf_array_var( # seriously limits the utility of DECODE_TO_STRINGS_ON_READ. # TODO: also support netCDF variable-length strings ("string" type). # Currently hit a **write error here**, being numpy object dtype ("O"). - if ( - data.dtype.kind not in "iufSU" - or data.dtype.kind == "S" - and data.dtype.itemsize != 1 + if data.dtype.kind not in "iufSU" or ( + data.dtype.kind == "S" and data.dtype.itemsize != 1 ): # This is a type of data we don't "understand". # NB this includes "Sxx" types other than "S1" : It seems that diff --git a/lib/iris/tests/integration/netcdf/test_stringdata.py b/lib/iris/tests/integration/netcdf/test_stringdata.py index 8cc7d9a7eb..925da599a6 100644 --- a/lib/iris/tests/integration/netcdf/test_stringdata.py +++ b/lib/iris/tests/integration/netcdf/test_stringdata.py @@ -612,6 +612,7 @@ def test_read_no_encoding(self, tmp_path, data_encoding): if data_encoding == "utf8": assert np.all(cube.data == testdata.datavar_data) else: + # NOTE: no error on loading, only when you fetch content + it decodes. msg = "Character data .* could not be decoded with the 'utf-8' encoding" with pytest.raises(ValueError, match=msg): cube.data @@ -624,7 +625,7 @@ def test_read_wrong_encoding__fail(self, tmp_path): encoding_attr="utf-8", ) cube = iris.load_cube(filepath) - # NOTE: error only occurs when you attempt to fetch + translate the content. + # NOTE: no error on loading, only when you fetch content + it decodes. msg = "Character data .* could not be decoded with the 'utf-8' encoding." with pytest.raises(ValueError, match=msg): data = cube.data