Chardata moretests #7101

pp-mo · 2026-06-11T14:49:37Z

pp-mo · 2026-06-11T14:44:44Z

pp-mo · 2026-06-11T14:48:21Z

Logic changed and code moved elsewhere: see here.

pp-mo · 2026-06-11T14:48:05Z

Moved from above, following the change to the dtype-testing logic.

-Original file line number
+Diff line change
@@ Expand Up / @@ -1280,10 +1280,6 @@ def __init__( @@
                 ...                                  (longitude, 1)])
             """
-            # Temporary error while we transition the API.
-            if isinstance(data, str):
-                raise TypeError("Invalid data type: {!r}.".format(data))
             # Configure the metadata manager.
             self._metadata_manager = metadata_manager_factory(CubeMetadata)
@@ Expand Down Expand Up / @@ -4468,15 +4464,20 @@ def __eq__(self, other): @@
                 # Having checked everything else, check approximate data equality.
                 if result and not dataless_equality:
-                    # TODO: why do we use allclose() here, but strict equality in
-                    #  _DimensionalMetadata (via util.array_equal())?
-                    result = bool(
-                        np.allclose(
-                            self.core_data(),
-                            other.core_data(),
-                            equal_nan=True,
+                    if self.dtype.kind in "if":
+                        # numbers
+                        # TODO: why do we use allclose() here, but strict equality in
+                        #  _DimensionalMetadata (via util.array_equal())?
+                        result = bool(
+                            np.allclose(
+                                self.core_data(),
+                                other.core_data(),
+                                equal_nan=True,
+                            )
                         )
-                    )
+                    else:
+                        # non-numeric: use exact equality
+                        result = bool(np.all(self.core_data() == other.core_data()))
             return result
         # Must supply __ne__, Python does not defer to __eq__ for negative equality
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -1637,11 +1637,7 @@ def _add_auxiliary_coordinate( @@
         # Determine the name of the dimension/s shared between the CF-netCDF data variable
         # and the coordinate being built.
         coord_dims = cf_coord_var.dimensions
-        # if cf._is_str_dtype(cf_coord_var):
-        #     coord_dims = coord_dims[:-1]
         datavar_dims = engine.cf_var.dimensions
-        # if cf._is_str_dtype(engine.cf_var):
-        #     datavar_dims = datavar_dims[:-1]
         common_dims = [dim for dim in coord_dims if dim in datavar_dims]
         data_dims = None
         if common_dims:
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
                 self._with_ugrid = False
             # Read the variables in the dataset only once to reduce runtime.
+            # Turn off *any* automatic decoding in the underlying netCDF4 dataset
+            ds = self._dataset
+            if isinstance(ds, _thread_safe_nc.DatasetWrapper):
+                ds._contained_instance.set_auto_chartostring(False)
+            else:
+                ds.set_auto_chartostring(False)
             variables = self._dataset.variables
             self._translate(variables)
             self._build_cf_groups(variables)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -116,7 +116,6 @@ def encode_stringarray_as_bytearray( @@
                 raise ValueError(msg) from err
             n_bytes = len(bytes)
-            # TODO: may want to issue warning or error if we overflow the length?
             if n_bytes > string_dimension_length:
                 from iris.exceptions import TranslationError
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -160,7 +160,7 @@ class GroupWrapper(_ThreadSafeWrapper): @@
         # Note: will also accept a whole Dataset object, but that is OK.
         _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
         # Class to use when creating variable wrappers (default=VariableWrapper).
-        # - needed to support _byte_encoded_data.EncodedDataset.
+        # - needed to support _bytecoding_datasets.EncodedDataset.
         VAR_WRAPPER_CLS = VariableWrapper
         GRP_WRAPPER_CLS: typing.Any | None = None  # self-reference : fill in later
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Chardata moretests #7101

Uh oh!

Diff view

Diff view

There are no files selected for viewing

pp-mo Jun 11, 2026

Uh oh!

pp-mo Jun 11, 2026

Uh oh!

pp-mo Jun 11, 2026

Uh oh!

pp-mo Jun 11, 2026

Uh oh!

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ Expand Up / @@ -1719,20 +1719,23 @@ def add_names_attrs(): @@
             if element.units.calendar:
                 _setncattr(cf_var, "calendar", str(element.units.calendar))
+            # Take a copy so we can remove things
+            element_attrs = element.attributes.copy()
             # Note: when writing UGRID, "element" can be a Mesh which has no "dtype",
             # and for dataless cubes it will have a 'None' dtype.
             if getattr(element, "dtype", None) is not None:
                 # Most attributes are dealt with later.  But _Encoding needs to be defined
                 #  *before* we can write to a character variable.
-                if element.dtype.kind in "SU" and "_Encoding" in element.attributes:
-                    encoding = element.attributes.pop("_Encoding")
+                if element.dtype.kind in "SU" and "_Encoding" in element_attrs:
+                    encoding = element_attrs.pop("_Encoding")
                     _setncattr(cf_var, "_Encoding", encoding)
             if not isinstance(element, Cube):
                 # Add any other custom coordinate attributes.
                 # N.B. not Cube, which has specific handling in  _create_cf_data_variable
-                for name in sorted(element.attributes):
-                    value = element.attributes[name]
+                for name in sorted(element_attrs):
+                    value = element_attrs[name]
                     if name == "STASH":
                         # Adopting provisional Metadata Conventions for representing MO
@@ Expand Down Expand Up / @@ -1830,8 +1833,8 @@ def _create_generic_cf_array_var( @@
             if cube is not None and data is not None and cube.shape != data.shape:
                 compression_kwargs = {}
-            if not is_dataless and np.issubdtype(data.dtype, np.str_):
-                # Deal with string-type variables.
+            if not is_dataless and data.dtype.kind == "U":
+                # Deal with unicode-string-type variables.
                 # Typically CF label variables, but also possibly ancil-vars ?
                 # NOTE: all we are doing here is to calculate the byte dimension length,
@@ Expand All / @@ -1840,37 +1843,26 @@ def _create_generic_cf_array_var( @@
                 # being a _bytecoding_datasets.EncodedVariable.
                 string_dimension_depth = data.dtype.itemsize
-                if data.dtype.kind == "U":
-                    # String content (U) instead of bytes (S).
-                    # For numpy strings, itemsize is **always** a multiple of 4
-                    if string_dimension_depth % 4 != 0:
-                        msg = (
-                            "Unexpected numpy string 'itemsize' for element "
-                            f"{cube_or_mesh.name()}: "
-                            f"'dtype.itemsize = {string_dimension_depth}, expected "
-                            "a multiple of four (always)."
-                        )
-                        raise ValueError(msg)
-                    nchars = string_dimension_depth // 4
-                    encoding_attr = element.attributes.get("_Encoding", "ascii")
-                    # Look this up + return a supported encoding name
-                    # NB implements defaults and raises a warning if given not recognised.
-                    encoding = bytecoding_datasets._identify_encoding(
-                        encoding=encoding_attr, var_name=cf_name, writing=True
+                # String content (U) instead of bytes (S).
+                # For numpy strings, itemsize is **always** a multiple of 4
+                if string_dimension_depth % 4 != 0:
+                    msg = (
+                        "Unexpected numpy string 'itemsize' for element "
+                        f"{cube_or_mesh.name()}: "
+                        f"'dtype.itemsize = {string_dimension_depth}, expected "
+                        "a multiple of four (always)."
                     )
-                    width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
-                    string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
-                else:
-                    if data.dtype.kind != "S" or data.dtype.itemsize != 1:
-                        # Some type of data we don't "understand".
-                        # NB this includes "Sxx" types other than "S1" :  It seems that
-                        # netCDF4 saves Sxx as variable-length strings.  But we don't support that type in Iris.
-                        msg = (
-                            f"Variable {cf_name!r} has unexpected string/character dtype, "
-                            f"{data.dtype} -- should be either 'S' or 'U' type."
-                        )
-                        raise ValueError(msg)
+                    raise ValueError(msg)
+                nchars = string_dimension_depth // 4
+                encoding_attr = element.attributes.get("_Encoding", "ascii")
+                # Look this up + return a supported encoding name
+                # NB implements defaults and raises a warning if given not recognised.
+                encoding = bytecoding_datasets._identify_encoding(
+                    encoding=encoding_attr, var_name=cf_name, writing=True
+                )
+                width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
+                string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
                 string_dimension_name = "string%d" % string_dimension_depth
@@ Expand All / @@ -1890,12 +1882,32 @@ def _create_generic_cf_array_var( @@
                 # Create the label coordinate variable.
                 cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
             else:
-                # A normal (numeric) variable.
+                # A non-string variable.
                 # ensure a valid datatype for the file format.
                 if is_dataless:
                     dtype = self._DATALESS_DTYPE
                     fill_value = self._DATALESS_FILLVALUE
                 else:
+                    # Normal non-string data.
+                    # NOTE: this includes byte-arrays (S1 only) : however these must
+                    # use an actual cube dimension for the 'string dimension', which
+                    # seriously limits the utility of DECODE_TO_STRINGS_ON_READ.
+                    # TODO: also support netCDF variable-length strings ("string" type).
+                    #  Currently hit a **write error here**, being numpy object dtype ("O").
+                    if data.dtype.kind not in "iufSU" or (
+                        data.dtype.kind == "S" and data.dtype.itemsize != 1
+                    ):
+                        # This is a type of data we don't "understand".
+                        # NB this includes "Sxx" types other than "S1" :  It seems that
+                        # netCDF4 saves Sxx as variable-length strings.
+                        # But we don't support that type in Iris.
+                        msg = (
+                            f"Variable {cf_name!r} has unexpected dtype, {data.dtype!r}."
+                            f"Data content arrays must be numeric, or contain "
+                            "single-bytes (dtype 'S1'), or unicode strings (dtype 'U<n>')."
+                        )
+                        raise ValueError(msg)
                     element_type = type(element).__name__
                     data = self._ensure_valid_dtype(data, element_type, element)
                     if not packing_controls:
@@ Expand Down @@

Uh oh!

Chardata moretests #7101

Uh oh!

Chardata moretests #7101

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

pp-mo Jun 11, 2026

Choose a reason for hiding this comment

Uh oh!

pp-mo Jun 11, 2026

Choose a reason for hiding this comment

Uh oh!

pp-mo Jun 11, 2026

Choose a reason for hiding this comment

Uh oh!

pp-mo Jun 11, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!