From 0bbc9d14b5740dea8073278edfd4ffbcc2e6ad38 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Mon, 27 Oct 2025 15:04:51 -0600 Subject: [PATCH 01/10] tentative fixes for issue #1440 --- Changelog | 2 ++ include/netcdf-compat.h | 2 +- src/netCDF4/_netCDF4.pyx | 13 +++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Changelog b/Changelog index f9d0de0f5..455bd1e06 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,5 @@ + version 1.7.4 (not yet released) + ================================ version 1.7.3 (tag v1.7.3rel) ============================= * Python 3.14 wheels (issue #1432) diff --git a/include/netcdf-compat.h b/include/netcdf-compat.h index d1144d979..ccfb8322e 100644 --- a/include/netcdf-compat.h +++ b/include/netcdf-compat.h @@ -60,7 +60,7 @@ static inline int nc_get_alignment(int* thresholdp, int* alignmentp) { #else #define HAS_NCRCSET 0 static inline int nc_rc_set(const char* key, const char* value) { return NC_EINVAL; } -static inline const char *nc_rc_get(const char* key) { return NC_EINVAL; } +static inline const char *nc_rc_get(const char* key) { return NULL; } #endif #if NC_VERSION_GE(4, 4, 0) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index acdfdf5a5..487dbcb02 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -5525,7 +5525,11 @@ cannot be safely cast to variable data type""" % attname # if data is a string or a bytes object, convert to a numpy string array # whose length is equal to the rightmost dimension of the # variable. - if type(data) in [str,bytes]: data = numpy.asarray(data,dtype='S'+repr(self.shape[-1])) + if type(data) in [str,bytes]: + if encoding == 'ascii': + data = numpy.asarray(data,dtype='S'+repr(self.shape[-1])) + else: + data = numpy.asarray(data,dtype='U'+repr(self.shape[-1])) if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1: # if data is a numpy string array, convert it to an array # of characters with one more dimension. @@ -6816,15 +6820,12 @@ returns a numpy string array with datatype `'UN'` (or `'SN'`) and shape dtype = b.dtype.kind if dtype not in ["S","U"]: raise ValueError("type must be string or unicode ('S' or 'U')") - if encoding in ['none','None','bytes']: - bs = b.tobytes() - else: - bs = b.tobytes().decode(encoding) + bs = b.tobytes() slen = int(b.shape[-1]) if encoding in ['none','None','bytes']: a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'S'+repr(slen)) else: - a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'U'+repr(slen)) + a = numpy.array([bs[n1:n1+slen].decode(encoding) for n1 in range(0,len(bs),slen)],'U'+repr(slen)) a.shape = b.shape[:-1] return a From 826b634a7896db9e4e1e9aaa41b267010fcbba12 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Mon, 27 Oct 2025 19:19:24 -0600 Subject: [PATCH 02/10] update --- src/netCDF4/_netCDF4.pyx | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 487dbcb02..5f9108b1e 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -5533,7 +5533,7 @@ cannot be safely cast to variable data type""" % attname if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1: # if data is a numpy string array, convert it to an array # of characters with one more dimension. - data = stringtochar(data, encoding=encoding) + data = stringtochar(data, encoding=encoding,n_strlen=self.shape[-1]) # if structured data has strings (and _Encoding att set), create view as char arrays # (issue #773) @@ -6775,9 +6775,9 @@ returns a rank 1 numpy character array of length NUMCHARS with datatype `'S1'` arr[0:len(string)] = tuple(string) return arr -def stringtochar(a,encoding='utf-8'): +def stringtochar(a,encoding='utf-8',n_strlen=None): """ -**`stringtochar(a,encoding='utf-8')`** +**`stringtochar(a,encoding='utf-8',n_strlen=None)`** convert a string array to a character array with one extra dimension @@ -6789,16 +6789,29 @@ optional kwarg `encoding` can be used to specify character encoding (default `utf-8`). If `encoding` is 'none' or 'bytes', a `numpy.string_` the input array is treated a raw byte strings (`numpy.string_`). +optional kwarg `n_strlen` is the number of characters in each string. Default +is None, which means `n_strlen` will be set to a.itemsize (the number of bytes +used to represent each string in the input array). + returns a numpy character array with datatype `'S1'` or `'U1'` and shape `a.shape + (N,)`, where N is the length of each string in a.""" dtype = a.dtype.kind + if n_strlen is None: + n_strlen = a.dtype.itemsize if dtype not in ["S","U"]: raise ValueError("type must string or unicode ('S' or 'U')") if encoding in ['none','None','bytes']: b = numpy.array(tuple(a.tobytes()),'S1') + elif encoding == 'ascii': + b = numpy.array(tuple(a.tobytes().decode('ascii'))) + b.shape = a.shape + (n_strlen,) else: - b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1') - b.shape = a.shape + (a.itemsize,) + if not a.ndim: + a = numpy.array([a]) + bbytes = [text.encode(encoding) for text in a] + pad = b'\0' * n_strlen + bbytes = [(x + pad)[:n_strlen] for x in bbytes] + b = numpy.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes]) return b def chartostring(b,encoding='utf-8'): From 915c133b7680df723339942d4ce870579f441b70 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Mon, 27 Oct 2025 20:22:41 -0600 Subject: [PATCH 03/10] update --- src/netCDF4/_netCDF4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 5f9108b1e..87880cc45 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -6803,7 +6803,7 @@ and shape `a.shape + (N,)`, where N is the length of each string in a.""" if encoding in ['none','None','bytes']: b = numpy.array(tuple(a.tobytes()),'S1') elif encoding == 'ascii': - b = numpy.array(tuple(a.tobytes().decode('ascii'))) + b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1') b.shape = a.shape + (n_strlen,) else: if not a.ndim: From c5c25875b647fa0d76fb90a53bb8f678b4f73e67 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Mon, 27 Oct 2025 20:48:23 -0600 Subject: [PATCH 04/10] update stringtochar stub --- src/netCDF4/__init__.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/src/netCDF4/__init__.pyi b/src/netCDF4/__init__.pyi index 97062cf51..f07aa0b8f 100644 --- a/src/netCDF4/__init__.pyi +++ b/src/netCDF4/__init__.pyi @@ -699,6 +699,7 @@ def stringtoarr( def stringtochar( a: npt.NDArray[np.character], encoding: Literal["none", "None", "bytes"], + n_strlen: int ) -> npt.NDArray[np.bytes_]: ... @overload def stringtochar( From 4b7201e50e21e41e7f19e4e8f8559e98b8e4284e Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Mon, 27 Oct 2025 20:57:24 -0600 Subject: [PATCH 05/10] update --- src/netCDF4/__init__.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/__init__.pyi b/src/netCDF4/__init__.pyi index f07aa0b8f..e27fb6370 100644 --- a/src/netCDF4/__init__.pyi +++ b/src/netCDF4/__init__.pyi @@ -699,7 +699,7 @@ def stringtoarr( def stringtochar( a: npt.NDArray[np.character], encoding: Literal["none", "None", "bytes"], - n_strlen: int + n_strlen: int | None = None, ) -> npt.NDArray[np.bytes_]: ... @overload def stringtochar( From 4e0abacdc5821bdbc1ce16b3e1eda2d081b84ac0 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Tue, 28 Oct 2025 09:03:35 -0600 Subject: [PATCH 06/10] add test for issue1440 --- test/test_stringarr.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_stringarr.py b/test/test_stringarr.py index 9d4fcd909..aed2203df 100644 --- a/test/test_stringarr.py +++ b/test/test_stringarr.py @@ -3,6 +3,7 @@ import unittest import os from numpy.testing import assert_array_equal, assert_array_almost_equal +import numpy as np def generateString(length, alphabet=string.ascii_letters + string.digits + string.punctuation): return(''.join([random.choice(alphabet) for i in range(length)])) @@ -20,6 +21,10 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin datau = data.astype('U') datac = stringtochar(data, encoding='ascii') +nx, n_strlen = 3, 10 +unicode_strings = np.array(['Münster', 'London', 'Amsterdam'],dtype='U'+str(n_strlen)) +unicode_strings2 = np.array(['Münster', 'Liége', 'Amsterdam'],dtype='U'+str(n_strlen)) + class StringArrayTestCase(unittest.TestCase): def setUp(self): @@ -28,6 +33,8 @@ def setUp(self): nc.createDimension('n1',None) nc.createDimension('n2',n2) nc.createDimension('nchar',nchar) + nc.createDimension("x", nx) + nc.createDimension("nstr", n_strlen) v = nc.createVariable('strings','S1',('n1','n2','nchar')) v2 = nc.createVariable('strings2','S1',('n1','n2','nchar')) # if _Encoding set, string array should automatically be converted @@ -44,6 +51,11 @@ def setUp(self): v2[-1,-1] = data[-1,-1].tobytes() # write single python string # _Encoding should be ignored if an array of characters is specified v3[:] = stringtochar(data, encoding='ascii') + # test unicode strings (issue #1440) + v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",)) + v4._Encoding = "UTF-8" + v4[:] = unicode_strings + v4[1] = "Liége" nc.close() def tearDown(self): @@ -57,6 +69,8 @@ def runTest(self): v = nc.variables['strings'] v2 = nc.variables['strings2'] v3 = nc.variables['strings3'] + v4 = nc.variables['strings4'] + assert np.all(v4[:]==unicode_strings2) assert v.dtype.str[1:] in ['S1','U1'] assert v.shape == (nrecs,n2,nchar) for nrec in range(nrecs): From 51edbab95ba10083f592615b789238d255ff403e Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Tue, 28 Oct 2025 09:38:06 -0600 Subject: [PATCH 07/10] update --- Changelog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Changelog b/Changelog index 455bd1e06..c74134c2a 100644 --- a/Changelog +++ b/Changelog @@ -1,5 +1,8 @@ version 1.7.4 (not yet released) ================================ + * Make sure automatic conversion of character arrays <--> string arrays works for Unicode strings (issue #1440). + (previously only worked correctly for encoding="ascii"). + version 1.7.3 (tag v1.7.3rel) ============================= * Python 3.14 wheels (issue #1432) From ea3d3a16b634aa60897114de0fc906a149220a6c Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Tue, 28 Oct 2025 09:41:25 -0600 Subject: [PATCH 08/10] update --- src/netCDF4/_netCDF4.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index 87880cc45..f0025dcd2 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -1066,7 +1066,7 @@ If the `_Encoding` special attribute is set for a character array (dtype `S1`) variable, the `chartostring` utility function is used to convert the array of characters to an array of strings with one less dimension (the last dimension is interpreted as the length of each string) when reading the data. The character -set (usually ascii) is specified by the `_Encoding` attribute. If `_Encoding` +set is specified by the `_Encoding` attribute. If `_Encoding` is 'none' or 'bytes', then the character array is converted to a numpy fixed-width byte string array (dtype `S#`), otherwise a numpy unicode (dtype `U#`) array is created. When writing the data, From a0486d2ba8baef19dc243ca4431988c8e6bed0a3 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Tue, 28 Oct 2025 10:00:10 -0600 Subject: [PATCH 09/10] update --- test/test_stringarr.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_stringarr.py b/test/test_stringarr.py index aed2203df..36a3ea03f 100644 --- a/test/test_stringarr.py +++ b/test/test_stringarr.py @@ -21,9 +21,9 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin datau = data.astype('U') datac = stringtochar(data, encoding='ascii') -nx, n_strlen = 3, 10 -unicode_strings = np.array(['Münster', 'London', 'Amsterdam'],dtype='U'+str(n_strlen)) -unicode_strings2 = np.array(['Münster', 'Liége', 'Amsterdam'],dtype='U'+str(n_strlen)) +nx, n_strlen = 3, 12 +unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen)) +unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen)) class StringArrayTestCase(unittest.TestCase): @@ -55,7 +55,7 @@ def setUp(self): v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",)) v4._Encoding = "UTF-8" v4[:] = unicode_strings - v4[1] = "Liége" + v4[1] = "Москва" nc.close() def tearDown(self): @@ -70,6 +70,7 @@ def runTest(self): v2 = nc.variables['strings2'] v3 = nc.variables['strings3'] v4 = nc.variables['strings4'] + print(v4[:]) assert np.all(v4[:]==unicode_strings2) assert v.dtype.str[1:] in ['S1','U1'] assert v.shape == (nrecs,n2,nchar) From caed197d76a8cd86bc2362e9d672ae3103f7d109 Mon Sep 17 00:00:00 2001 From: jswhit2 Date: Tue, 28 Oct 2025 10:10:41 -0600 Subject: [PATCH 10/10] update --- test/test_stringarr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_stringarr.py b/test/test_stringarr.py index 36a3ea03f..780dd7ca7 100644 --- a/test/test_stringarr.py +++ b/test/test_stringarr.py @@ -24,6 +24,7 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin nx, n_strlen = 3, 12 unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen)) unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen)) +unicode_strings2_bytes = [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'\xd0', b'\x9c', b'\xd0', b'\xbe', b'\xd1', b'\x81', b'\xd0', b'\xba', b'\xd0', b'\xb2', b'\xd0', b'\xb0', b'\xe6', b'\x9d', b'\xb1', b'\xe4', b'\xba', b'\xac'] class StringArrayTestCase(unittest.TestCase): @@ -70,8 +71,9 @@ def runTest(self): v2 = nc.variables['strings2'] v3 = nc.variables['strings3'] v4 = nc.variables['strings4'] - print(v4[:]) assert np.all(v4[:]==unicode_strings2) + v4.set_auto_chartostring(False) + assert (v4[:].compressed().tolist() == unicode_strings2_bytes) assert v.dtype.str[1:] in ['S1','U1'] assert v.shape == (nrecs,n2,nchar) for nrec in range(nrecs):