From f3dbab8edf40e96bef869903083e28cb2d94bdae Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 18 Mar 2019 18:45:37 +0900 Subject: [PATCH] wip --- Include/cpython/unicodeobject.h | 11 ++ Modules/_ctypes/_ctypes.c | 10 +- Modules/_ctypes/cfield.c | 15 ++- Modules/arraymodule.c | 10 ++ Objects/unicodeobject.c | 173 ++++++++++++++++++++++---------- Python/getargs.c | 5 + Python/traceback.c | 4 + 7 files changed, 162 insertions(+), 66 deletions(-) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 4eecc963ae2da1d..2012926a52a7afe 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -218,7 +218,9 @@ typedef struct { 4 bytes (see issue #19537 on m68k). */ unsigned int :24; } state; +#ifdef Py_WCHAR_CACHE wchar_t *wstr; /* wchar_t representation (null-terminated) */ +#endif } PyASCIIObject; /* Non-ASCII strings allocated through PyUnicode_New use the @@ -229,8 +231,10 @@ typedef struct { Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the * terminating \0. */ char *utf8; /* UTF-8 representation (null-terminated) */ +#ifdef Py_WCHAR_CACHE Py_ssize_t wstr_length; /* Number of code points in wstr, possible * surrogates count as two code points. */ +#endif } PyCompactUnicodeObject; /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the @@ -246,6 +250,9 @@ typedef struct { } data; /* Canonical, smallest-form Unicode buffer */ } PyUnicodeObject; + +#ifdef Py_WCHAR_CACHE + /* Fast access macros */ #define PyUnicode_WSTR_LENGTH(op) \ (PyUnicode_IS_COMPACT_ASCII(op) ? \ @@ -285,6 +292,8 @@ typedef struct { ((const char *)(PyUnicode_AS_UNICODE(op))) /* Py_DEPRECATED(3.3) */ +#endif + /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ @@ -568,6 +577,7 @@ PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( Py_ssize_t start, Py_ssize_t end); +#ifdef Py_WCHAR_CACHE /* Return a read-only pointer to the Unicode object's internal Py_UNICODE buffer. If the wchar_t/Py_UNICODE representation is not yet available, this @@ -595,6 +605,7 @@ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( /* Get the maximum ordinal for a Unicode character. */ PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void) Py_DEPRECATED(3.3); +#endif /* --- _PyUnicodeWriter API ----------------------------------------------- */ diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c index 0d95d2b6f76eceb..bee2764187fedde 100644 --- a/Modules/_ctypes/_ctypes.c +++ b/Modules/_ctypes/_ctypes.c @@ -1293,8 +1293,6 @@ static int WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored)) { Py_ssize_t result = 0; - Py_UNICODE *wstr; - Py_ssize_t len; if (value == NULL) { PyErr_SetString(PyExc_TypeError, @@ -1309,12 +1307,12 @@ WCharArray_set_value(CDataObject *self, PyObject *value, void *Py_UNUSED(ignored } else Py_INCREF(value); - wstr = PyUnicode_AsUnicodeAndSize(value, &len); - if (wstr == NULL) + Py_ssize_t len = PyUnicode_AsWideChar(value, NULL, 0); + if (len < 0) { return -1; + } if ((size_t)len > self->b_size/sizeof(wchar_t)) { - PyErr_SetString(PyExc_ValueError, - "string too long"); + PyErr_SetString(PyExc_ValueError, "string too long"); result = -1; goto done; } diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c index 5f194e21550f7ba..693e500c267522e 100644 --- a/Modules/_ctypes/cfield.c +++ b/Modules/_ctypes/cfield.c @@ -1229,9 +1229,6 @@ U_get(void *ptr, Py_ssize_t size) static PyObject * U_set(void *ptr, PyObject *value, Py_ssize_t length) { - Py_UNICODE *wstr; - Py_ssize_t size; - /* It's easier to calculate in characters than in bytes */ length /= sizeof(wchar_t); @@ -1242,9 +1239,10 @@ U_set(void *ptr, PyObject *value, Py_ssize_t length) return NULL; } - wstr = PyUnicode_AsUnicodeAndSize(value, &size); - if (wstr == NULL) + Py_ssize_t size = PyUnicode_AsWideChar(value, NULL, 0); + if (size < 0) { return NULL; + } if (size > length) { PyErr_Format(PyExc_ValueError, "string too long (%zd, maximum length %zd)", @@ -1421,11 +1419,10 @@ BSTR_set(void *ptr, PyObject *value, Py_ssize_t size) /* create a BSTR from value */ if (value) { - wchar_t* wvalue; - Py_ssize_t wsize; - wvalue = PyUnicode_AsUnicodeAndSize(value, &wsize); - if (wvalue == NULL) + Py_ssize_t wsize = PyUnicode_AsWideChar(value, NULL, 0); + if (wsize < 0) { return NULL; + } if ((unsigned) wsize != wsize) { PyErr_SetString(PyExc_ValueError, "String too long for BSTR"); return NULL; diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c index a5ba27cb36e2180..86df8684d6941b7 100644 --- a/Modules/arraymodule.c +++ b/Modules/arraymodule.c @@ -532,7 +532,9 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v) DEFINE_COMPAREITEMS(b, signed char) DEFINE_COMPAREITEMS(BB, unsigned char) +#ifdef Py_WCHAR_CACHE DEFINE_COMPAREITEMS(u, Py_UNICODE) +#endif DEFINE_COMPAREITEMS(h, short) DEFINE_COMPAREITEMS(HH, unsigned short) DEFINE_COMPAREITEMS(i, int) @@ -550,7 +552,9 @@ DEFINE_COMPAREITEMS(QQ, unsigned long long) static const struct arraydescr descriptors[] = { {'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1}, {'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0}, +#ifdef Py_WCHAR_CACHE {'u', sizeof(Py_UNICODE), u_getitem, u_setitem, u_compareitems, "u", 0, 0}, +#endif {'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1}, {'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0}, {'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1}, @@ -2633,6 +2637,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (!PyArg_ParseTuple(args, "C|O:array", &c, &initial)) return NULL; +#ifdef Py_WCHAR_CACHE if (initial && c != 'u') { if (PyUnicode_Check(initial)) { PyErr_Format(PyExc_TypeError, "cannot use a str to initialize " @@ -2646,12 +2651,15 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return NULL; } } +#endif if (!(initial == NULL || PyList_Check(initial) || PyByteArray_Check(initial) || PyBytes_Check(initial) || PyTuple_Check(initial) +#ifdef Py_WCHAR_CACHE || ((c=='u') && PyUnicode_Check(initial)) +#endif || (array_Check(initial) && c == ((arrayobject*)initial)->ob_descr->typecode))) { it = PyObject_GetIter(initial); @@ -2710,6 +2718,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } Py_DECREF(v); } +#ifdef Py_WCHAR_CACHE else if (initial != NULL && PyUnicode_Check(initial)) { Py_UNICODE *ustr; Py_ssize_t n; @@ -2737,6 +2746,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->allocated = Py_SIZE(self); } } +#endif else if (initial != NULL && array_Check(initial) && len > 0) { arrayobject *self = (arrayobject *)a; arrayobject *other = (arrayobject *)initial; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b3a851a9f8142ba..b0125eea5e57e9f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -138,9 +138,6 @@ extern "C" { (assert(_PyUnicode_CHECK(op)), \ assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) -#define _PyUnicode_SHARE_WSTR(op) \ - (assert(_PyUnicode_CHECK(op)), \ - (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) /* true if the Unicode object has an allocated UTF-8 memory block (not shared with other data) */ @@ -149,6 +146,12 @@ extern "C" { && _PyUnicode_UTF8(op) \ && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) +#ifdef Py_WCHAR_CACHE + +#define _PyUnicode_SHARE_WSTR(op) \ + (assert(_PyUnicode_CHECK(op)), \ + (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) + /* true if the Unicode object has an allocated wstr memory block (not shared with other data) */ #define _PyUnicode_HAS_WSTR_MEMORY(op) \ @@ -156,6 +159,13 @@ extern "C" { (!PyUnicode_IS_READY(op) || \ _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) +#else + +#define _PyUnicode_SHARE_WSTR(op) (assert(_PyUnicode_CHECK(op)), 0) +#define _PyUnicode_HAS_WSTR_MEMORY(op) (assert(_PyUnicode_CHECK(op)), 0) + +#endif + /* Generic helper macro to convert characters of different types. from_type and to_type have to be valid type names, begin and end are pointers to the source characters which should be of type @@ -432,6 +442,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) data = unicode->data.any; if (kind == PyUnicode_WCHAR_KIND) { +#if Py_WCHAR_CACHE ASSERT(ascii->length == 0); ASSERT(ascii->hash == -1); ASSERT(ascii->state.compact == 0); @@ -441,6 +452,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) ASSERT(ascii->wstr != NULL); ASSERT(data == NULL); ASSERT(compact->utf8 == NULL); +#else + ASSERT(0); +#endif } else { ASSERT(kind == PyUnicode_1BYTE_KIND @@ -457,6 +471,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) ASSERT (compact->utf8 != data); } } +#if Py_WCHAR_CACHE if (kind != PyUnicode_WCHAR_KIND) { if ( #if SIZEOF_WCHAR_T == 2 @@ -471,11 +486,14 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) } else ASSERT(ascii->wstr != data); } +#endif if (compact->utf8 == NULL) ASSERT(compact->utf8_length == 0); +#if Py_WCHAR_CACHE if (ascii->wstr == NULL) ASSERT(compact->wstr_length == 0); +#endif } /* check that the best kind is used */ if (check_content && kind != PyUnicode_WCHAR_KIND) @@ -516,6 +534,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) } #endif +#ifdef Py_WCHAR_CACHE static PyObject* unicode_result_wchar(PyObject *unicode) { @@ -550,6 +569,7 @@ unicode_result_wchar(PyObject *unicode) #endif return unicode; } +#endif static PyObject* unicode_result_ready(PyObject *unicode) @@ -595,10 +615,14 @@ static PyObject* unicode_result(PyObject *unicode) { assert(_PyUnicode_CHECK(unicode)); +#ifdef Py_WCHAR_CACHE if (PyUnicode_IS_READY(unicode)) return unicode_result_ready(unicode); else return unicode_result_wchar(unicode); +#else + return unicode_result_ready(unicode); +#endif } static PyObject* @@ -857,12 +881,6 @@ ensure_unicode(PyObject *obj) #include "stringlib/find_max_char.h" #include "stringlib/undef.h" -#include "stringlib/unicodedefs.h" -#include "stringlib/fastsearch.h" -#include "stringlib/count.h" -#include "stringlib/find.h" -#include "stringlib/undef.h" - /* --- Unicode Object ----------------------------------------------------- */ static inline Py_ssize_t @@ -920,7 +938,6 @@ resize_compact(PyObject *unicode, Py_ssize_t length) Py_ssize_t char_size; Py_ssize_t struct_size; Py_ssize_t new_size; - int share_wstr; PyObject *new_unicode; #ifdef Py_DEBUG Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); @@ -935,7 +952,6 @@ resize_compact(PyObject *unicode, Py_ssize_t length) struct_size = sizeof(PyASCIIObject); else struct_size = sizeof(PyCompactUnicodeObject); - share_wstr = _PyUnicode_SHARE_WSTR(unicode); if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { PyErr_NoMemory(); @@ -961,6 +977,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length) _Py_NewReference(unicode); _PyUnicode_LENGTH(unicode) = length; +#ifdef Py_WCHAR_CACHE + int share_wstr = _PyUnicode_SHARE_WSTR(unicode); if (share_wstr) { _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); if (!PyUnicode_IS_ASCII(unicode)) @@ -972,6 +990,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) if (!PyUnicode_IS_ASCII(unicode)) _PyUnicode_WSTR_LENGTH(unicode) = 0; } +#endif #ifdef Py_DEBUG unicode_fill_invalid(unicode, old_length); #endif @@ -981,6 +1000,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) return unicode; } +#if Py_WCHAR_CACHE static int resize_inplace(PyObject *unicode, Py_ssize_t length) { @@ -991,7 +1011,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) if (PyUnicode_IS_READY(unicode)) { Py_ssize_t char_size; - int share_wstr, share_utf8; + int share_utf8; void *data; #ifdef Py_DEBUG Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); @@ -999,7 +1019,6 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) data = _PyUnicode_DATA_ANY(unicode); char_size = PyUnicode_KIND(unicode); - share_wstr = _PyUnicode_SHARE_WSTR(unicode); share_utf8 = _PyUnicode_SHARE_UTF8(unicode); if (length > (PY_SSIZE_T_MAX / char_size - 1)) { @@ -1021,10 +1040,13 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) return -1; } _PyUnicode_DATA_ANY(unicode) = data; +#if Py_WCHAR_CACHE + share_wstr = _PyUnicode_SHARE_WSTR(unicode); if (share_wstr) { _PyUnicode_WSTR(unicode) = data; _PyUnicode_WSTR_LENGTH(unicode) = length; } +#endif if (share_utf8) { _PyUnicode_UTF8(unicode) = data; _PyUnicode_UTF8_LENGTH(unicode) = length; @@ -1034,10 +1056,12 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) #ifdef Py_DEBUG unicode_fill_invalid(unicode, old_length); #endif +#if Py_WCHAR_CACHE if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; } +#endif } assert(_PyUnicode_WSTR(unicode) != NULL); @@ -1059,6 +1083,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; } +#endif /* Py_WCHAR_CACHE */ static PyObject* resize_copy(PyObject *unicode, Py_ssize_t length) @@ -1078,6 +1103,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length) return copy; } else { +#if Py_WCHAR_CACHE PyObject *w; w = (PyObject*)_PyUnicode_New(length); @@ -1088,6 +1114,9 @@ resize_copy(PyObject *unicode, Py_ssize_t length) memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), copy_length * sizeof(wchar_t)); return w; +#else + Py_UNREACHABLE(); +#endif } } @@ -1099,7 +1128,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length) free list never reduces its size below 1. */ - +#ifdef Py_WCHAR_CACHE static PyUnicodeObject * _PyUnicode_New(Py_ssize_t length) { @@ -1159,6 +1188,7 @@ _PyUnicode_New(Py_ssize_t length) assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); return unicode; } +#endif static const char* unicode_kind_name(PyObject *unicode) @@ -1242,12 +1272,16 @@ _PyUnicode_Dump(PyObject *op) printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", unicode_kind_name(op), ascii->length); +#ifdef Py_WCHAR_CACHE if (ascii->wstr == data) printf("shared "); printf("wstr=%p", ascii->wstr); +#endif if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { +#ifdef Py_WCHAR_CACHE printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); +#endif if (!ascii->state.compact && compact->utf8 == unicode->data.any) printf("shared "); printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", @@ -1339,12 +1373,16 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_STATE(unicode).ascii = is_ascii; if (is_ascii) { ((char*)data)[size] = 0; +#ifdef Py_WCHAR_CACHE _PyUnicode_WSTR(unicode) = NULL; +#endif } else if (kind == PyUnicode_1BYTE_KIND) { ((char*)data)[size] = 0; +#ifdef Py_WCHAR_CACHE _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; +#endif unicode->utf8 = NULL; unicode->utf8_length = 0; } @@ -1355,6 +1393,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) ((Py_UCS2*)data)[size] = 0; else /* kind == PyUnicode_4BYTE_KIND */ ((Py_UCS4*)data)[size] = 0; +#ifdef Py_WCHAR_CACHE if (is_sharing) { _PyUnicode_WSTR_LENGTH(unicode) = size; _PyUnicode_WSTR(unicode) = (wchar_t *)data; @@ -1363,6 +1402,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_WSTR_LENGTH(unicode) = 0; _PyUnicode_WSTR(unicode) = NULL; } +#endif } #ifdef Py_DEBUG unicode_fill_invalid((PyObject*)unicode, 0); @@ -1675,6 +1715,7 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, int _PyUnicode_Ready(PyObject *unicode) { +#ifdef Py_WCHAR_CACHE wchar_t *end; Py_UCS4 maxchar = 0; Py_ssize_t num_surrogates; @@ -1799,6 +1840,9 @@ _PyUnicode_Ready(PyObject *unicode) _PyUnicode_STATE(unicode).ready = 1; assert(_PyUnicode_CheckConsistency(unicode, 1)); return 0; +#else + Py_UNREACHABLE(); +#endif } static void @@ -1824,8 +1868,10 @@ unicode_dealloc(PyObject *unicode) Py_FatalError("Inconsistent interned string state."); } +#ifdef Py_WCHAR_CACHE if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) PyObject_DEL(_PyUnicode_WSTR(unicode)); +#endif if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) PyObject_DEL(_PyUnicode_UTF8(unicode)); if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) @@ -1883,10 +1929,14 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) assert(PyUnicode_Check(unicode)); assert(0 <= length); +#if Py_WCHAR_CACHE if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) old_length = PyUnicode_WSTR_LENGTH(unicode); else old_length = PyUnicode_GET_LENGTH(unicode); +#else + old_length = PyUnicode_GET_LENGTH(unicode); +#endif if (old_length == length) return 0; @@ -1913,7 +1963,11 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) *p_unicode = new_unicode; return 0; } +#if Py_WCHAR_CACHE return resize_inplace(unicode, length); +#else + Py_UNREACHABLE(); +#endif } int @@ -2029,8 +2083,14 @@ unicode_char(Py_UCS4 ch) PyObject * PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) { - if (u == NULL) + if (u == NULL) { +#ifdef Py_WCHAR_CACHE return (PyObject*)_PyUnicode_New(size); +#else + PyErr_BadInternalCall(); + return NULL; +#endif + } if (size < 0) { PyErr_BadInternalCall(); @@ -2116,10 +2176,20 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) "Negative size passed to PyUnicode_FromStringAndSize"); return NULL; } - if (u != NULL) + if (u != NULL) { return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); - else + } + else { +#ifdef Py_WCHAR_CACHE return (PyObject *)_PyUnicode_New(size); +#else + if (size == 0) { + _Py_RETURN_UNICODE_EMPTY(); + } + PyErr_BadInternalCall(); + return NULL; +#endif + } } PyObject * @@ -2975,9 +3045,11 @@ unicode_get_widechar_size(PyObject *unicode) assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); +#ifdef Py_WCHAR_CACHE if (_PyUnicode_WSTR(unicode) != NULL) { return PyUnicode_WSTR_LENGTH(unicode); } +#endif assert(PyUnicode_IS_READY(unicode)); res = _PyUnicode_LENGTH(unicode); @@ -3003,11 +3075,14 @@ unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) assert(unicode != NULL); assert(_PyUnicode_CHECK(unicode)); +#ifdef Py_WCHAR_CACHE wstr = _PyUnicode_WSTR(unicode); if (wstr != NULL) { memcpy(w, wstr, size * sizeof(wchar_t)); return; } +#endif /* Py_WCHAR_CACHE */ + assert(PyUnicode_IS_READY(unicode)); if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { @@ -3016,18 +3091,19 @@ unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) *w = *s; } } - else { -#if SIZEOF_WCHAR_T == 4 + else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND); const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode); for (; size--; ++s, ++w) { *w = *s; } -#else + } + else { assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode); for (; size--; ++s, ++w) { Py_UCS4 ch = *s; +#if SIZEOF_WCHAR_T == 2 if (ch > 0xFFFF) { assert(ch <= MAX_UNICODE); /* encode surrogate pair in this case */ @@ -3039,11 +3115,14 @@ unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size) else { *w = ch; } - } +#else + *w = ch; #endif + } } } + #ifdef HAVE_WCHAR_H /* Convert a Unicode object to a wide character string. @@ -3903,6 +3982,7 @@ PyUnicode_AsUTF8(PyObject *unicode) return PyUnicode_AsUTF8AndSize(unicode, NULL); } +#ifdef Py_WCHAR_CACHE Py_UNICODE * PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) { @@ -3975,6 +4055,8 @@ PyUnicode_GetSize(PyObject *unicode) return -1; } +#endif + Py_ssize_t PyUnicode_GetLength(PyObject *unicode) { @@ -10920,6 +11002,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) const unsigned char *ustr = (const unsigned char *)str; assert(_PyUnicode_CHECK(uni)); +#ifdef Py_WCHAR_CACHE if (!PyUnicode_IS_READY(uni)) { const wchar_t *ws = _PyUnicode_WSTR(uni); /* Compare Unicode string and source character set string */ @@ -10935,6 +11018,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) return -1; /* str is longer */ return 0; } +#endif kind = PyUnicode_KIND(uni); if (kind == PyUnicode_1BYTE_KIND) { const void *data = PyUnicode_1BYTE_DATA(uni); @@ -10975,6 +11059,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) static int non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) { +#ifdef Py_WCHAR_CACHE size_t i, len; const wchar_t *p; len = (size_t)_PyUnicode_WSTR_LENGTH(unicode); @@ -10988,6 +11073,9 @@ non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str) return 0; } return 1; +#else + Py_UNREACHABLE(); +#endif } int @@ -11301,7 +11389,7 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) } /* -Wraps stringlib_parse_args_finds() and additionally ensures that the +Wraps asciilib_parse_args_finds() and additionally ensures that the first argument is a unicode object. */ @@ -11310,7 +11398,7 @@ parse_args_finds_unicode(const char * function_name, PyObject *args, PyObject **substring, Py_ssize_t *start, Py_ssize_t *end) { - if(stringlib_parse_args_finds(function_name, args, substring, + if (asciilib_parse_args_finds(function_name, args, substring, start, end)) { if (ensure_unicode(*substring) < 0) return 0; @@ -13285,7 +13373,7 @@ unicode_startswith(PyObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) + if (!asciilib_parse_args_finds("startswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -13339,7 +13427,7 @@ unicode_endswith(PyObject *self, Py_ssize_t end = PY_SSIZE_T_MAX; int result; - if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) + if (!asciilib_parse_args_finds("endswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -13775,8 +13863,10 @@ unicode_sizeof_impl(PyObject *self) } /* If the wstr pointer is present, account for it unless it is shared with the data pointer. Check if the data is not shared. */ +#if Py_WCHAR_CACHE if (_PyUnicode_HAS_WSTR_MEMORY(self)) size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t); +#endif if (_PyUnicode_HAS_UTF8_MEMORY(self)) size += PyUnicode_UTF8_LENGTH(self) + 1; @@ -14976,10 +15066,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) _PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).ready = 1; - _PyUnicode_WSTR(self) = NULL; _PyUnicode_UTF8_LENGTH(self) = 0; _PyUnicode_UTF8(self) = NULL; +#ifdef Py_WCHAR_CACHE + _PyUnicode_WSTR(self) = NULL; _PyUnicode_WSTR_LENGTH(self) = 0; +#endif _PyUnicode_DATA_ANY(self) = NULL; share_utf8 = 0; @@ -15017,10 +15109,12 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) _PyUnicode_UTF8_LENGTH(self) = length; _PyUnicode_UTF8(self) = data; } +#ifdef Py_WCHAR_CACHE if (share_wstr) { _PyUnicode_WSTR_LENGTH(self) = length; _PyUnicode_WSTR(self) = (wchar_t *)data; } +#endif memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1)); @@ -15347,7 +15441,7 @@ unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored)) return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter), it->it_seq, it->it_index); } else { - PyObject *u = (PyObject *)_PyUnicode_New(0); + PyObject *u = PyUnicode_FromStringAndSize("", 0); if (u == NULL) return NULL; return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u); @@ -15529,30 +15623,7 @@ Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) Py_UNICODE* PyUnicode_AsUnicodeCopy(PyObject *unicode) { - Py_UNICODE *u, *copy; - Py_ssize_t len, size; - - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return NULL; - } - u = PyUnicode_AsUnicodeAndSize(unicode, &len); - if (u == NULL) - return NULL; - /* Ensure we won't overflow the size. */ - if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { - PyErr_NoMemory(); - return NULL; - } - size = len + 1; /* copy the null character */ - size *= sizeof(Py_UNICODE); - copy = PyMem_Malloc(size); - if (copy == NULL) { - PyErr_NoMemory(); - return NULL; - } - memcpy(copy, u, size); - return copy; + return PyUnicode_AsWideCharString(unicode, NULL); } /* A _string module, to export formatter_parser and formatter_field_name_split diff --git a/Python/getargs.c b/Python/getargs.c index e50f9b5f5c9dcd7..8cdd40e26733239 100644 --- a/Python/getargs.c +++ b/Python/getargs.c @@ -1070,6 +1070,7 @@ convertsimple(PyObject *arg, const char **p_format, va_list *p_va, int flags, case 'u': /* raw unicode buffer (Py_UNICODE *) */ case 'Z': /* raw unicode buffer or None */ { +#if Py_WCHAR_CACHE Py_UNICODE **p = va_arg(*p_va, Py_UNICODE **); if (*format == '#') { @@ -1108,6 +1109,10 @@ convertsimple(PyObject *arg, const char **p_format, va_list *p_va, int flags, return converterr(c == 'Z' ? "str or None" : "str", arg, msgbuf, bufsize); } +#else + PyErr_BadInternalCall(); + RETURN_ERR_OCCURRED; +#endif break; } diff --git a/Python/traceback.c b/Python/traceback.c index bd1061ed43b1e13..e035b665e307838 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -683,10 +683,14 @@ _Py_DumpASCII(int fd, PyObject *text) size = ascii->length; kind = ascii->state.kind; if (kind == PyUnicode_WCHAR_KIND) { +#if Py_WCHAR_CACHE wstr = ((PyASCIIObject *)text)->wstr; if (wstr == NULL) return; size = ((PyCompactUnicodeObject *)text)->wstr_length; +#else + Py_UNREACHABLE(); +#endif } else if (ascii->state.compact) { if (ascii->state.ascii)