From 8aa73b750a0bd2199ac6ca84c8eb968c5708d66f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 17 Jun 2024 15:25:37 +0200 Subject: [PATCH 1/9] gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful() Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions. --- Doc/c-api/unicode.rst | 33 ++++++++- Doc/whatsnew/3.14.rst | 2 + Include/cpython/unicodeobject.h | 10 +++ Modules/_testcapi/unicode.c | 121 ++++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 46 ++++++++++++ 5 files changed, 209 insertions(+), 3 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 02e696c303fa91..262b4bfcbca91a 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1551,9 +1551,17 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. - To use a different error handler than ``strict``, - :c:func:`PyUnicode_DecodeUTF8` can be used with - :c:func:`PyUnicodeWriter_WriteStr`. + See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. + +.. c:function:: PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size) + + Writer the wide string *str* into *writer*. + + *size* is a number of wide characters. If *size* is equal to ``-1``, call + ``wcslen(str)`` to get the string length. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) @@ -1586,3 +1594,22 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. + +.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed) + + Decode *string* from UTF-8 with the *errors* error handler and write the + output into *writer*. + + *length* is the string length in bytes.
If *length* is equal to ``-1``, call + ``strlen(string)`` to get the string length. + + *errors* is an error handler name, such as ``"replace"``. If *errors* is + ``NULL``, use the strict error handler. + + If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded + bytes on success. + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + + See also :c:func:`PyUnicodeWriter_WriteUTF8`. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 55541ff14d88ce..9f5cfaf36ee12c 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -291,10 +291,12 @@ New Features * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. * :c:func:`PyUnicodeWriter_WriteSubstring`. * :c:func:`PyUnicodeWriter_Format`. + * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. (Contributed by Victor Stinner in :gh:`119182`.) 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index e5e1b6be118588..54b0f665861e03 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( PyUnicodeWriter *writer, const char *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( + PyUnicodeWriter *writer, + wchar_t *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, @@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format( PyUnicodeWriter *writer, const char *format, ...); +PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful( + PyUnicodeWriter *writer, + const char *string, /* UTF-8 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed); /* bytes consumed */ /* --- Private _PyUnicodeWriter API --------------------------------------- */ diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 79f99c404cd757..c3ba891f8a0ac4 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -374,6 +374,88 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args)) } +static PyObject * +test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test PyUnicodeWriter_DecodeUTF8Stateful() + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "ignore-replace\xef\xbf\xbd")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + 
PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // test PyUnicodeWriter_DecodeUTF8Stateful() + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + Py_ssize_t consumed; + + // valid string + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) { + goto error; + } + assert(consumed == 4); + + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // consumed is 0 if write fails + consumed = 12345; + assert(PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed) < 0); + PyErr_Clear(); + assert(consumed == 0); + + // ignore error handler + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) { + goto error; + } + assert(consumed == 5); + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "text-more")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + static PyObject * test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args)) { @@ -436,6 +518,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args } +static PyObject * +test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, '.') < 0) { + goto error; + } + + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) 
{ + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, + "latin1=\xC3\xA9-euro=\xE2\x82\xAC.")); + Py_DECREF(result); + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + static PyMethodDef TestMethods[] = { {"unicode_new", unicode_new, METH_VARARGS}, {"unicode_fill", unicode_fill, METH_VARARGS}, @@ -448,8 +566,11 @@ static PyMethodDef TestMethods[] = { {"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS}, {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS}, {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS}, + {"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS}, + {"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS}, {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS}, {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS}, + {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1f8c89dd12a528..acd388c2ea73ae 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13500,6 +13500,52 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, return res; } + +int +PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, + const char *string, + Py_ssize_t length, + const char *errors, + Py_ssize_t *consumed) +{ + if (length < 0) { + length = strlen(string); + } + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t old_pos = _writer->pos; + int res = unicode_decode_utf8_writer(_writer, string, length, + _Py_ERROR_UNKNOWN, errors, consumed); + if (res < 0) { + _writer->pos = old_pos; + if (consumed) { + *consumed = 0; + } + } + return res; +} + + +int +PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, + wchar_t *str, + Py_ssize_t size) +{ + if (size < 0) { + size = wcslen(str); + } + 
PyObject *obj = PyUnicode_FromWideChar(str, size); + if (obj == NULL) { + return -1; + } + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer; + int res = _PyUnicodeWriter_WriteStr(_writer, obj); + Py_DECREF(obj); + return res; +} + + int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) From 788a85f37d946ec377e2eba4af2a3dcc1e94f8cc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 17 Jun 2024 21:57:34 +0200 Subject: [PATCH 2/9] doc: fix typo --- Doc/c-api/unicode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 262b4bfcbca91a..37b8b1ecf73bbf 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1553,7 +1553,7 @@ object. See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. -.. c:function:: PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size) +.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size) Writer the wide string *str* into *writer*. From e67a8b4993329eb46641c6422392281e8eddd9e7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 19 Jun 2024 12:14:23 +0200 Subject: [PATCH 3/9] Optimize PyUnicodeWriter_WriteWideChar() Avoid a temporary Unicode object, write directly into the writer. --- Objects/unicodeobject.c | 138 ++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 49 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index acd388c2ea73ae..ae57a316f9bb00 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1294,19 +1294,14 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) characters for a terminating null character. 
*/ static void unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, - PyObject *unicode) + Py_UCS4 *ucs4_out, +#ifndef NDEBUG + Py_UCS4 *ucs4_end, +#endif + ) { - const wchar_t *iter; - Py_UCS4 *ucs4_out; - - assert(unicode != NULL); - assert(_PyUnicode_CHECK(unicode)); - assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); - ucs4_out = PyUnicode_4BYTE_DATA(unicode); - - for (iter = begin; iter < end; ) { - assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); + for (const wchar_t *iter = begin; iter < end; ) { + assert(ucs4_out < ucs4_end); if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) && (iter+1) < end && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) @@ -1319,9 +1314,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, iter++; } } - assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + - _PyUnicode_GET_LENGTH(unicode))); - + assert(ucs4_out == ucs4_end); } #endif @@ -1790,16 +1783,13 @@ unicode_char(Py_UCS4 ch) return unicode; } -PyObject * -PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) +static inline int +unicode_fromwidechar(const wchar_t *u, Py_ssize_t size, + PyObject **punicode, _PyUnicodeWriter *writer) { - PyObject *unicode; - Py_UCS4 maxchar = 0; - Py_ssize_t num_surrogates; - if (u == NULL && size != 0) { PyErr_BadInternalCall(); - return NULL; + return -1; } if (size == -1) { @@ -1810,8 +1800,12 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) some optimizations which share commonly used objects. 
*/ /* Optimization for empty strings */ - if (size == 0) - _Py_RETURN_UNICODE_EMPTY(); + if (size == 0) { + if (punicode) { + *punicode = unicode_get_empty(); + } + return 0; + } #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION /* Oracle Solaris uses non-Unicode internal wchar_t form for @@ -1819,59 +1813,111 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) if (_Py_LocaleUsesNonUnicodeWchar()) { wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size); if (!converted) { - return NULL; + return -1; } PyObject *unicode = _PyUnicode_FromUCS4(converted, size); PyMem_Free(converted); - return unicode; + if (punicode) { + *punicode = unicode; + return 0; + } + else { + int res = _PyUnicodeWriter_WriteStr(_writer, unicode); + Py_DECREF(unicode); + return res; + } } #endif /* Single character Unicode objects in the Latin-1 range are shared when using this constructor */ - if (size == 1 && (Py_UCS4)*u < 256) - return get_latin1_char((unsigned char)*u); + if (punicode && size == 1 && (Py_UCS4)*u < 256) { + *punicode = get_latin1_char((unsigned char)*u); + return 0; + } /* If not empty and not single character, copy the Unicode data into the new object */ + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; if (find_maxchar_surrogates(u, u + size, - &maxchar, &num_surrogates) == -1) - return NULL; + &maxchar, &num_surrogates) == -1) { + return -1; + } - unicode = PyUnicode_New(size - num_surrogates, maxchar); - if (!unicode) - return NULL; + PyObject *unicode = NULL; + int kind; + void *data; + if (punicode) { + unicode = PyUnicode_New(size - num_surrogates, maxchar); + if (!unicode) { + return -1; + } + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + } + else { + if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, + maxchar) < 0) { + return -1; + } + kind = writer->kind; + data = writer->data + writer->pos * kind; + } - switch (PyUnicode_KIND(unicode)) { + switch (kind) { case PyUnicode_1BYTE_KIND: - _PyUnicode_CONVERT_BYTES(wchar_t, 
unsigned char, - u, u + size, PyUnicode_1BYTE_DATA(unicode)); + _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data); break; case PyUnicode_2BYTE_KIND: #if Py_UNICODE_SIZE == 2 - memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2); + memcpy(data, u, size * 2); #else - _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, - u, u + size, PyUnicode_2BYTE_DATA(unicode)); + _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data); #endif break; case PyUnicode_4BYTE_KIND: + { #if SIZEOF_WCHAR_T == 2 /* This is the only case which has to process surrogates, thus a simple copy loop is not enough and we need a function. */ - unicode_convert_wchar_to_ucs4(u, u + size, unicode); +# ifndef NDEBUG + Py_UCS4* ucs4_end = (Py_UCS4*)data + (size - num_surrogates); + unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data, ucs4_end); +# else + unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data); +# endif #else assert(num_surrogates == 0); - memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4); + memcpy(data, u, size * 4); #endif break; + } default: Py_UNREACHABLE(); } - return unicode_result(unicode); + if (punicode) { + *punicode = unicode_result(unicode); + } + else { + writer->pos += size - num_surrogates; + } + return 0; +} + + +PyObject * +PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) +{ + PyObject *unicode; + if (unicode_fromwidechar(u, size, &unicode, NULL) < 0) { + return NULL; + } + return unicode; } + PyObject * PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) { @@ -13534,15 +13580,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, if (size < 0) { size = wcslen(str); } - PyObject *obj = PyUnicode_FromWideChar(str, size); - if (obj == NULL) { - return -1; - } _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer; - int res = _PyUnicodeWriter_WriteStr(_writer, obj); - Py_DECREF(obj); - return res; + return unicode_fromwidechar(str, size, NULL, _writer); } From de56475973fb1314728fd373d7afe7fc44277f6b Mon Sep 17 00:00:00 2001 
From: Serhiy Storchaka Date: Wed, 19 Jun 2024 17:45:07 +0300 Subject: [PATCH 4/9] Update Objects/unicodeobject.c --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ae57a316f9bb00..f398b3d8668282 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1294,9 +1294,9 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) characters for a terminating null character. */ static void unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, - Py_UCS4 *ucs4_out, + Py_UCS4 *ucs4_out #ifndef NDEBUG - Py_UCS4 *ucs4_end, + , Py_UCS4 *ucs4_end #endif ) { From e48eec7e773bde2ce7b5a056811ec01fe928f51c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 19 Jun 2024 17:57:57 +0200 Subject: [PATCH 5/9] Fix compiler warning --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f398b3d8668282..8657f8780a048b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1862,7 +1862,7 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size, return -1; } kind = writer->kind; - data = writer->data + writer->pos * kind; + data = (Py_UCS1*)writer->data + writer->pos * kind; } switch (kind) { From 75fa8ba208cea43009ddf2f060ca4fab464597a9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 10:35:19 +0200 Subject: [PATCH 6/9] Add unicode_write_widechar() Remove unicode_convert_wchar_to_ucs4(). Refactor PyUnicode_FromWideChar() and PyUnicodeWriter_WriteWideChar(). --- Doc/c-api/unicode.rst | 2 + Objects/unicodeobject.c | 262 +++++++++++++++++++--------------------- 2 files changed, 129 insertions(+), 135 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 37b8b1ecf73bbf..dc49fafda15c70 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1557,6 +1557,8 @@ object. 
Writer the wide string *str* into *writer*. + *str* must not be ``NULL``. + *size* is a number of wide characters. If *size* is equal to ``-1``, call ``wcslen(str)`` to get the string length. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8657f8780a048b..bb95c0dea0b828 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1285,39 +1285,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) return obj; } -#if SIZEOF_WCHAR_T == 2 -/* Helper function to convert a 16-bits wchar_t representation to UCS4, this - will decode surrogate pairs, the other conversions are implemented as macros - for efficiency. - - This function assumes that unicode can hold one more code point than wstr - characters for a terminating null character. */ -static void -unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, - Py_UCS4 *ucs4_out -#ifndef NDEBUG - , Py_UCS4 *ucs4_end -#endif - ) -{ - for (const wchar_t *iter = begin; iter < end; ) { - assert(ucs4_out < ucs4_end); - if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) - && (iter+1) < end - && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) - { - *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); - iter += 2; - } - else { - *ucs4_out++ = *iter; - iter++; - } - } - assert(ucs4_out == ucs4_end); -} -#endif - static int unicode_check_modifiable(PyObject *unicode) { @@ -1783,13 +1750,76 @@ unicode_char(Py_UCS4 ch) return unicode; } -static inline int -unicode_fromwidechar(const wchar_t *u, Py_ssize_t size, - PyObject **punicode, _PyUnicodeWriter *writer) + +static inline void +unicode_write_widechar(int kind, void *data, + const wchar_t *u, Py_ssize_t size, + Py_ssize_t num_surrogates) +{ + switch (kind) { + case PyUnicode_1BYTE_KIND: + _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data); + break; + + case PyUnicode_2BYTE_KIND: +#if Py_UNICODE_SIZE == 2 + memcpy(data, u, size * 2); +#else + _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data); +#endif + break; + + case 
PyUnicode_4BYTE_KIND: + { +#if SIZEOF_WCHAR_T == 2 + // Convert a 16-bits wchar_t representation to UCS4, this will decode + // surrogate pairs, the other conversions are implemented as macros + // for efficiency. + // + // This code assumes that unicode can hold one more code point than + // wstr characters for a terminating null character. + const wchar_t *end = u + size; + Py_UCS4 *ucs4_out = (Py_UCS4*)data; +# ifndef NDEBUG + Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates); +# endif + for (const wchar_t *iter = u; iter < end; ) { + assert(ucs4_out < ucs4_end); + if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) + && (iter+1) < end + && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) + { + *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); + iter += 2; + } + else { + *ucs4_out++ = *iter; + iter++; + } + } + assert(ucs4_out == ucs4_end); +#else + assert(num_surrogates == 0); + memcpy(data, u, size * 4); +#endif + break; + } + default: + Py_UNREACHABLE(); + } +} + + +PyObject * +PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) { + PyObject *unicode; + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; + if (u == NULL && size != 0) { PyErr_BadInternalCall(); - return -1; + return NULL; } if (size == -1) { @@ -1800,12 +1830,8 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size, some optimizations which share commonly used objects. 
*/ /* Optimization for empty strings */ - if (size == 0) { - if (punicode) { - *punicode = unicode_get_empty(); - } - return 0; - } + if (size == 0) + _Py_RETURN_UNICODE_EMPTY(); #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION /* Oracle Solaris uses non-Unicode internal wchar_t form for @@ -1813,108 +1839,88 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size, if (_Py_LocaleUsesNonUnicodeWchar()) { wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size); if (!converted) { - return -1; + return NULL; } PyObject *unicode = _PyUnicode_FromUCS4(converted, size); PyMem_Free(converted); - if (punicode) { - *punicode = unicode; - return 0; - } - else { - int res = _PyUnicodeWriter_WriteStr(_writer, unicode); - Py_DECREF(unicode); - return res; - } + return unicode; } #endif /* Single character Unicode objects in the Latin-1 range are shared when using this constructor */ - if (punicode && size == 1 && (Py_UCS4)*u < 256) { - *punicode = get_latin1_char((unsigned char)*u); - return 0; - } + if (size == 1 && (Py_UCS4)*u < 256) + return get_latin1_char((unsigned char)*u); /* If not empty and not single character, copy the Unicode data into the new object */ - Py_UCS4 maxchar = 0; - Py_ssize_t num_surrogates; if (find_maxchar_surrogates(u, u + size, - &maxchar, &num_surrogates) == -1) { - return -1; + &maxchar, &num_surrogates) == -1) + return NULL; + + unicode = PyUnicode_New(size - num_surrogates, maxchar); + if (!unicode) + return NULL; + + unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), + u, size, num_surrogates); + + return unicode_result(unicode); +} + + +int +PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, + wchar_t *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + if (size < 0) { + size = wcslen(str); } - PyObject *unicode = NULL; - int kind; - void *data; - if (punicode) { - unicode = PyUnicode_New(size - num_surrogates, maxchar); - if (!unicode) { - return -1; - } - kind = 
PyUnicode_KIND(unicode); - data = PyUnicode_DATA(unicode); + if (size == 0) { + return 0; } - else { - if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, - maxchar) < 0) { + +#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION + /* Oracle Solaris uses non-Unicode internal wchar_t form for + non-Unicode locales and hence needs conversion to UCS-4 first. */ + if (_Py_LocaleUsesNonUnicodeWchar()) { + wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size); + if (!converted) { return -1; } - kind = writer->kind; - data = (Py_UCS1*)writer->data + writer->pos * kind; - } + PyObject *unicode = _PyUnicode_FromUCS4(converted, size); + PyMem_Free(converted); - switch (kind) { - case PyUnicode_1BYTE_KIND: - _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data); - break; - case PyUnicode_2BYTE_KIND: -#if Py_UNICODE_SIZE == 2 - memcpy(data, u, size * 2); -#else - _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data); -#endif - break; - case PyUnicode_4BYTE_KIND: - { -#if SIZEOF_WCHAR_T == 2 - /* This is the only case which has to process surrogates, thus - a simple copy loop is not enough and we need a function. 
*/ -# ifndef NDEBUG - Py_UCS4* ucs4_end = (Py_UCS4*)data + (size - num_surrogates); - unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data, ucs4_end); -# else - unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data); -# endif -#else - assert(num_surrogates == 0); - memcpy(data, u, size * 4); -#endif - break; - } - default: - Py_UNREACHABLE(); + int res = _PyUnicodeWriter_WriteStr(writer, unicode); + Py_DECREF(unicode); + return res; } +#endif - if (punicode) { - *punicode = unicode_result(unicode); + /* If not empty and not single character, copy the Unicode data + into the new object */ + Py_UCS4 maxchar = 0; + Py_ssize_t num_surrogates; + if (find_maxchar_surrogates(str, str + size, + &maxchar, &num_surrogates) == -1) { + return -1; } - else { - writer->pos += size - num_surrogates; + + if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, + maxchar) < 0) { + return -1; } - return 0; -} + int kind = writer->kind; + void *data = (Py_UCS1*)writer->data + writer->pos * kind; + unicode_write_widechar(kind, data, str, size, num_surrogates); -PyObject * -PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) -{ - PyObject *unicode; - if (unicode_fromwidechar(u, size, &unicode, NULL) < 0) { - return NULL; - } - return unicode; + writer->pos += size - num_surrogates; + return 0; } @@ -13572,20 +13578,6 @@ PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, } -int -PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, - wchar_t *str, - Py_ssize_t size) -{ - if (size < 0) { - size = wcslen(str); - } - - _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer; - return unicode_fromwidechar(str, size, NULL, _writer); -} - - int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) From 3f284f8537c15568cd93316b04aad6a12543cbc4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 15:40:12 +0200 Subject: [PATCH 7/9] Update Doc/c-api/unicode.rst Co-authored-by: Serhiy Storchaka --- 
Doc/c-api/unicode.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index dc49fafda15c70..37761274acdae8 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1609,7 +1609,9 @@ object. ``NULL``, use the strict error handler. If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded - bytes on success. + bytes on success. + If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences + as an error. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. From 1e018d24b71ff2cbc9ec3e3069711d8a9c08078a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 15:52:13 +0200 Subject: [PATCH 8/9] Address Serhiy's review --- Doc/c-api/unicode.rst | 6 ++---- Include/cpython/unicodeobject.h | 2 +- Objects/unicodeobject.c | 15 ++++----------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 37761274acdae8..4ea20bde38c1db 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1553,12 +1553,10 @@ object. See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. -.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size) +.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size) Writer the wide string *str* into *writer*. - *str* must not be ``NULL``. - *size* is a number of wide characters. If *size* is equal to ``-1``, call ``wcslen(str)`` to get the string length. @@ -1609,7 +1607,7 @@ object. ``NULL``, use the strict error handler. If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded - bytes on success. + bytes on success. If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences as an error. 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 54b0f665861e03..059bec8618c8d9 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -461,7 +461,7 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8( Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( PyUnicodeWriter *writer, - wchar_t *str, + const wchar_t *str, Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bb95c0dea0b828..335381c327f602 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1762,7 +1762,7 @@ unicode_write_widechar(int kind, void *data, break; case PyUnicode_2BYTE_KIND: -#if Py_UNICODE_SIZE == 2 +#if SIZEOF_WCHAR_T == 2 memcpy(data, u, size * 2); #else _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data); @@ -1773,11 +1773,7 @@ unicode_write_widechar(int kind, void *data, { #if SIZEOF_WCHAR_T == 2 // Convert a 16-bits wchar_t representation to UCS4, this will decode - // surrogate pairs, the other conversions are implemented as macros - // for efficiency. - // - // This code assumes that unicode can hold one more code point than - // wstr characters for a terminating null character. + // surrogate pairs. 
const wchar_t *end = u + size; Py_UCS4 *ucs4_out = (Py_UCS4*)data; # ifndef NDEBUG @@ -1871,7 +1867,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, - wchar_t *str, + const wchar_t *str, Py_ssize_t size) { _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; @@ -1901,8 +1897,6 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, } #endif - /* If not empty and not single character, copy the Unicode data - into the new object */ Py_UCS4 maxchar = 0; Py_ssize_t num_surrogates; if (find_maxchar_surrogates(str, str + size, @@ -1910,8 +1904,7 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, return -1; } - if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, - maxchar) < 0) { + if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) { return -1; } From 6f29c53223b57e7fa1eca35dcf7d9446d278a180 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 15:54:58 +0200 Subject: [PATCH 9/9] Add more tests --- Modules/_testcapi/unicode.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index c3ba891f8a0ac4..da658b4129dffd 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -391,12 +391,22 @@ test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args)) if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) { goto error; } + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // incomplete trailing UTF-8 sequence + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0) { + goto error; + } PyObject *result = PyUnicodeWriter_Finish(writer); if (result == NULL) { return NULL; } - assert(PyUnicode_EqualToUTF8(result, "ignore-replace\xef\xbf\xbd")); + assert(PyUnicode_EqualToUTF8(result, + 
"ignore-replace\xef\xbf\xbd" + "-incomplete\xef\xbf\xbd")); Py_DECREF(result); Py_RETURN_NONE; @@ -423,7 +433,16 @@ test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args goto error; } assert(consumed == 4); + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + // non-ASCII + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) { + goto error; + } + assert(consumed == 6); if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { goto error; } @@ -440,12 +459,24 @@ test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args goto error; } assert(consumed == 5); + if (PyUnicodeWriter_WriteChar(writer, '-') < 0) { + goto error; + } + + // incomplete trailing UTF-8 sequence + consumed = 12345; + if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) { + goto error; + } + assert(consumed == 10); PyObject *result = PyUnicodeWriter_Finish(writer); if (result == NULL) { return NULL; } - assert(PyUnicode_EqualToUTF8(result, "text-more")); + assert(PyUnicode_EqualToUTF8(result, + "text-\xC3\xA9-\xE2\x82\xAC-" + "more-incomplete")); Py_DECREF(result); Py_RETURN_NONE;