From 8aa73b750a0bd2199ac6ca84c8eb968c5708d66f Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 17 Jun 2024 15:25:37 +0200
Subject: [PATCH 1/9] gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()

Add PyUnicodeWriter_WriteWideChar() and
PyUnicodeWriter_DecodeUTF8Stateful() functions.
---
 Doc/c-api/unicode.rst           |  33 ++++++++-
 Doc/whatsnew/3.14.rst           |   2 +
 Include/cpython/unicodeobject.h |  10 +++
 Modules/_testcapi/unicode.c     | 121 ++++++++++++++++++++++++++++++++
 Objects/unicodeobject.c         |  46 ++++++++++++
 5 files changed, 209 insertions(+), 3 deletions(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 02e696c303fa91..262b4bfcbca91a 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1551,9 +1551,17 @@ object.
    On success, return ``0``.
    On error, set an exception, leave the writer unchanged, and return ``-1``.
 
-   To use a different error handler than ``strict``,
-   :c:func:`PyUnicode_DecodeUTF8` can be used with
-   :c:func:`PyUnicodeWriter_WriteStr`.
+   See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
+
+.. c:function:: PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
+
+   Writer the wide string *str* into *writer*.
+
+   *size* is a number of wide characters. If *size* is equal to ``-1``, call
+   ``wcslen(str)`` to get the string length.
+
+   On success, return ``0``.
+   On error, set an exception, leave the writer unchanged, and return ``-1``.
 
 .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
 
@@ -1586,3 +1594,22 @@ object.
 
    On success, return ``0``.
    On error, set an exception, leave the writer unchanged, and return ``-1``.
+
+.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
+
+   Decode the UTF-8 encoded *string* with the *errors* error handler and write
+   the output into *writer*.
+
+   *length* is the string length in bytes. If *length* is equal to ``-1``,
+   call ``strlen(string)`` to get the string length.
+
+   *errors* is an error handler name, such as ``"replace"``. If *errors* is
+   ``NULL``, use the strict error handler.
+
+   If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
+   bytes on success.
+
+   On success, return ``0``.
+   On error, set an exception, leave the writer unchanged, and return ``-1``.
+
+   See also :c:func:`PyUnicodeWriter_WriteUTF8`.
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
index 55541ff14d88ce..9f5cfaf36ee12c 100644
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -291,10 +291,12 @@ New Features
   * :c:func:`PyUnicodeWriter_Finish`.
   * :c:func:`PyUnicodeWriter_WriteChar`.
   * :c:func:`PyUnicodeWriter_WriteUTF8`.
+  * :c:func:`PyUnicodeWriter_WriteWideChar`.
   * :c:func:`PyUnicodeWriter_WriteStr`.
   * :c:func:`PyUnicodeWriter_WriteRepr`.
   * :c:func:`PyUnicodeWriter_WriteSubstring`.
   * :c:func:`PyUnicodeWriter_Format`.
+  * :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
 
   (Contributed by Victor Stinner in :gh:`119182`.)
 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index e5e1b6be118588..54b0f665861e03 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
     PyUnicodeWriter *writer,
     const char *str,
     Py_ssize_t size);
+PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
+    PyUnicodeWriter *writer,
+    wchar_t *str,
+    Py_ssize_t size);
 
 PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
     PyUnicodeWriter *writer,
@@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
     PyUnicodeWriter *writer,
     const char *format,
     ...);
+PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
+    PyUnicodeWriter *writer,
+    const char *string,         /* UTF-8 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed);      /* bytes consumed */
 
 
 /* --- Private _PyUnicodeWriter API --------------------------------------- */
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index 79f99c404cd757..c3ba891f8a0ac4 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -374,6 +374,88 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
 }
 
 
+static PyObject *
+test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    // test PyUnicodeWriter_DecodeUTF8Stateful()
+    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+    if (writer == NULL) {
+        return NULL;
+    }
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0) {
+        goto error;
+    }
+    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+        goto error;
+    }
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) {
+        goto error;
+    }
+
+    PyObject *result = PyUnicodeWriter_Finish(writer);
+    if (result == NULL) {
+        return NULL;
+    }
+    assert(PyUnicode_EqualToUTF8(result, "ignore-replace\xef\xbf\xbd"));
+    Py_DECREF(result);
+
+    Py_RETURN_NONE;
+
+error:
+    PyUnicodeWriter_Discard(writer);
+    return NULL;
+}
+
+
+static PyObject *
+test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    // test PyUnicodeWriter_DecodeUTF8Stateful()
+    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+    if (writer == NULL) {
+        return NULL;
+    }
+    Py_ssize_t consumed;
+
+    // valid string
+    consumed = 12345;
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
+        goto error;
+    }
+    assert(consumed == 4);
+
+    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+        goto error;
+    }
+
+    // consumed is 0 if write fails
+    consumed = 12345;
+    assert(PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed) < 0);
+    PyErr_Clear();
+    assert(consumed == 0);
+
+    // ignore error handler
+    consumed = 12345;
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
+        goto error;
+    }
+    assert(consumed == 5);
+
+    PyObject *result = PyUnicodeWriter_Finish(writer);
+    if (result == NULL) {
+        return NULL;
+    }
+    assert(PyUnicode_EqualToUTF8(result, "text-more"));
+    Py_DECREF(result);
+
+    Py_RETURN_NONE;
+
+error:
+    PyUnicodeWriter_Discard(writer);
+    return NULL;
+}
+
+
 static PyObject *
 test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
 {
@@ -436,6 +518,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
 }
 
 
+static PyObject *
+test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+    if (writer == NULL) {
+        return NULL;
+    }
+    if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0) {
+        goto error;
+    }
+    if (PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0) {
+        goto error;
+    }
+    if (PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0) {
+        goto error;
+    }
+    if (PyUnicodeWriter_WriteChar(writer, '.') < 0) {
+        goto error;
+    }
+
+    PyObject *result = PyUnicodeWriter_Finish(writer);
+    if (result == NULL) {
+        return NULL;
+    }
+    assert(PyUnicode_EqualToUTF8(result,
+                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
+    Py_DECREF(result);
+
+    Py_RETURN_NONE;
+
+error:
+    PyUnicodeWriter_Discard(writer);
+    return NULL;
+}
+
+
 static PyMethodDef TestMethods[] = {
     {"unicode_new",              unicode_new,                    METH_VARARGS},
     {"unicode_fill",             unicode_fill,                   METH_VARARGS},
@@ -448,8 +566,11 @@ static PyMethodDef TestMethods[] = {
     {"test_unicodewriter_utf8",  test_unicodewriter_utf8,        METH_NOARGS},
     {"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
     {"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
+    {"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
+    {"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
     {"test_unicodewriter_format", test_unicodewriter_format,     METH_NOARGS},
     {"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
+    {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
     {NULL},
 };
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1f8c89dd12a528..acd388c2ea73ae 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -13500,6 +13500,52 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
     return res;
 }
 
+
+int
+PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
+                                   const char *string,
+                                   Py_ssize_t length,
+                                   const char *errors,
+                                   Py_ssize_t *consumed)
+{
+    if (length < 0) {
+        length = strlen(string);
+    }
+
+    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
+    Py_ssize_t old_pos = _writer->pos;
+    int res = unicode_decode_utf8_writer(_writer, string, length,
+                                         _Py_ERROR_UNKNOWN, errors, consumed);
+    if (res < 0) {
+        _writer->pos = old_pos;
+        if (consumed) {
+            *consumed = 0;
+        }
+    }
+    return res;
+}
+
+
+int
+PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
+                              wchar_t *str,
+                              Py_ssize_t size)
+{
+    if (size < 0) {
+        size = wcslen(str);
+    }
+    PyObject *obj = PyUnicode_FromWideChar(str, size);
+    if (obj == NULL) {
+        return -1;
+    }
+
+    _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer;
+    int res = _PyUnicodeWriter_WriteStr(_writer, obj);
+    Py_DECREF(obj);
+    return res;
+}
+
+
 int
 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                    const char *str, Py_ssize_t len)

From 788a85f37d946ec377e2eba4af2a3dcc1e94f8cc Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Mon, 17 Jun 2024 21:57:34 +0200
Subject: [PATCH 2/9] doc: add missing return type to PyUnicodeWriter_WriteWideChar()

---
 Doc/c-api/unicode.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 262b4bfcbca91a..37b8b1ecf73bbf 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1553,7 +1553,7 @@ object.
 
    See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
 
-.. c:function:: PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
+.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
 
    Writer the wide string *str* into *writer*.
 

From e67a8b4993329eb46641c6422392281e8eddd9e7 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 19 Jun 2024 12:14:23 +0200
Subject: [PATCH 3/9] Optimize PyUnicodeWriter_WriteWideChar()

Avoid a temporary Unicode object, write directly into the writer.
---
 Objects/unicodeobject.c | 138 ++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 49 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index acd388c2ea73ae..ae57a316f9bb00 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1294,19 +1294,14 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    characters for a terminating null character. */
 static void
 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
-                              PyObject *unicode)
+                              Py_UCS4 *ucs4_out,
+#ifndef NDEBUG
+                              Py_UCS4 *ucs4_end,
+#endif
+                              )
 {
-    const wchar_t *iter;
-    Py_UCS4 *ucs4_out;
-
-    assert(unicode != NULL);
-    assert(_PyUnicode_CHECK(unicode));
-    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
-    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
-
-    for (iter = begin; iter < end; ) {
-        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
-                           _PyUnicode_GET_LENGTH(unicode)));
+    for (const wchar_t *iter = begin; iter < end; ) {
+        assert(ucs4_out < ucs4_end);
         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
             && (iter+1) < end
             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
@@ -1319,9 +1314,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
             iter++;
         }
     }
-    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
-                        _PyUnicode_GET_LENGTH(unicode)));
-
+    assert(ucs4_out == ucs4_end);
 }
 #endif
 
@@ -1790,16 +1783,13 @@ unicode_char(Py_UCS4 ch)
     return unicode;
 }
 
-PyObject *
-PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
+static inline int
+unicode_fromwidechar(const wchar_t *u, Py_ssize_t size,
+                     PyObject **punicode, _PyUnicodeWriter *writer)
 {
-    PyObject *unicode;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t num_surrogates;
-
     if (u == NULL && size != 0) {
         PyErr_BadInternalCall();
-        return NULL;
+        return -1;
     }
 
     if (size == -1) {
@@ -1810,8 +1800,12 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
        some optimizations which share commonly used objects. */
 
     /* Optimization for empty strings */
-    if (size == 0)
-        _Py_RETURN_UNICODE_EMPTY();
+    if (size == 0) {
+        if (punicode) {
+            *punicode = unicode_get_empty();
+        }
+        return 0;
+    }
 
 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
     /* Oracle Solaris uses non-Unicode internal wchar_t form for
@@ -1819,59 +1813,111 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
     if (_Py_LocaleUsesNonUnicodeWchar()) {
         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
         if (!converted) {
-            return NULL;
+            return -1;
         }
         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
         PyMem_Free(converted);
-        return unicode;
+        if (punicode) {
+            *punicode = unicode;
+            return 0;
+        }
+        else {
+            int res = _PyUnicodeWriter_WriteStr(_writer, unicode);
+            Py_DECREF(unicode);
+            return res;
+        }
     }
 #endif
 
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
-    if (size == 1 && (Py_UCS4)*u < 256)
-        return get_latin1_char((unsigned char)*u);
+    if (punicode && size == 1 && (Py_UCS4)*u < 256) {
+        *punicode = get_latin1_char((unsigned char)*u);
+        return 0;
+    }
 
     /* If not empty and not single character, copy the Unicode data
        into the new object */
+    Py_UCS4 maxchar = 0;
+    Py_ssize_t num_surrogates;
     if (find_maxchar_surrogates(u, u + size,
-                                &maxchar, &num_surrogates) == -1)
-        return NULL;
+                                &maxchar, &num_surrogates) == -1) {
+        return -1;
+    }
 
-    unicode = PyUnicode_New(size - num_surrogates, maxchar);
-    if (!unicode)
-        return NULL;
+    PyObject *unicode = NULL;
+    int kind;
+    void *data;
+    if (punicode) {
+        unicode = PyUnicode_New(size - num_surrogates, maxchar);
+        if (!unicode) {
+            return -1;
+        }
+        kind = PyUnicode_KIND(unicode);
+        data = PyUnicode_DATA(unicode);
+    }
+    else {
+        if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates,
+                                     maxchar) < 0) {
+            return -1;
+        }
+        kind = writer->kind;
+        data = writer->data + writer->pos * kind;
+    }
 
-    switch (PyUnicode_KIND(unicode)) {
+    switch (kind) {
     case PyUnicode_1BYTE_KIND:
-        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
-                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
+        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
         break;
     case PyUnicode_2BYTE_KIND:
 #if Py_UNICODE_SIZE == 2
-        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
+        memcpy(data, u, size * 2);
 #else
-        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
-                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
+        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
 #endif
         break;
     case PyUnicode_4BYTE_KIND:
+    {
 #if SIZEOF_WCHAR_T == 2
         /* This is the only case which has to process surrogates, thus
            a simple copy loop is not enough and we need a function. */
-        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
+#  ifndef NDEBUG
+        Py_UCS4* ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
+        unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data, ucs4_end);
+#  else
+        unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data);
+#  endif
 #else
         assert(num_surrogates == 0);
-        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
+        memcpy(data, u, size * 4);
 #endif
         break;
+    }
     default:
         Py_UNREACHABLE();
     }
 
-    return unicode_result(unicode);
+    if (punicode) {
+        *punicode = unicode_result(unicode);
+    }
+    else {
+        writer->pos += size - num_surrogates;
+    }
+    return 0;
+}
+
+
+PyObject *
+PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
+{
+    PyObject *unicode;
+    if (unicode_fromwidechar(u, size, &unicode, NULL) < 0) {
+        return NULL;
+    }
+    return unicode;
 }
 
+
 PyObject *
 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 {
@@ -13534,15 +13580,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
     if (size < 0) {
         size = wcslen(str);
     }
-    PyObject *obj = PyUnicode_FromWideChar(str, size);
-    if (obj == NULL) {
-        return -1;
-    }
 
     _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer;
-    int res = _PyUnicodeWriter_WriteStr(_writer, obj);
-    Py_DECREF(obj);
-    return res;
+    return unicode_fromwidechar(str, size, NULL, _writer);
 }
 
 

From de56475973fb1314728fd373d7afe7fc44277f6b Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Wed, 19 Jun 2024 17:45:07 +0300
Subject: [PATCH 4/9] Fix trailing comma in unicode_convert_wchar_to_ucs4() parameters

---
 Objects/unicodeobject.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ae57a316f9bb00..f398b3d8668282 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1294,9 +1294,9 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
    characters for a terminating null character. */
 static void
 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
-                              Py_UCS4 *ucs4_out,
+                              Py_UCS4 *ucs4_out
 #ifndef NDEBUG
-                              Py_UCS4 *ucs4_end,
+                              , Py_UCS4 *ucs4_end
 #endif
                               )
 {

From e48eec7e773bde2ce7b5a056811ec01fe928f51c Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Wed, 19 Jun 2024 17:57:57 +0200
Subject: [PATCH 5/9] Fix compiler warning on writer->data pointer arithmetic

---
 Objects/unicodeobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f398b3d8668282..8657f8780a048b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1862,7 +1862,7 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size,
             return -1;
         }
         kind = writer->kind;
-        data = writer->data + writer->pos * kind;
+        data = (Py_UCS1*)writer->data + writer->pos * kind;
     }
 
     switch (kind) {

From 75fa8ba208cea43009ddf2f060ca4fab464597a9 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 20 Jun 2024 10:35:19 +0200
Subject: [PATCH 6/9] Add unicode_write_widechar()

Remove unicode_convert_wchar_to_ucs4().

Refactor PyUnicode_FromWideChar() and
PyUnicodeWriter_WriteWideChar().
---
 Doc/c-api/unicode.rst   |   2 +
 Objects/unicodeobject.c | 262 +++++++++++++++++++---------------------
 2 files changed, 129 insertions(+), 135 deletions(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 37b8b1ecf73bbf..dc49fafda15c70 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1557,6 +1557,8 @@ object.
 
    Writer the wide string *str* into *writer*.
 
+   *str* must not be ``NULL``.
+
    *size* is a number of wide characters. If *size* is equal to ``-1``, call
    ``wcslen(str)`` to get the string length.
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 8657f8780a048b..bb95c0dea0b828 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1285,39 +1285,6 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     return obj;
 }
 
-#if SIZEOF_WCHAR_T == 2
-/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
-   will decode surrogate pairs, the other conversions are implemented as macros
-   for efficiency.
-
-   This function assumes that unicode can hold one more code point than wstr
-   characters for a terminating null character. */
-static void
-unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
-                              Py_UCS4 *ucs4_out
-#ifndef NDEBUG
-                              , Py_UCS4 *ucs4_end
-#endif
-                              )
-{
-    for (const wchar_t *iter = begin; iter < end; ) {
-        assert(ucs4_out < ucs4_end);
-        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
-            && (iter+1) < end
-            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
-        {
-            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
-            iter += 2;
-        }
-        else {
-            *ucs4_out++ = *iter;
-            iter++;
-        }
-    }
-    assert(ucs4_out == ucs4_end);
-}
-#endif
-
 static int
 unicode_check_modifiable(PyObject *unicode)
 {
@@ -1783,13 +1750,76 @@ unicode_char(Py_UCS4 ch)
     return unicode;
 }
 
-static inline int
-unicode_fromwidechar(const wchar_t *u, Py_ssize_t size,
-                     PyObject **punicode, _PyUnicodeWriter *writer)
+
+static inline void
+unicode_write_widechar(int kind, void *data,
+                       const wchar_t *u, Py_ssize_t size,
+                       Py_ssize_t num_surrogates)
+{
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
+        break;
+
+    case PyUnicode_2BYTE_KIND:
+#if Py_UNICODE_SIZE == 2
+        memcpy(data, u, size * 2);
+#else
+        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
+#endif
+        break;
+
+    case PyUnicode_4BYTE_KIND:
+    {
+#if SIZEOF_WCHAR_T == 2
+        // Convert a 16-bits wchar_t representation to UCS4, this will decode
+        // surrogate pairs, the other conversions are implemented as macros
+        // for efficiency.
+        //
+        // This code assumes that unicode can hold one more code point than
+        // wstr characters for a terminating null character.
+        const wchar_t *end = u + size;
+        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
+#  ifndef NDEBUG
+        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
+#  endif
+        for (const wchar_t *iter = u; iter < end; ) {
+            assert(ucs4_out < ucs4_end);
+            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
+                && (iter+1) < end
+                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
+            {
+                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
+                iter += 2;
+            }
+            else {
+                *ucs4_out++ = *iter;
+                iter++;
+            }
+        }
+        assert(ucs4_out == ucs4_end);
+#else
+        assert(num_surrogates == 0);
+        memcpy(data, u, size * 4);
+#endif
+        break;
+    }
+    default:
+        Py_UNREACHABLE();
+    }
+}
+
+
+PyObject *
+PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
 {
+    PyObject *unicode;
+    Py_UCS4 maxchar = 0;
+    Py_ssize_t num_surrogates;
+
     if (u == NULL && size != 0) {
         PyErr_BadInternalCall();
-        return -1;
+        return NULL;
     }
 
     if (size == -1) {
@@ -1800,12 +1830,8 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size,
        some optimizations which share commonly used objects. */
 
     /* Optimization for empty strings */
-    if (size == 0) {
-        if (punicode) {
-            *punicode = unicode_get_empty();
-        }
-        return 0;
-    }
+    if (size == 0)
+        _Py_RETURN_UNICODE_EMPTY();
 
 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
     /* Oracle Solaris uses non-Unicode internal wchar_t form for
@@ -1813,108 +1839,88 @@ unicode_fromwidechar(const wchar_t *u, Py_ssize_t size,
     if (_Py_LocaleUsesNonUnicodeWchar()) {
         wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
         if (!converted) {
-            return -1;
+            return NULL;
         }
         PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
         PyMem_Free(converted);
-        if (punicode) {
-            *punicode = unicode;
-            return 0;
-        }
-        else {
-            int res = _PyUnicodeWriter_WriteStr(_writer, unicode);
-            Py_DECREF(unicode);
-            return res;
-        }
+        return unicode;
     }
 #endif
 
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
-    if (punicode && size == 1 && (Py_UCS4)*u < 256) {
-        *punicode = get_latin1_char((unsigned char)*u);
-        return 0;
-    }
+    if (size == 1 && (Py_UCS4)*u < 256)
+        return get_latin1_char((unsigned char)*u);
 
     /* If not empty and not single character, copy the Unicode data
        into the new object */
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t num_surrogates;
     if (find_maxchar_surrogates(u, u + size,
-                                &maxchar, &num_surrogates) == -1) {
-        return -1;
+                                &maxchar, &num_surrogates) == -1)
+        return NULL;
+
+    unicode = PyUnicode_New(size - num_surrogates, maxchar);
+    if (!unicode)
+        return NULL;
+
+    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
+                           u, size, num_surrogates);
+
+    return unicode_result(unicode);
+}
+
+
+int
+PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
+                              wchar_t *str,
+                              Py_ssize_t size)
+{
+    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
+
+    if (size < 0) {
+        size = wcslen(str);
     }
 
-    PyObject *unicode = NULL;
-    int kind;
-    void *data;
-    if (punicode) {
-        unicode = PyUnicode_New(size - num_surrogates, maxchar);
-        if (!unicode) {
-            return -1;
-        }
-        kind = PyUnicode_KIND(unicode);
-        data = PyUnicode_DATA(unicode);
+    if (size == 0) {
+        return 0;
     }
-    else {
-        if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates,
-                                     maxchar) < 0) {
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UCS-4 first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
+        if (!converted) {
             return -1;
         }
-        kind = writer->kind;
-        data = (Py_UCS1*)writer->data + writer->pos * kind;
-    }
+        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+        PyMem_Free(converted);
 
-    switch (kind) {
-    case PyUnicode_1BYTE_KIND:
-        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
-        break;
-    case PyUnicode_2BYTE_KIND:
-#if Py_UNICODE_SIZE == 2
-        memcpy(data, u, size * 2);
-#else
-        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
-#endif
-        break;
-    case PyUnicode_4BYTE_KIND:
-    {
-#if SIZEOF_WCHAR_T == 2
-        /* This is the only case which has to process surrogates, thus
-           a simple copy loop is not enough and we need a function. */
-#  ifndef NDEBUG
-        Py_UCS4* ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
-        unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data, ucs4_end);
-#  else
-        unicode_convert_wchar_to_ucs4(u, u + size, (Py_UCS4*)data);
-#  endif
-#else
-        assert(num_surrogates == 0);
-        memcpy(data, u, size * 4);
-#endif
-        break;
-    }
-    default:
-        Py_UNREACHABLE();
+        int res = _PyUnicodeWriter_WriteStr(writer, unicode);
+        Py_DECREF(unicode);
+        return res;
     }
+#endif
 
-    if (punicode) {
-        *punicode = unicode_result(unicode);
+    /* If not empty and not single character, copy the Unicode data
+       into the new object */
+    Py_UCS4 maxchar = 0;
+    Py_ssize_t num_surrogates;
+    if (find_maxchar_surrogates(str, str + size,
+                                &maxchar, &num_surrogates) == -1) {
+        return -1;
     }
-    else {
-        writer->pos += size - num_surrogates;
+
+    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates,
+                                 maxchar) < 0) {
+        return -1;
     }
-    return 0;
-}
 
+    int kind = writer->kind;
+    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
+    unicode_write_widechar(kind, data, str, size, num_surrogates);
 
-PyObject *
-PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
-{
-    PyObject *unicode;
-    if (unicode_fromwidechar(u, size, &unicode, NULL) < 0) {
-        return NULL;
-    }
-    return unicode;
+    writer->pos += size - num_surrogates;
+    return 0;
 }
 
 
@@ -13572,20 +13578,6 @@ PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
 }
 
 
-int
-PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
-                              wchar_t *str,
-                              Py_ssize_t size)
-{
-    if (size < 0) {
-        size = wcslen(str);
-    }
-
-    _PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer;
-    return unicode_fromwidechar(str, size, NULL, _writer);
-}
-
-
 int
 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                    const char *str, Py_ssize_t len)

From 3f284f8537c15568cd93316b04aad6a12543cbc4 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 20 Jun 2024 15:40:12 +0200
Subject: [PATCH 7/9] doc: describe consumed=NULL in PyUnicodeWriter_DecodeUTF8Stateful()

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
 Doc/c-api/unicode.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index dc49fafda15c70..37761274acdae8 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1609,7 +1609,9 @@ object.
    ``NULL``, use the strict error handler.
 
    If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
-   bytes on success.
+   bytes on success. 
+   If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
+   as an error.
 
    On success, return ``0``.
    On error, set an exception, leave the writer unchanged, and return ``-1``.

From 1e018d24b71ff2cbc9ec3e3069711d8a9c08078a Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 20 Jun 2024 15:52:13 +0200
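Subject: [PATCH 8/9] Address Serhiy's review

* PyUnicodeWriter_WriteWideChar(): declare the *str* parameter as
  "const wchar_t *" in the documentation, the header and the
  implementation.
* Documentation: drop the "*str* must not be NULL" sentence and strip
  a trailing space.
* unicode_write_widechar(): test SIZEOF_WCHAR_T instead of
  Py_UNICODE_SIZE in the 2-byte kind branch and shorten the
  surrogate-pair comment.
* Remove an outdated comment and join a wrapped
  _PyUnicodeWriter_Prepare() call onto one line.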
---
 Doc/c-api/unicode.rst           |  6 ++----
 Include/cpython/unicodeobject.h |  2 +-
 Objects/unicodeobject.c         | 15 ++++-----------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 37761274acdae8..4ea20bde38c1db 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -1553,12 +1553,10 @@ object.
 
    See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
 
-.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
+.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size)
 
-   Writer the wide string *str* into *writer*.
+   Write the wide string *str* into *writer*.
 
-   *str* must not be ``NULL``.
-
    *size* is a number of wide characters. If *size* is equal to ``-1``, call
    ``wcslen(str)`` to get the string length.
 
@@ -1609,7 +1607,7 @@ object.
    ``NULL``, use the strict error handler.
 
    If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
-   bytes on success. 
+   bytes on success.
    If *consumed* is ``NULL``, treat trailing incomplete UTF-8 byte sequences
    as an error.
 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 54b0f665861e03..059bec8618c8d9 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -461,7 +461,7 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
     Py_ssize_t size);
 PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
     PyUnicodeWriter *writer,
-    wchar_t *str,
+    const wchar_t *str,
     Py_ssize_t size);
 
 PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index bb95c0dea0b828..335381c327f602 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1762,7 +1762,7 @@ unicode_write_widechar(int kind, void *data,
         break;
 
     case PyUnicode_2BYTE_KIND:
-#if Py_UNICODE_SIZE == 2
+#if SIZEOF_WCHAR_T == 2
         memcpy(data, u, size * 2);
 #else
         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
@@ -1773,11 +1773,7 @@ unicode_write_widechar(int kind, void *data,
     {
 #if SIZEOF_WCHAR_T == 2
         // Convert a 16-bits wchar_t representation to UCS4, this will decode
-        // surrogate pairs, the other conversions are implemented as macros
-        // for efficiency.
-        //
-        // This code assumes that unicode can hold one more code point than
-        // wstr characters for a terminating null character.
+        // surrogate pairs.
         const wchar_t *end = u + size;
         Py_UCS4 *ucs4_out = (Py_UCS4*)data;
 #  ifndef NDEBUG
@@ -1871,7 +1867,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
 
 int
 PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
-                              wchar_t *str,
+                              const wchar_t *str,
                               Py_ssize_t size)
 {
     _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
@@ -1901,8 +1897,6 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
     }
 #endif
 
-    /* If not empty and not single character, copy the Unicode data
-       into the new object */
     Py_UCS4 maxchar = 0;
     Py_ssize_t num_surrogates;
     if (find_maxchar_surrogates(str, str + size,
@@ -1910,8 +1904,7 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
         return -1;
     }
 
-    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates,
-                                 maxchar) < 0) {
+    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
         return -1;
     }
 

From 6f29c53223b57e7fa1eca35dcf7d9446d278a180 Mon Sep 17 00:00:00 2001
From: Victor Stinner <vstinner@python.org>
Date: Thu, 20 Jun 2024 15:54:58 +0200
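Subject: [PATCH 9/9] Add more tests

Extend the PyUnicodeWriter_DecodeUTF8Stateful() tests in _testcapi:

* an incomplete trailing UTF-8 sequence decoded with the "replace"
  error handler and consumed=NULL;
* non-ASCII input with a non-NULL *consumed*;
* an incomplete trailing UTF-8 sequence with a non-NULL *consumed*,
  which must stop before the incomplete byte.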
---
 Modules/_testcapi/unicode.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
index c3ba891f8a0ac4..da658b4129dffd 100644
--- a/Modules/_testcapi/unicode.c
+++ b/Modules/_testcapi/unicode.c
@@ -391,12 +391,22 @@ test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
     if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0) {
         goto error;
     }
+    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+        goto error;
+    }
+
+    // incomplete trailing UTF-8 sequence
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "replace", NULL) < 0) {
+        goto error;
+    }
 
     PyObject *result = PyUnicodeWriter_Finish(writer);
     if (result == NULL) {
         return NULL;
     }
-    assert(PyUnicode_EqualToUTF8(result, "ignore-replace\xef\xbf\xbd"));
+    assert(PyUnicode_EqualToUTF8(result,
+                                 "ignore-replace\xef\xbf\xbd"
+                                 "-incomplete\xef\xbf\xbd"));
     Py_DECREF(result);
 
     Py_RETURN_NONE;
@@ -423,7 +433,16 @@ test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args
         goto error;
     }
     assert(consumed == 4);
+    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+        goto error;
+    }
 
+    // non-ASCII
+    consumed = 12345;
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "\xC3\xA9-\xE2\x82\xAC", 6, NULL, &consumed) < 0) {
+        goto error;
+    }
+    assert(consumed == 6);
     if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
         goto error;
     }
@@ -440,12 +459,24 @@ test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args
         goto error;
     }
     assert(consumed == 5);
+    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
+        goto error;
+    }
+
+    // incomplete trailing UTF-8 sequence
+    consumed = 12345;
+    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "incomplete\xC3", -1, "ignore", &consumed) < 0) {
+        goto error;
+    }
+    assert(consumed == 10);
 
     PyObject *result = PyUnicodeWriter_Finish(writer);
     if (result == NULL) {
         return NULL;
     }
-    assert(PyUnicode_EqualToUTF8(result, "text-more"));
+    assert(PyUnicode_EqualToUTF8(result,
+                                 "text-\xC3\xA9-\xE2\x82\xAC-"
+                                 "more-incomplete"));
     Py_DECREF(result);
 
     Py_RETURN_NONE;