# Patch summary (gh-129569)
#
# Purpose: make unicodedata.normalize() always return a built-in str, even
# when the argument is a str subclass and is already normalized (see the
# NEWS entry added by this patch).
#
# Files touched:
#   * Modules/unicodedata.c -- in unicodedata_UCD_normalize_impl(), the
#     empty-input special case and the four "quickcheck == YES" fast paths
#     (NFC, NFKC, NFD, NFKD) now return PyUnicode_FromObject(input) instead
#     of Py_NewRef(input).  The nfc_nfkc()/nfd_nfkd() calls are untouched.
#   * Lib/test/test_unicodedata.py -- adds test_normalize_return_type, which
#     asserts `type(result) is str` for every form in NFC/NFKC/NFD/NFKD, for
#     two normalized and two unnormalized inputs, each passed both as a
#     plain str and as an instance of a local str subclass (MyStr).
#   * Misc/NEWS.d/next/Library/... -- adds the one-line changelog entry.
#
# NOTE(review): this copy of the patch had lost its line breaks and
# indentation; both were re-created from the hunk headers' line counts
# (-467,6 +467,29 / -0,0 +1 / -933,34 +933,34).  Confirm the exact
# whitespace of the context lines against the target tree before applying.
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 0285f0d51f2365..8e3fef6b6fe4a0 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -467,6 +467,29 @@ def test_bug_834676(self):
         # Check for bug 834676
         unicodedata.normalize('NFC', '\ud55c\uae00')
 
+    def test_normalize_return_type(self):
+        # gh-129569: normalize() return type must always be str
+        normalize = unicodedata.normalize
+
+        class MyStr(str):
+            pass
+
+        normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
+        input_strings = (
+            # normalized strings
+            "",
+            "ascii",
+            # unnormalized strings
+            "\u1e0b\u0323",
+            "\u0071\u0307\u0323",
+        )
+
+        for form in normalization_forms:
+            for input_str in input_strings:
+                with self.subTest(form=form, input_str=input_str):
+                    self.assertIs(type(normalize(form, input_str)), str)
+                    self.assertIs(type(normalize(form, MyStr(input_str))), str)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst
new file mode 100644
index 00000000000000..c4b8965106aa56
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst
@@ -0,0 +1 @@
+Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 60bde755d24574..79be7674fc8ab5 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -933,34 +933,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     if (PyUnicode_GET_LENGTH(input) == 0) {
         /* Special case empty input strings, since resizing
            them later would cause internal errors. */
-        return Py_NewRef(input);
+        return PyUnicode_FromObject(input);
     }
 
     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      true, false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfc_nfkc(self, input, 0);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      true, true, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfc_nfkc(self, input, 1);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      false, false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfd_nfkd(self, input, 0);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      false, true, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfd_nfkd(self, input, 1);
     }