From ee4dfd3331eb2b62f77a0965451a02579f9960ca Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 May 2020 20:54:17 +0300 Subject: [PATCH 1/2] bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. --- Lib/test/test_unicode.py | 7 +++++++ .../2020-05-11-20-53-52.bpo-40596.dwOH_X.rst | 2 ++ Objects/typeobject.c | 6 ++++++ Objects/unicodeobject.c | 6 ++++++ 4 files changed, 21 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 28398896467898..2ee4e64d635303 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -720,6 +720,13 @@ def test_isidentifier(self): self.assertFalse("©".isidentifier()) self.assertFalse("0".isidentifier()) + @support.cpython_only + def test_isidentifier_legacy(self): + import _testcapi + u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊' + self.assertTrue(u.isidentifier()) + self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier()) + def test_isprintable(self): self.assertTrue("".isprintable()) self.assertTrue(" ".isprintable()) diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst new file mode 100644 index 00000000000000..1252db4dc9848d --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-20-53-52.bpo-40596.dwOH_X.rst @@ -0,0 +1,2 @@ +Fixed :meth:`str.isidentifier` for non-canonicalized strings containing +non-BMP characters on Windows. diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 525f5ac5d5775a..71f48e30d9fc72 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -2319,6 +2319,12 @@ valid_identifier(PyObject *s) Py_TYPE(s)->tp_name); return 0; } + /* Since there is no way to return an error from PyUnicode_IsIdentifier() + we have to call PyUnicode_READY() to ensure that the string object is + in the "canonical" representation. */ + if (PyUnicode_READY(s) < 0) { + return 0; + } if (!PyUnicode_IsIdentifier(s)) { PyErr_SetString(PyExc_TypeError, "__slots__ must be identifiers"); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b9458721de18..31f6badb51ce3a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12373,6 +12373,12 @@ static PyObject * unicode_isidentifier_impl(PyObject *self) /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/ { + /* Since there is no way to return an error from PyUnicode_IsIdentifier() + we have to call PyUnicode_READY() to ensure that the string object is + in the "canonical" representation. */ + if (PyUnicode_READY(self) < 0) { + return NULL; + } return PyBool_FromLong(PyUnicode_IsIdentifier(self)); } From 750d91fe0de7b72b991ca3c749248d0f82207024 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 12 May 2020 10:33:48 +0300 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Victor Stinner --- Objects/typeobject.c | 3 +-- Objects/unicodeobject.c | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 71f48e30d9fc72..5a9f5c56c5f6b8 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -2320,8 +2320,7 @@ valid_identifier(PyObject *s) return 0; } /* Since there is no way to return an error from PyUnicode_IsIdentifier() - we have to call PyUnicode_READY() to ensure that the string object is - in the "canonical" representation. */ + we have to call explicitly PyUnicode_READY(). */ if (PyUnicode_READY(s) < 0) { return 0; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 31f6badb51ce3a..ea85e4cca03f2f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12373,9 +12373,6 @@ static PyObject * unicode_isidentifier_impl(PyObject *self) /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/ { - /* Since there is no way to return an error from PyUnicode_IsIdentifier() - we have to call PyUnicode_READY() to ensure that the string object is - in the "canonical" representation. */ if (PyUnicode_READY(self) < 0) { return NULL; }