Skip to content

bpo-42318: Fix support of non-BMP characters in Tkinter on macOS #23281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 40 additions & 6 deletions Lib/test/test_tcl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
import locale
import re
import subprocess
import sys
Expand Down Expand Up @@ -61,6 +62,10 @@ def test_eval_null_in_result(self):
tcl = self.interp
self.assertEqual(tcl.eval('set a "a\\0b"'), 'a\x00b')

def test_eval_surrogates_in_result(self):
    """Check that eval() returns an escaped surrogate pair as one character.

    The Tcl script writes U+1F4BB as the two escapes \\ud83d and \\udcbb;
    the Python result must contain the single non-BMP code point.
    """
    tcl = self.interp
    # Use assertEqual, not assertIn: with two str operands assertIn is only a
    # substring test, so an empty or truncated result would be accepted.
    self.assertEqual(tcl.eval(r'set a "<\ud83d\udcbb>"'), '<\U0001f4bb>')

def testEvalException(self):
tcl = self.interp
self.assertRaises(TclError,tcl.eval,'set a')
Expand Down Expand Up @@ -193,29 +198,48 @@ def test_getboolean(self):

def testEvalFile(self):
tcl = self.interp
with open(os_helper.TESTFN, 'w') as f:
self.addCleanup(os_helper.unlink, os_helper.TESTFN)
filename = os_helper.TESTFN_ASCII
self.addCleanup(os_helper.unlink, filename)
with open(filename, 'w') as f:
f.write("""set a 1
set b 2
set c [ expr $a + $b ]
""")
tcl.evalfile(os_helper.TESTFN)
tcl.evalfile(filename)
self.assertEqual(tcl.eval('set a'),'1')
self.assertEqual(tcl.eval('set b'),'2')
self.assertEqual(tcl.eval('set c'),'3')

def test_evalfile_null_in_result(self):
tcl = self.interp
with open(os_helper.TESTFN, 'w') as f:
self.addCleanup(os_helper.unlink, os_helper.TESTFN)
filename = os_helper.TESTFN_ASCII
self.addCleanup(os_helper.unlink, filename)
with open(filename, 'w') as f:
f.write("""
set a "a\0b"
set b "a\\0b"
""")
tcl.evalfile(os_helper.TESTFN)
tcl.evalfile(filename)
self.assertEqual(tcl.eval('set a'), 'a\x00b')
self.assertEqual(tcl.eval('set b'), 'a\x00b')

def test_evalfile_surrogates_in_result(self):
    """Check that evalfile() yields one non-BMP character for a surrogate pair.

    The script file sets two variables to the same text: one spells the
    pair as raw bytes (each surrogate encoded separately), the other uses
    Tcl backslash-u escapes.  Both must read back as '<U+1F4BB>'.
    """
    interp = self.interp

    # Switch the Tcl system encoding to UTF-8 for this test only; the
    # cleanup registered first restores whatever value was active before.
    saved_encoding = interp.call('encoding', 'system')
    self.addCleanup(interp.call, 'encoding', 'system', saved_encoding)
    interp.call('encoding', 'system', 'utf-8')

    path = os_helper.TESTFN_ASCII
    self.addCleanup(os_helper.unlink, path)
    # Written in binary mode so the surrogate bytes reach the file untouched.
    script = b"""
set a "<\xed\xa0\xbd\xed\xb2\xbb>"
set b "<\\ud83d\\udcbb>"
"""
    with open(path, 'wb') as script_file:
        script_file.write(script)

    interp.evalfile(path)
    expected = '<\U0001f4bb>'
    for command in ('set a', 'set b'):
        self.assertEqual(interp.eval(command), expected)

def testEvalFileException(self):
tcl = self.interp
filename = "doesnotexists"
Expand Down Expand Up @@ -438,6 +462,11 @@ def passValue(value):
self.assertEqual(passValue('str\x00ing\u20ac'), 'str\x00ing\u20ac')
self.assertEqual(passValue('str\x00ing\U0001f4bb'),
'str\x00ing\U0001f4bb')
if sys.platform != 'win32':
self.assertEqual(passValue('<\udce2\udc82\udcac>'),
'<\u20ac>')
self.assertEqual(passValue('<\udced\udca0\udcbd\udced\udcb2\udcbb>'),
'<\U0001f4bb>')
self.assertEqual(passValue(b'str\x00ing'),
b'str\x00ing' if self.wantobjects else 'str\x00ing')
self.assertEqual(passValue(b'str\xc0\x80ing'),
Expand Down Expand Up @@ -497,6 +526,9 @@ def float_eq(actual, expected):
check('string\xbd')
check('string\u20ac')
check('string\U0001f4bb')
if sys.platform != 'win32':
check('<\udce2\udc82\udcac>', '<\u20ac>')
check('<\udced\udca0\udcbd\udced\udcb2\udcbb>', '<\U0001f4bb>')
check('')
check(b'string', 'string')
check(b'string\xe2\x82\xac', 'string\xe2\x82\xac')
Expand Down Expand Up @@ -540,6 +572,8 @@ def test_splitlist(self):
('a \u20ac', ('a', '\u20ac')),
('a \U0001f4bb', ('a', '\U0001f4bb')),
(b'a \xe2\x82\xac', ('a', '\u20ac')),
(b'a \xf0\x9f\x92\xbb', ('a', '\U0001f4bb')),
(b'a \xed\xa0\xbd\xed\xb2\xbb', ('a', '\U0001f4bb')),
(b'a\xc0\x80b c\xc0\x80d', ('a\x00b', 'c\x00d')),
('a {b c}', ('a', 'b c')),
(r'a b\ c', ('a', 'b c')),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed support of non-BMP characters in :mod:`tkinter` on macOS.
54 changes: 53 additions & 1 deletion Modules/_tkinter.c
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)

char *buf = NULL;
PyErr_Clear();
/* Tcl encodes null character as \xc0\x80 */
/* Tcl encodes null character as \xc0\x80.
https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 */
if (memchr(s, '\xc0', size)) {
char *q;
const char *e = s + size;
Expand All @@ -419,6 +420,57 @@ unicodeFromTclStringAndSize(const char *s, Py_ssize_t size)
if (buf != NULL) {
PyMem_Free(buf);
}
if (r == NULL || PyUnicode_KIND(r) == PyUnicode_1BYTE_KIND) {
return r;
}

/* In CESU-8 non-BMP characters are represented as a surrogate pair,
like in UTF-16, and then each surrogate code point is encoded in UTF-8.
https://en.wikipedia.org/wiki/CESU-8 */
Py_ssize_t len = PyUnicode_GET_LENGTH(r);
Py_ssize_t i, j;
/* All encoded surrogate characters start with \xED. */
i = PyUnicode_FindChar(r, 0xdcED, 0, len, 1);
if (i == -2) {
Py_DECREF(r);
return NULL;
}
if (i == -1) {
return r;
}
Py_UCS4 *u = PyUnicode_AsUCS4Copy(r);
Py_DECREF(r);
if (u == NULL) {
return NULL;
}
Py_UCS4 ch;
for (j = i; i < len; i++, u[j++] = ch) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to reread this loop a couple of times before I understood how the contents of "u" are updated. My C is getting a bit rusty ;-)

The code looks fine though.

Py_UCS4 ch1, ch2, ch3, high, low;
/* High surrogates U+D800 - U+DBFF are encoded as
\xED\xA0\x80 - \xED\xAF\xBF. */
ch1 = ch = u[i];
if (ch1 != 0xdcED) continue;
ch2 = u[i + 1];
if (!(0xdcA0 <= ch2 && ch2 <= 0xdcAF)) continue;
ch3 = u[i + 2];
if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
high = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
/* Low surrogates U+DC00 - U+DFFF are encoded as
\xED\xB0\x80 - \xED\xBF\xBF. */
ch1 = u[i + 3];
if (ch1 != 0xdcED) continue;
ch2 = u[i + 4];
if (!(0xdcB0 <= ch2 && ch2 <= 0xdcBF)) continue;
ch3 = u[i + 5];
if (!(0xdc80 <= ch3 && ch3 <= 0xdcBF)) continue;
low = 0xD000 | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
ch = Py_UNICODE_JOIN_SURROGATES(high, low);
i += 5;
}
r = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, u, j);
PyMem_Free(u);
return r;
}

Expand Down