Skip to content

Commit b66863d

Browse files
committed
use UCS4 instead of UTF8
1 parent 8e5e00b commit b66863d

File tree

1 file changed

+41
-28
lines changed

1 file changed

+41
-28
lines changed

Modules/_json.c

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -303,12 +303,11 @@ escape_unicode(PyObject *pystr)
303303
return rval;
304304
}
305305

306-
#define ESCAPE_BUF_SIZE 200
307-
308306
// Take a PyUnicode pystr and write an escaped string to writer. (ensure_ascii)
309307
static int
310308
write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr)
311309
{
310+
#define ESCAPE_BUF_SIZE 200
312311
Py_ssize_t i;
313312
Py_ssize_t input_chars;
314313
Py_ssize_t buf_len;
@@ -367,60 +366,74 @@ static int
367366
write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr)
368367
{
369368
Py_ssize_t i;
370-
Py_ssize_t input_size;
371-
Py_ssize_t buf_len;
372-
const unsigned char *input;
369+
Py_ssize_t input_chars;
370+
Py_ssize_t chars = 0;
371+
const void *input;
372+
int kind;
373373
int ret;
374-
unsigned char c = 0;
375-
char buf[ESCAPE_BUF_SIZE];
374+
Py_UCS4 output[ESCAPE_BUF_SIZE];
376375

377-
// We don't need to escape non-ASCII chars.
378-
// So we just copy UTF-8 from pystr to buf.
379-
input = (const unsigned char*) PyUnicode_AsUTF8AndSize(pystr, &input_size);
376+
input_chars = PyUnicode_GET_LENGTH(pystr);
377+
input = PyUnicode_DATA(pystr);
378+
kind = PyUnicode_KIND(pystr);
380379

381380
ret = PyUnicodeWriter_WriteChar(writer, '"');
382381
if (ret) return ret;
383382

384383
// Fast path for strings that don't need any escaping at all: e.g. "id", "name"
385-
for (i = 0; i < input_size; i++) {
386-
c = input[i];
384+
for (i = 0; i < input_chars; i++) {
385+
Py_UCS4 c = PyUnicode_READ(kind, input, i);
387386
if (c <= 0x1f || c == '\\' || c == '"') {
388387
break;
389388
}
390389
}
391390
if (i > 0) {
392-
ret = PyUnicodeWriter_WriteUTF8(writer, (const char *)input, i);
391+
ret = PyUnicodeWriter_WriteSubstring(writer, pystr, 0, i);
393392
if (ret) return ret;
394393
}
395-
if (i == input_size) {
394+
if (i == input_chars) {
396395
return PyUnicodeWriter_WriteChar(writer, '"');
397396
}
398397

399-
buf_len = ascii_escape_unichar(c, (unsigned char *)buf, 0);
398+
for (; i < input_chars; i++) {
399+
Py_UCS4 c = PyUnicode_READ(kind, input, i);
400400

401-
for (i++; i < input_size; i++) {
402-
c = input[i];
403-
if (c <= 0x1f || c == '\\' || c == '"') {
404-
buf_len = ascii_escape_unichar(c, (unsigned char *)buf, buf_len);
405-
}
406-
else {
407-
buf[buf_len++] = c;
401+
// Same as ENCODE_OUTPUT in escape_unicode
402+
switch (c) {
403+
case '\\': output[chars++] = '\\'; output[chars++] = c; break;
404+
case '"': output[chars++] = '\\'; output[chars++] = c; break;
405+
case '\b': output[chars++] = '\\'; output[chars++] = 'b'; break;
406+
case '\f': output[chars++] = '\\'; output[chars++] = 'f'; break;
407+
case '\n': output[chars++] = '\\'; output[chars++] = 'n'; break;
408+
case '\r': output[chars++] = '\\'; output[chars++] = 'r'; break;
409+
case '\t': output[chars++] = '\\'; output[chars++] = 't'; break;
410+
default:
411+
if (c <= 0x1f) {
412+
output[chars++] = '\\';
413+
output[chars++] = 'u';
414+
output[chars++] = '0';
415+
output[chars++] = '0';
416+
output[chars++] = Py_hexdigits[(c >> 4) & 0xf];
417+
output[chars++] = Py_hexdigits[(c ) & 0xf];
418+
} else {
419+
output[chars++] = c;
420+
}
408421
}
409422

410-
if (buf_len + 6 > ESCAPE_BUF_SIZE) {
411-
ret = PyUnicodeWriter_WriteUTF8(writer, buf, buf_len);
423+
if (chars + 6 > ESCAPE_BUF_SIZE) {
424+
ret = PyUnicodeWriter_WriteUCS4(writer, output, chars);
412425
if (ret) return ret;
413-
buf_len = 0;
426+
chars = 0;
414427
}
415428
}
416429

417430
assert(buf_len < ESCAPE_BUF_SIZE);
418-
buf[buf_len++] = '"';
419-
return PyUnicodeWriter_WriteUTF8(writer, buf, buf_len);
431+
output[chars++] = '"';
432+
return PyUnicodeWriter_WriteUCS4(writer, output, chars);
420433
}
421-
422434
#undef ESCAPE_BUF_SIZE
423435

436+
424437
static void
425438
raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end)
426439
{

0 commit comments

Comments
 (0)