From 308ac73eb4810f4c8a453547d9a96f9ee651e612 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 15 Jun 2020 01:21:24 +0100 Subject: [PATCH 1/4] bpo-40958: Avoid buffer overflow in the parser when indexing the current line --- .../2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst | 2 ++ Parser/pegen.c | 14 ++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst b/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst new file mode 100644 index 00000000000000..8e36897948f9b4 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-06-15-01-20-44.bpo-40958.7O2Wh1.rst @@ -0,0 +1,2 @@ +Fix a possible buffer overflow in the PEG parser when gathering information +for emitting syntax errors. Patch by Pablo Galindo. diff --git a/Parser/pegen.c b/Parser/pegen.c index e29910bf86ed59..5164e88b0c39ff 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -145,15 +145,12 @@ byte_offset_to_character_offset(PyObject *line, int col_offset) if (!str) { return 0; } + assert(col_offset <= strlen(str)); PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace"); if (!text) { return 0; } Py_ssize_t size = PyUnicode_GET_LENGTH(text); - str = PyUnicode_AsUTF8(text); - if (str != NULL && (int)strlen(str) == col_offset) { - size = strlen(str); - } Py_DECREF(text); return size; } @@ -400,16 +397,17 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, if (!error_line) { Py_ssize_t size = p->tok->inp - p->tok->buf; - if (size && p->tok->buf[size-1] == '\n') { - size--; - } error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); if (!error_line) { goto error; } } - Py_ssize_t col_number = byte_offset_to_character_offset(error_line, col_offset); + Py_ssize_t col_number = col_offset; + + if (p->tok->encoding != NULL) { + col_number = byte_offset_to_character_offset(error_line, col_offset); + } tmp = Py_BuildValue("(OiiN)", p->tok->filename, lineno, col_number, error_line); if (!tmp) { From 4654706483a032ad315b0522df46682e19a36165 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 15 Jun 2020 11:11:12 +0100 Subject: [PATCH 2/4] Use Py_ssize_t in more places and ensure that col_offset is positive --- Parser/pegen.c | 8 ++++---- Parser/pegen.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index 5164e88b0c39ff..a1addb2d2e4675 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -139,13 +139,13 @@ _create_dummy_identifier(Parser *p) } static inline Py_ssize_t -byte_offset_to_character_offset(PyObject *line, int col_offset) +byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) { const char *str = PyUnicode_AsUTF8(line); if (!str) { return 0; } - assert(col_offset <= strlen(str)); + assert(col_offset > 0 && (unsigned long)col_offset <= strlen(str)); PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace"); if (!text) { return 0; @@ -357,7 +357,7 @@ void * _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) { Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1]; - int col_offset; + Py_ssize_t col_offset; if (t->col_offset == -1) { col_offset = Py_SAFE_DOWNCAST(p->tok->cur - p->tok->buf, intptr_t, int); @@ -377,7 +377,7 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) void * _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, - int lineno, int col_offset, + Py_ssize_t lineno, Py_ssize_t col_offset, const char *errmsg, va_list va) { PyObject *value = NULL; diff --git a/Parser/pegen.h b/Parser/pegen.h index 64cf0ec8929135..6f3157dc9750d8 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -132,7 +132,7 @@ void *_PyPegen_string_token(Parser *p); const char *_PyPegen_get_expr_name(expr_ty); void *_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...); void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, - int lineno, int col_offset, + Py_ssize_t lineno, Py_ssize_t col_offset, const char *errmsg, va_list va); void *_PyPegen_dummy_name(Parser *p, ...); From a58c5bb1aa8201d9fa98df0dd8377334517c3a03 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 15 Jun 2020 11:51:49 +0100 Subject: [PATCH 3/4] Allow col_offset=0 --- Parser/pegen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index a1addb2d2e4675..f0a830868b7764 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -145,7 +145,7 @@ byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset) if (!str) { return 0; } - assert(col_offset > 0 && (unsigned long)col_offset <= strlen(str)); + assert(col_offset >= 0 && (unsigned long)col_offset <= strlen(str)); PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace"); if (!text) { return 0; From 626a2eda8d897562a4c302e6515936e7853a4bad Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 15 Jun 2020 13:22:36 +0100 Subject: [PATCH 4/4] Use Py_ssize_t in the Token struct --- Parser/pegen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/pegen.h b/Parser/pegen.h index 6f3157dc9750d8..c4ff8c9d51252d 100644 --- a/Parser/pegen.h +++ b/Parser/pegen.h @@ -34,7 +34,7 @@ typedef struct _memo { typedef struct { int type; PyObject *bytes; - int lineno, col_offset, end_lineno, end_col_offset; + Py_ssize_t lineno, col_offset, end_lineno, end_col_offset; Memo *memo; } Token;