From 4729b3739de30758d8fac2457022395b5ae07e02 Mon Sep 17 00:00:00 2001 From: Daniel Lenski Date: Mon, 10 Jan 2022 19:32:00 -0800 Subject: [PATCH] bpo-22833: Fix bytes/str inconsistency in email.header.decode_header() This function's possible return types have been non-intuitive and surprising for the entirety of its Python 3.x history. It can return either: 1. `typing.List[typing.Tuple[bytes, typing.Optional[str]]]` 2. or `typing.List[typing.Tuple[str, None]]`, of length exactly 1 This has meant that any user of this function must be prepared to accept either `bytes` or `str` for the first member of the 2-tuples it returns, which is a very surprising behavior in Python 3.x, particularly given that the second member of the tuple is supposed to represent the charset/encoding of the first member. This change eliminates case (2), ensuring that `email.header.decode_header()` always returns `bytes`, never `str`, as the first member of the 2-tuples it returns. It also adds a test case to verify this behavior. --- Lib/email/header.py | 4 ++-- Lib/test/test_email/test_email.py | 12 ++++++++++++ .../Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst | 3 +++ 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst diff --git a/Lib/email/header.py b/Lib/email/header.py index 4ab0032bc66123..76987daef6d5b2 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -61,7 +61,7 @@ def decode_header(header): """Decode a message header value without converting charset. - Returns a list of (string, charset) pairs containing each of the decoded + Returns a list of (bytes, charset) pairs containing each of the decoded parts of the header. Charset is None for non-encoded parts of the header, otherwise a lower-case string containing the name of the character set specified in the encoded string. 
@@ -78,7 +78,7 @@ def decode_header(header): for string, charset in header._chunks] # If no encoding, just return the header with no charset. if not ecre.search(header): - return [(header, None)] + return [(bytes(header, 'raw-unicode-escape'), None)] # First step is to parse all the encoded parts into triplets of the form # (encoded_string, encoding, charset). For unencoded strings, the last # two parts will be None. diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index a3ccbbbabfb328..d89bd87aaf118f 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -2432,6 +2432,18 @@ def test_multiline_header(self): self.assertEqual(str(make_header(decode_header(s))), '"Müller T" ') + def test_unencoded_ascii(self): + # issue 22833 + s = 'header without encoded words' + self.assertEqual(decode_header(s), + [(b'header without encoded words', None)]) + + def test_unencoded_utf8(self): + # issue 22833 + s = 'header with unexpected non ASCII caract\xe8res' + self.assertEqual(decode_header(s), + [(b'header with unexpected non ASCII caract\xe8res', None)]) + # Test the MIMEMessage class class TestMIMEMessage(TestEmailBase): diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst new file mode 100644 index 00000000000000..5ca8dc7da62399 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst @@ -0,0 +1,3 @@ +The :func:`email.header.decode_header` function now always provides :class:`bytes`, +never :class:`str`, as the first member of the tuples it returns. Previously, it would +return (str, None) when decoding a header consisting only of a single, unencoded part. \ No newline at end of file