From 4729b3739de30758d8fac2457022395b5ae07e02 Mon Sep 17 00:00:00 2001
From: Daniel Lenski <dlenski@amazon.com>
Date: Mon, 10 Jan 2022 19:32:00 -0800
Subject: [PATCH] bpo-22833: Fix bytes/str inconsistency in
 email.header.decode_header()

This function's possible return types have been non-intuitive and surprising
for the entirety of its Python 3.x history. It can return either:

1. `typing.List[typing.Tuple[bytes, typing.Optional[str]]]`
2. or `typing.List[typing.Tuple[str, None]]`, of length exactly 1

This has meant that any user of this function must be prepared to accept
either `bytes` or `str` for the first member of the 2-tuples it returns,
which is a very surprising behavior in Python 3.x, particularly given
that the second member of the tuple is supposed to represent the
charset/encoding of the first member.

This change eliminates case (2), ensuring that
`email.header.decode_header()` always returns `bytes`, never `str`, as the
first member of the 2-tuples it returns. It also adds a test case to verify
this behavior.
---
 Lib/email/header.py                                  |  4 ++--
 Lib/test/test_email/test_email.py                    | 12 ++++++++++++
 .../Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst |  3 +++
 3 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst

diff --git a/Lib/email/header.py b/Lib/email/header.py
index 4ab0032bc66123..76987daef6d5b2 100644
--- a/Lib/email/header.py
+++ b/Lib/email/header.py
@@ -61,7 +61,7 @@
 def decode_header(header):
     """Decode a message header value without converting charset.
 
-    Returns a list of (string, charset) pairs containing each of the decoded
+    Returns a list of (bytes, charset) pairs containing each of the decoded
     parts of the header.  Charset is None for non-encoded parts of the header,
     otherwise a lower-case string containing the name of the character set
     specified in the encoded string.
@@ -78,7 +78,7 @@ def decode_header(header):
                     for string, charset in header._chunks]
     # If no encoding, just return the header with no charset.
     if not ecre.search(header):
-        return [(header, None)]
+        return [(bytes(header, 'raw-unicode-escape'), None)]
     # First step is to parse all the encoded parts into triplets of the form
     # (encoded_string, encoding, charset).  For unencoded strings, the last
     # two parts will be None.
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py
index a3ccbbbabfb328..d89bd87aaf118f 100644
--- a/Lib/test/test_email/test_email.py
+++ b/Lib/test/test_email/test_email.py
@@ -2432,6 +2432,18 @@ def test_multiline_header(self):
         self.assertEqual(str(make_header(decode_header(s))),
                          '"Müller T" <T.Mueller@xxx.com>')
 
+    def test_unencoded_ascii(self):
+        # bpo-22833: a header with no encoded words is returned as bytes.
+        s = 'header without encoded words'
+        self.assertEqual(decode_header(s),
+            [(b'header without encoded words', None)])
+
+    def test_unencoded_utf8(self):
+        # bpo-22833: U+00E8 comes back as the single byte 0xE8, not as UTF-8.
+        s = 'header with unexpected non ASCII caract\xe8res'
+        self.assertEqual(decode_header(s),
+            [(b'header with unexpected non ASCII caract\xe8res', None)])
+
 
 # Test the MIMEMessage class
 class TestMIMEMessage(TestEmailBase):
diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst
new file mode 100644
index 00000000000000..5ca8dc7da62399
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst
@@ -0,0 +1,3 @@
+The :func:`email.header.decode_header` function now always provides :class:`bytes`,
+never :class:`str`, as the first member of the tuples it returns. Previously, it would
+return (str, None) when decoding a header consisting only of a single, unencoded part.
\ No newline at end of file