diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst index 689e7397cbcf1f..d9254039d882a1 100644 --- a/Doc/library/email.errors.rst +++ b/Doc/library/email.errors.rst @@ -59,6 +59,15 @@ The following exception classes are defined in the :mod:`email.errors` module: headers. +.. exception:: InvalidMailboxError() + + Raised when serializing a message with an address header that contains + a mailbox incompatible with the policy in use. + (See :attr:`email.policy.EmailPolicy.utf8`.) + + .. versionadded:: 3.14 + + .. exception:: MessageDefect() This is the base class for all defects found when parsing email messages. diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 6b997ee784f6e4..a3e0065cfe469d 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -406,11 +406,22 @@ added matters. To illustrate:: .. attribute:: utf8 If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in - headers by encoding them as "encoded words". If ``True``, follow - :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + headers by encoding them as :rfc:`2047` "encoded words". If ``True``, + follow :rfc:`6532` and use ``utf-8`` encoding for headers. Messages formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). + When ``False``, the generator will raise an + :exc:`~email.errors.InvalidMailboxError` if any address header includes + a mailbox ("addr-spec") with non-ASCII characters. To use a mailbox with + an internationalized domain name, first encode the domain using the + third-party :pypi:`idna` or :pypi:`uts46` module or with + :mod:`encodings.idna`. It is not possible to use a non-ASCII username + ("local-part") in a mailbox when ``utf8=False``. + + .. versionchanged:: 3.14 + Raises :exc:`~email.errors.InvalidMailboxError`. (Earlier versions + incorrectly applied :rfc:`2047` to non-ASCII addr-specs.) .. 
attribute:: refold_source diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 9a51b9437333db..bff9beb32aab6b 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2837,6 +2837,17 @@ def _refold_parse_tree(parse_tree, *, policy): _fold_mime_parameters(part, lines, maxlen, encoding) continue + if want_encoding and part.token_type == 'addr-spec': + # RFC2047 forbids encoded-word in any part of an addr-spec. + if charset == 'unknown-8bit': + # Non-ASCII addr-spec came from parsed message; leave unchanged. + want_encoding = False + else: + raise errors.InvalidMailboxError( + "Non-ASCII address requires policy with utf8=True:" + " '{}'".format(part) + ) + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False diff --git a/Lib/email/errors.py b/Lib/email/errors.py index 6bc744bd59c5bb..e5601132d024fe 100644 --- a/Lib/email/errors.py +++ b/Lib/email/errors.py @@ -33,6 +33,10 @@ class HeaderWriteError(MessageError): """Error while writing headers.""" +class InvalidMailboxError(MessageError, ValueError): + """A mailbox was not compatible with the policy in use.""" + + # These are parsing defects which the parser was able to work around. class MessageDefect(ValueError): """Base class for a message defect.""" diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index c75a842c33578e..f28cbf1ebdcb3b 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -1,4 +1,5 @@ import io +import re import textwrap import unittest from email import message_from_string, message_from_bytes @@ -288,6 +289,30 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_non_ascii_addr_spec_raises(self): + # RFC2047 encoded-word is not permitted in any part of an addr-spec. + # (See also test_non_ascii_addr_spec_preserved below.) 
+ g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) + cases = [ + 'wők@example.com', + 'wok@exàmple.com', + 'wők@exàmple.com', + '"Name, for display" <wők@example.com>', + 'Näyttönimi <wők@example.com>', + ] + for address in cases: + with self.subTest(address=address): + msg = EmailMessage() + msg['To'] = address + addr_spec = msg['To'].addresses[0].addr_spec + expected_error = ( + fr"(?i)(?=.*non-ascii)(?=.*utf8.*True)(?=.*{re.escape(addr_spec)})" + ) + with self.assertRaisesRegex( + email.errors.InvalidMailboxError, expected_error + ): + g.flatten(msg) + class TestGenerator(TestGeneratorBase, TestEmailBase): @@ -432,12 +457,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self): def test_smtputf8_policy(self): msg = EmailMessage() - msg['From'] = "Páolo <főo@bar.com>" + msg['From'] = "Páolo <főo@bàr.com>" msg['To'] = 'Dinsdale' msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' msg.set_content("oh là là, know what I mean, know what I mean?") expected = textwrap.dedent("""\ - From: Páolo <főo@bar.com> + From: Páolo <főo@bàr.com> To: Dinsdale Subject: Nudge nudge, wink, wink \u1F609 Content-Type: text/plain; charset="utf-8" @@ -472,6 +497,37 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_non_ascii_addr_spec_preserved(self): + # A defective non-ASCII addr-spec parsed from the original + # message is left unchanged when flattening. + # (See also test_non_ascii_addr_spec_raises above.) + source = ( + 'To: jörg@example.com, "But a long name still works with refold_source" <jörg@example.com>' + ).encode() + expected = ( + b'To: j\xc3\xb6rg@example.com,\n' + b' "But a long name still works with refold_source" <j\xc3\xb6rg@example.com>\n' + b'\n' + ) + msg = message_from_bytes(source, policy=policy.default) + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_idna_encoding_preserved(self): + # Nothing tries to decode a pre-encoded IDNA domain.
+ msg = EmailMessage() + msg["To"] = Address( + username='jörg', + domain='☕.example'.encode('idna').decode() # IDNA 2003 + ) + expected = 'To: jörg@xn--53h.example\n\n'.encode() + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default.clone(utf8=True)) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main() diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst new file mode 100644 index 00000000000000..fb1574fb4ef709 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-22-10.gh-issue-83938.TtUa-c.rst @@ -0,0 +1,5 @@ +The :mod:`email` module no longer incorrectly encodes non-ASCII characters +in email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` +this means the addresses will be correctly passed through. Under a policy with +``utf8=False``, attempting to serialize a message with non-ASCII email addresses +will now result in an :exc:`~email.errors.InvalidMailboxError`. diff --git a/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst new file mode 100644 index 00000000000000..fb1574fb4ef709 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-31-17-23-06.gh-issue-122476.TtUa-c.rst @@ -0,0 +1,5 @@ +The :mod:`email` module no longer incorrectly encodes non-ASCII characters +in email addresses using :rfc:`2047` encoding. Under a policy with ``utf8=True`` +this means the addresses will be correctly passed through. Under a policy with +``utf8=False``, attempting to serialize a message with non-ASCII email addresses +will now result in an :exc:`~email.errors.InvalidMailboxError`.