From c540c9f6a5e1d7ebe14e2a4b737e8b1e76a7bb1b Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Tue, 6 Aug 2024 12:25:31 -0700 Subject: [PATCH 1/3] gh-121284: Fix email address header folding with parsed encoded-word Email generators using email.policy.default may convert an RFC 2047 encoded-word to unencoded form during header refolding. In a structured header, this could allow 'specials' chars outside a quoted-string, leading to invalid address headers and enabling spoofing. This change ensures a parsed encoded-word that contains specials is kept as an encoded-word while the header is refolded. --- Lib/email/_header_value_parser.py | 11 ++++++-- .../test_email/test__header_value_parser.py | 25 +++++++++++++++++++ ...-08-06-12-27-34.gh-issue-121284.8rwPxe.rst | 4 +++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ec2215a5e5f33c..19c6ae96ffba3a 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2829,6 +2829,13 @@ def _refold_parse_tree(parse_tree, *, policy): _fold_mime_parameters(part, lines, maxlen, encoding) continue + allow_refolding_subparts = True + if part.token_type == 'encoded-word': + # A parsed encoded-word containing specials must remain encoded, + # to keep specials from sneaking into a structured header unquoted. + # (The encoded-word can be split for folding.) + allow_refolding_subparts = SPECIALSNL.isdisjoint(tstr) + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False @@ -2848,7 +2855,7 @@ def _refold_parse_tree(parse_tree, *, policy): # want it on a line by itself even if it fits, or it # doesn't fit on a line by itself. Either way, fall through # to unpacking the subparts and wrapping them. 
- if not hasattr(part, 'encode'): + if allow_refolding_subparts and not hasattr(part, 'encode'): # It's not a Terminal, do each piece individually. parts = list(part) + parts want_encoding = False @@ -2902,7 +2909,7 @@ def _refold_parse_tree(parse_tree, *, policy): leading_whitespace = ''.join(whitespace_accumulator) last_ew = None continue - if not hasattr(part, 'encode'): + if allow_refolding_subparts and not hasattr(part, 'encode'): # It's not a terminal, try folding the subparts. newparts = list(part) if not part.as_ew_allowed: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 5413319a414a62..487b316439474a 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3076,6 +3076,31 @@ def test_address_list_with_unicode_names_in_quotes(self): '=?utf-8?q?H=C3=BCbsch?= Kaktus ,\n' ' =?utf-8?q?bei=C3=9Ft_bei=C3=9Ft?= \n') + def test_address_list_with_specials_in_encoded_word(self): + # An encoded-word parsed from a structured header must remain + # encoded when it contains specials. Regression for gh-121284. + policy = self.policy.clone(max_line_length=40) + cases = [ + # (to, folded) + ('=?utf-8?q?A_v=C3=A9ry_long_name_with=2C_comma?= ', + '=?utf-8?q?A_v=C3=A9ry_long_name_with?=\n' + ' =?utf-8?q?=2C_comma?= \n'), + ('=?utf-8?q?This_long_name_does_not_need_encoded=2Dword?= ', + 'This long name does not need\n' + ' encoded-word \n'), + ('"A véry long name with, comma" ', + # (This isn't the best fold point, but it's not invalid.) 
+ 'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n' + ' =?utf-8?q?=2C?= comma \n'), + ('"A véry long name containing a, comma" ', + 'A =?utf-8?q?v=C3=A9ry?= long name\n' + ' containing =?utf-8?q?a=2C?= comma\n' + ' \n'), + ] + for (to, folded) in cases: + with self.subTest(to=to): + self._test(parser.get_address_list(to)[0], folded, policy=policy) + def test_address_list_with_list_separator_after_fold(self): a = 'x' * 66 + '@example.com' to = f'{a}, "Hübsch Kaktus" ' diff --git a/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst new file mode 100644 index 00000000000000..1d6cd5529d0b5d --- /dev/null +++ b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst @@ -0,0 +1,4 @@ +Fix a problem where email.policy.default header refolding could incorrectly +convert an RFC 2047 encoded-word containing commas or other special +characters to unencoded, unquoted text, enabling sender or recipient +spoofing via a carefully crafted display-name. From 45159cd073c5ec45edd374842a06aa3cdae25aba Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Sat, 18 Jan 2025 14:18:33 -0800 Subject: [PATCH 2/3] Be more specific in news --- .../2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst index 1d6cd5529d0b5d..923e91170d355f 100644 --- a/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst +++ b/Misc/NEWS.d/next/Security/2024-08-06-12-27-34.gh-issue-121284.8rwPxe.rst @@ -1,4 +1,7 @@ -Fix a problem where email.policy.default header refolding could incorrectly -convert an RFC 2047 encoded-word containing commas or other special -characters to unencoded, unquoted text, enabling sender or recipient -spoofing via a carefully crafted display-name. 
+Fix bug in the folding of RFC 2047 encoded-words when flattening an email message +using a modern email policy. Previously, when an encoded-word was too long +for a line, it would be decoded, split across lines, and re-encoded. But commas +and other special characters in the original text could be left unencoded and +unquoted. This could theoretically be used to spoof header lines using +a carefully constructed encoded-word if the resulting rendered email was +transmitted or re-parsed. From c249511d5e81d523a260d7369b1518574ddf5aca Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Wed, 5 Mar 2025 10:57:10 -0800 Subject: [PATCH 3/3] squash! gh-121284: Fix email address header folding with parsed encoded-word [Better fix from @bitdancer.] Co-authored-by: R David Murray --- Lib/email/_header_value_parser.py | 21 +++++++------------ .../test_email/test__header_value_parser.py | 4 ++-- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index c0e856306d3ddc..9a51b9437333db 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1053,7 +1053,7 @@ def get_fws(value): fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') return fws, newvalue -def get_encoded_word(value): +def get_encoded_word(value, terminal_type='vtext'): """ encoded-word = "=?" charset "?" encoding "?" 
encoded-text "?=" """ @@ -1092,7 +1092,7 @@ def get_encoded_word(value): ew.append(token) continue chars, *remainder = _wsp_splitter(text, 1) - vtext = ValueTerminal(chars, 'vtext') + vtext = ValueTerminal(chars, terminal_type) _validate_xtext(vtext) ew.append(vtext) text = ''.join(remainder) @@ -1134,7 +1134,7 @@ def get_unstructured(value): valid_ew = True if value.startswith('=?'): try: - token, value = get_encoded_word(value) + token, value = get_encoded_word(value, 'utext') except _InvalidEwError: valid_ew = False except errors.HeaderParseError: @@ -1163,7 +1163,7 @@ def get_unstructured(value): # the parser to go in an infinite loop. if valid_ew and rfc2047_matcher.search(tok): tok, *remainder = value.partition('=?') - vtext = ValueTerminal(tok, 'vtext') + vtext = ValueTerminal(tok, 'utext') _validate_xtext(vtext) unstructured.append(vtext) value = ''.join(remainder) @@ -2813,7 +2813,7 @@ def _refold_parse_tree(parse_tree, *, policy): continue tstr = str(part) if not want_encoding: - if part.token_type == 'ptext': + if part.token_type in ('ptext', 'vtext'): # Encode if tstr contains special characters. want_encoding = not SPECIALSNL.isdisjoint(tstr) else: @@ -2837,13 +2837,6 @@ def _refold_parse_tree(parse_tree, *, policy): _fold_mime_parameters(part, lines, maxlen, encoding) continue - allow_refolding_subparts = True - if part.token_type == 'encoded-word': - # A parsed encoded-word containing specials must remain encoded, - # to keep specials from sneaking into a structured header unquoted. - # (The encoded-word can be split for folding.) - allow_refolding_subparts = SPECIALSNL.isdisjoint(tstr) - if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False @@ -2863,7 +2856,7 @@ def _refold_parse_tree(parse_tree, *, policy): # want it on a line by itself even if it fits, or it # doesn't fit on a line by itself. Either way, fall through # to unpacking the subparts and wrapping them. 
- if allow_refolding_subparts and not hasattr(part, 'encode'): + if not hasattr(part, 'encode'): # It's not a Terminal, do each piece individually. parts = list(part) + parts want_encoding = False @@ -2917,7 +2910,7 @@ def _refold_parse_tree(parse_tree, *, policy): leading_whitespace = ''.join(whitespace_accumulator) last_ew = None continue - if allow_refolding_subparts and not hasattr(part, 'encode'): + if not hasattr(part, 'encode'): # It's not a terminal, try folding the subparts. newparts = list(part) if part.token_type == 'bare-quoted-string': diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index da7d4e8d864317..ac12c3b2306f7d 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -3083,8 +3083,8 @@ def test_address_list_with_specials_in_encoded_word(self): cases = [ # (to, folded) ('=?utf-8?q?A_v=C3=A9ry_long_name_with=2C_comma?= ', - '=?utf-8?q?A_v=C3=A9ry_long_name_with?=\n' - ' =?utf-8?q?=2C_comma?= \n'), + 'A =?utf-8?q?v=C3=A9ry_long_name_with?=\n' + ' =?utf-8?q?=2C?= comma \n'), ('=?utf-8?q?This_long_name_does_not_need_encoded=2Dword?= ', 'This long name does not need\n' ' encoded-word \n'),