From 6795f586837d5141f368533bac6197919a9c7e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cbhuvi27=E2=80=9D?= Date: Tue, 9 Jun 2026 08:19:09 +0530 Subject: [PATCH 1/3] gh-150771: Fix email serialization for shift_jis and euc-jp Convert surrogate-escaped payloads through the input charset before encoding to iso-2022-jp, fixing UnicodeEncodeError when printing messages created with set_content(). --- Lib/email/charset.py | 10 +++++- Lib/email/message.py | 4 ++- Lib/test/test_email/test_contentmanager.py | 32 +++++++++++++++++++ ...-06-09-12-00-00.gh-issue-150771.K7mNx2.rst | 3 ++ 4 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst diff --git a/Lib/email/charset.py b/Lib/email/charset.py index 5981791820e740c..e3ee13c3912241a 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -16,6 +16,7 @@ import email.quoprimime from email import errors +from email import utils from email.encoders import encode_7or8bit @@ -438,5 +439,12 @@ def body_encode(self, string): return email.quoprimime.body_encode(string) else: if isinstance(string, str): - string = string.encode(self.output_charset).decode('ascii') + if utils._has_surrogates(string): + string = string.encode('ascii', 'surrogateescape') + if self.input_charset != self.output_charset: + string = (string.decode(self.input_codec) + .encode(self.output_codec)) + string = string.decode('ascii', 'surrogateescape') + else: + string = string.encode(self.output_charset).decode('ascii') return string diff --git a/Lib/email/message.py b/Lib/email/message.py index 641fb2e944d4311..dff113ea407f07c 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -352,7 +352,9 @@ def set_payload(self, payload, charset=None): return if not isinstance(charset, Charset): charset = Charset(charset) - payload = payload.encode(charset.output_charset, 'surrogateescape') + if not utils._has_surrogates(payload): + payload = payload.encode(charset.output_charset, + 'surrogateescape') if hasattr(payload, 'decode'): self._payload = payload.decode('ascii', 'surrogateescape') else: diff --git a/Lib/test/test_email/test_contentmanager.py b/Lib/test/test_email/test_contentmanager.py index bc0e5d356181591..a28f9c1402984b0 100644 --- a/Lib/test/test_email/test_contentmanager.py +++ b/Lib/test/test_email/test_contentmanager.py @@ -355,6 +355,38 @@ def test_set_text_charset_cp949(self): self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content) self.assertEqual(m.get_content(), content) + def test_set_text_charset_shift_jis(self): + m = self._make_message() + content = "\u65e5\u672c\u8a9e\n" + raw_data_manager.set_content(m, content, charset='shift_jis') + self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"') + self.assertEqual(m['Content-Transfer-Encoding'], '8bit') + self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis')) + self.assertEqual(m.get_content(), content) + # Serialization converts the payload to iso-2022-jp for output. + self.assertEqual(str(m), textwrap.dedent("""\ + Content-Type: text/plain; charset="iso-2022-jp" + Content-Transfer-Encoding: 7bit + + \x1b$BF|K\\8l\x1b(B + """)) + + def test_set_text_charset_euc_jp(self): + m = self._make_message() + content = "\u65e5\u672c\u8a9e\n" + raw_data_manager.set_content(m, content, charset='euc-jp') + self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"') + self.assertEqual(m['Content-Transfer-Encoding'], '8bit') + self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp')) + self.assertEqual(m.get_content(), content) + # Serialization converts the payload to iso-2022-jp for output. + self.assertEqual(str(m), textwrap.dedent("""\ + Content-Type: text/plain; charset="iso-2022-jp" + Content-Transfer-Encoding: 7bit + + \x1b$BF|K\\8l\x1b(B + """)) + def test_set_text_plain_long_line_heuristics(self): m = self._make_message() content = ("Simple but long message that is over 78 characters" diff --git a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst new file mode 100644 index 000000000000000..79d724f354f237d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst @@ -0,0 +1,3 @@ +Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp`` +charsets. Converting surrogate-escaped payloads to the required +``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`. From f6c53cb11c9617db20f6bd2f235b7b9fc1b79fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cbhuvi27=E2=80=9D?= Date: Sun, 14 Jun 2026 08:15:17 +0530 Subject: [PATCH 2/3] gh-150771: Use output charset in set_text_content for shift_jis/euc-jp Encode the payload with the charset output mapping (iso-2022-jp) when set_content is called with shift_jis or euc-jp, instead of patching serialization in body_encode and set_payload. Reverts those changes. --- Lib/email/charset.py | 10 +--------- Lib/email/contentmanager.py | 3 ++- Lib/email/message.py | 4 +--- Lib/test/test_email/test_contentmanager.py | 12 ++++-------- .../2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst | 7 ++++--- 5 files changed, 12 insertions(+), 24 deletions(-) diff --git a/Lib/email/charset.py b/Lib/email/charset.py index e3ee13c3912241a..5981791820e740c 100644 --- a/Lib/email/charset.py +++ b/Lib/email/charset.py @@ -16,7 +16,6 @@ import email.quoprimime from email import errors -from email import utils from email.encoders import encode_7or8bit @@ -439,12 +438,5 @@ def body_encode(self, string): return email.quoprimime.body_encode(string) else: if isinstance(string, str): - if utils._has_surrogates(string): - string = string.encode('ascii', 'surrogateescape') - if self.input_charset != self.output_charset: - string = (string.decode(self.input_codec) - .encode(self.output_codec)) - string = string.decode('ascii', 'surrogateescape') - else: - string = string.encode(self.output_charset).decode('ascii') + string = string.encode(self.output_charset).decode('ascii') return string diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py index faf2626bccce651..c0090af716575d7 100644 --- a/Lib/email/contentmanager.py +++ b/Lib/email/contentmanager.py @@ -174,7 +174,8 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None, params=None, headers=None): _prepare_set(msg, 'text', subtype, headers) - charset = email.charset.Charset(charset).input_charset + cs = email.charset.Charset(charset) + charset = cs.output_charset cte, payload = _encode_text(string, charset, cte, msg.policy) msg.set_payload(payload) msg.set_param('charset', charset, replace=True) diff --git a/Lib/email/message.py b/Lib/email/message.py index dff113ea407f07c..641fb2e944d4311 100644 --- a/Lib/email/message.py +++ b/Lib/email/message.py @@ -352,9 +352,7 @@ def set_payload(self, payload, charset=None): return if not isinstance(charset, Charset): charset = Charset(charset) - if not utils._has_surrogates(payload): - payload = payload.encode(charset.output_charset, - 'surrogateescape') + payload = payload.encode(charset.output_charset, 'surrogateescape') if hasattr(payload, 'decode'): self._payload = payload.decode('ascii', 'surrogateescape') else: diff --git a/Lib/test/test_email/test_contentmanager.py b/Lib/test/test_email/test_contentmanager.py index e1c2db0e2a2b0a1..ec3915447107320 100644 --- a/Lib/test/test_email/test_contentmanager.py +++ b/Lib/test/test_email/test_contentmanager.py @@ -366,11 +366,9 @@ def test_set_text_charset_shift_jis(self): m = self._make_message() content = "\u65e5\u672c\u8a9e\n" raw_data_manager.set_content(m, content, charset='shift_jis') - self.assertEqual(m['Content-Type'], 'text/plain; charset="shift_jis"') - self.assertEqual(m['Content-Transfer-Encoding'], '8bit') - self.assertEqual(m.get_payload(decode=True), content.encode('shift_jis')) + self.assertEqual(m['Content-Type'], 'text/plain; charset="iso-2022-jp"') + self.assertEqual(m.get_payload(decode=True), content.encode('iso-2022-jp')) self.assertEqual(m.get_content(), content) - # Serialization converts the payload to iso-2022-jp for output. self.assertEqual(str(m), textwrap.dedent("""\ Content-Type: text/plain; charset="iso-2022-jp" Content-Transfer-Encoding: 7bit @@ -382,11 +380,9 @@ def test_set_text_charset_euc_jp(self): m = self._make_message() content = "\u65e5\u672c\u8a9e\n" raw_data_manager.set_content(m, content, charset='euc-jp') - self.assertEqual(m['Content-Type'], 'text/plain; charset="euc-jp"') - self.assertEqual(m['Content-Transfer-Encoding'], '8bit') - self.assertEqual(m.get_payload(decode=True), content.encode('euc-jp')) + self.assertEqual(m['Content-Type'], 'text/plain; charset="iso-2022-jp"') + self.assertEqual(m.get_payload(decode=True), content.encode('iso-2022-jp')) self.assertEqual(m.get_content(), content) - # Serialization converts the payload to iso-2022-jp for output. self.assertEqual(str(m), textwrap.dedent("""\ Content-Type: text/plain; charset="iso-2022-jp" Content-Transfer-Encoding: 7bit diff --git a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst index 79d724f354f237d..43d4c2006a6d892 100644 --- a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst +++ b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst @@ -1,3 +1,4 @@ -Fix serialization of :mod:`email` messages using ``shift_jis`` or ``euc-jp`` -charsets. Converting surrogate-escaped payloads to the required -``iso-2022-jp`` output charset no longer raises :exc:`UnicodeEncodeError`. +Fix :mod:`email` messages created with ``shift_jis`` or ``euc-jp`` charsets. +:func:`email.contentmanager.set_text_content` now stores the payload using +the output charset (``iso-2022-jp``) so :func:`str` on the message no longer +raises :exc:`UnicodeEncodeError`. From e2f3984ac4fc8847b896c35edf575a024ec1c40d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cbhuvi27=E2=80=9D?= Date: Sun, 14 Jun 2026 08:52:29 +0530 Subject: [PATCH 3/3] gh-150771: Fix NEWS entry doc references for docs CI Use plain backticks for set_content() instead of a broken :func: target. --- .../Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst index 43d4c2006a6d892..6535e5c48bf0360 100644 --- a/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst +++ b/Misc/NEWS.d/next/Library/2026-06-09-12-00-00.gh-issue-150771.K7mNx2.rst @@ -1,4 +1,4 @@ Fix :mod:`email` messages created with ``shift_jis`` or ``euc-jp`` charsets. -:func:`email.contentmanager.set_text_content` now stores the payload using -the output charset (``iso-2022-jp``) so :func:`str` on the message no longer -raises :exc:`UnicodeEncodeError`. +``set_content()`` now stores the payload using the output charset +(``iso-2022-jp``) so printing the message no longer raises +:exc:`UnicodeEncodeError`.