From d701ca0db2e14a349571c06a1e919d37d801a7ed Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Thu, 26 Feb 2026 21:33:35 +0000 Subject: [PATCH 1/3] gh-145234: Normalize decoded CR in string tokenizer --- Lib/test/test_py_compile.py | 8 ++++++++ .../2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst | 3 +++ Parser/tokenizer/string_tokenizer.c | 12 ++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst diff --git a/Lib/test/test_py_compile.py b/Lib/test/test_py_compile.py index 66de61930968e4..da2d630d7ace7b 100644 --- a/Lib/test/test_py_compile.py +++ b/Lib/test/test_py_compile.py @@ -239,6 +239,14 @@ def test_quiet(self): with self.assertRaises(py_compile.PyCompileError): py_compile.compile(bad_coding, self.pyc_path, doraise=True, quiet=1) + def test_utf7_decoded_cr_compiles(self): + with open(self.source_path, 'wb') as file: + file.write(b"#coding=U7+AA0''\n") + + pyc_path = py_compile.compile(self.source_path, self.pyc_path, doraise=True) + self.assertEqual(pyc_path, self.pyc_path) + self.assertTrue(os.path.exists(self.pyc_path)) + class PyCompileTestsWithSourceEpoch(PyCompileTestsBase, unittest.TestCase, diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst new file mode 100644 index 00000000000000..698bda8ad0a089 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst @@ -0,0 +1,3 @@ +Fixed a ``SystemError`` in the parser when an encoding cookie (for example, +UTF-7) decodes to carriage returns (``\r``). Newlines are now normalized after +decoding in the string tokenizer. 
diff --git a/Parser/tokenizer/string_tokenizer.c b/Parser/tokenizer/string_tokenizer.c index 7299ecf483ccd9..9119d77c0fe21d 100644 --- a/Parser/tokenizer/string_tokenizer.c +++ b/Parser/tokenizer/string_tokenizer.c @@ -108,6 +108,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) { return _PyTokenizer_error_ret(tok); } + if (utf8 != NULL) { + char *translated = _PyTokenizer_translate_newlines( + str, single, preserve_crlf, tok); + if (translated == NULL) { + Py_DECREF(utf8); + return _PyTokenizer_error_ret(tok); + } + PyMem_Free(tok->input); + tok->input = translated; + str = translated; + Py_CLEAR(utf8); + } assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; From 6545f1f162000e40864096df8ab48df7374b6c65 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Thu, 26 Feb 2026 21:35:32 +0000 Subject: [PATCH 2/3] gh-145234: Add patch attribution in NEWS entry --- .../2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst index 698bda8ad0a089..caeffff0be8a85 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst @@ -1,3 +1,5 @@ Fixed a ``SystemError`` in the parser when an encoding cookie (for example, UTF-7) decodes to carriage returns (``\r``). Newlines are now normalized after decoding in the string tokenizer. + +Patch by Pablo Galindo. 
From cb60a1642dced9d82a80663438bb37429fe8a996 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Thu, 26 Feb 2026 22:18:08 +0000 Subject: [PATCH 3/3] gh-145234: Keep tokenizer source pointer in sync after UTF-8 translation --- Parser/tokenizer/string_tokenizer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Parser/tokenizer/string_tokenizer.c b/Parser/tokenizer/string_tokenizer.c index 9119d77c0fe21d..7f07cca37ee019 100644 --- a/Parser/tokenizer/string_tokenizer.c +++ b/Parser/tokenizer/string_tokenizer.c @@ -120,6 +120,7 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr str = translated; Py_CLEAR(utf8); } + tok->str = str; assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str;