Fix exception handling in file tokenizer

johnslavik · johnslavik · commit 8b9505f93227 · 2026-06-14T05:05:27.000+02:00
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
@@ -387,8 +387,7 @@ def test_utf8_non_utf8_third_line_error(self):
                b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
-                br"'utf-8' codec can't decode byte|"
-                br"encoding problem: utf8")
+                br"'utf-8' codec can't decode byte")
 
     def test_crlf(self):
         src = (b'print(ascii("""\r\n"""))\n')
@@ -540,6 +539,15 @@ def check_script_error(self, src, expected, lineno=...):
                 line = line.removeprefix('\ufeff')
             self.assertIn(line.encode(), err)
 
+    def test_coding_spec_unknown_encoding(self):
+        src = (b'# coding: dict-unpacking-at-home\n'
+               b'{foo} = {"foo": "bar"}\n')
+        self.check_script_error(src, br"unknown encoding: dict-unpacking-at-home")
+
+    def test_coding_spec_decode_error(self):
+        src = (b'# coding: shift-jis\n'
+               b'print("\xc4\x85")\n')
+        self.check_script_error(src, br"'shift_jis' codec can't decode byte 0x85")
 
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-06-14-05-05-15.gh-issue-151461.5q0s88.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-06-14-05-05-15.gh-issue-151461.5q0s88.rst
@@ -0,0 +1,3 @@
+Fixed the tokenizer to no longer show a generic ``SyntaxError: encoding
+problem`` message. It now distinguishes whether a codec was not found or a
+file could not be decoded. Patch by Bartosz Sławecki.
diff --git a/Parser/tokenizer/helpers.c b/Parser/tokenizer/helpers.c
@@ -2,6 +2,7 @@
 #include "errcode.h"
 #include "pycore_token.h"
 
+#include "../pegen.h"
 #include "../lexer/state.h"
 
 
@@ -419,7 +420,7 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
         assert(tok->decoding_readline == NULL);
         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
             _PyTokenizer_error_ret(tok);
-            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
+            _PyPegen_raise_tokenizer_init_error(tok->filename);
             PyMem_Free(cs);
             return 0;
         }

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Fixed the tokenizer to no longer show a generic ``SyntaxError: encoding
	`2`	+problem`` message. It now distinguishes whether a codec was not found or a
	`3`	`+file could not be decoded. Patch by Bartosz Sławecki.`