Skip to content

Commit 8b9505f

Browse files
committed
Fix exception handling in file tokenizer
1 parent 6679ac0 commit 8b9505f

3 files changed

Lines changed: 15 additions & 3 deletions

File tree

Lib/test/test_source_encoding.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,7 @@ def test_utf8_non_utf8_third_line_error(self):
387387
b'#third\xa4\n'
388388
b'raise RuntimeError\n')
389389
self.check_script_error(src,
390-
br"'utf-8' codec can't decode byte|"
391-
br"encoding problem: utf8")
390+
br"'utf-8' codec can't decode byte")
392391

393392
def test_crlf(self):
394393
src = (b'print(ascii("""\r\n"""))\n')
@@ -540,6 +539,15 @@ def check_script_error(self, src, expected, lineno=...):
540539
line = line.removeprefix('\ufeff')
541540
self.assertIn(line.encode(), err)
542541

542+
def test_coding_spec_unknown_encoding(self):
543+
src = (b'# coding: dict-unpacking-at-home\n'
544+
b'{foo} = {"foo": "bar"}\n')
545+
self.check_script_error(src, br"unknown encoding: dict-unpacking-at-home")
546+
547+
def test_coding_spec_decode_error(self):
548+
src = (b'# coding: shift-jis\n'
549+
b'print("\xc4\x85")\n')
550+
self.check_script_error(src, br"'shift_jis' codec can't decode byte 0x85")
543551

544552

545553
if __name__ == "__main__":
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fixed the tokenizer so that it no longer shows a generic ``SyntaxError: encoding
2+
problem`` message and instead distinguishes whether a codec was not found or a file
3+
could not be decoded. Patch by Bartosz Sławecki.

Parser/tokenizer/helpers.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "errcode.h"
33
#include "pycore_token.h"
44

5+
#include "../pegen.h"
56
#include "../lexer/state.h"
67

78

@@ -419,7 +420,7 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
419420
assert(tok->decoding_readline == NULL);
420421
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
421422
_PyTokenizer_error_ret(tok);
422-
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
423+
_PyPegen_raise_tokenizer_init_error(tok->filename);
423424
PyMem_Free(cs);
424425
return 0;
425426
}

0 commit comments

Comments
 (0)