Skip to content

Commit f1a5f68

Browse files
authored
gh-151461: Fix encoding-related exception handling in file tokenizer (GH-151462)
1 parent e91f68a commit f1a5f68

7 files changed

Lines changed: 72 additions & 53 deletions

File tree

Lib/test/test_source_encoding.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,7 @@ def test_utf8_non_utf8_third_line_error(self):
387387
b'#third\xa4\n'
388388
b'raise RuntimeError\n')
389389
self.check_script_error(src,
390-
br"'utf-8' codec can't decode byte|"
391-
br"encoding problem: utf8")
390+
br"'utf-8' codec can't decode byte")
392391

393392
def test_crlf(self):
394393
src = (b'print(ascii("""\r\n"""))\n')
@@ -540,6 +539,20 @@ def check_script_error(self, src, expected, lineno=...):
540539
line = line.removeprefix('\ufeff')
541540
self.assertIn(line.encode(), err)
542541

542+
def test_coding_spec_unknown_encoding(self):
543+
src = (b'# coding: c1252\n'
544+
b'print("Hi!")\n')
545+
self.check_script_error(src, br"unknown encoding: c1252")
546+
547+
def test_coding_spec_decode_error(self):
548+
src = (b'# coding: shift-jis\n'
549+
b'print("\xc4\x85")\n')
550+
self.check_script_error(src, br"'shift_jis' codec can't decode byte")
551+
552+
def test_coding_spec_non_text_encoding(self):
553+
src = (b'# coding: hex_codec\n'
554+
b'print("eggs")\n')
555+
self.check_script_error(src, br"'hex_codec' is not a text encoding")
543556

544557

545558
if __name__ == "__main__":
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix direct execution of files with invalid source encodings to report the
2+
underlying codec lookup or decoding error instead of the generic
3+
``SyntaxError: encoding problem`` message. Patch by Bartosz Sławecki.

Parser/pegen.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "lexer/lexer.h"
1111
#include "tokenizer/tokenizer.h"
12+
#include "tokenizer/helpers.h"
1213
#include "pegen.h"
1314

1415
// Internal parser functions
@@ -993,7 +994,7 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena
993994
struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
994995
if (tok == NULL) {
995996
if (PyErr_Occurred()) {
996-
_PyPegen_raise_tokenizer_init_error(filename_ob);
997+
_PyTokenizer_raise_init_error(filename_ob);
997998
return NULL;
998999
}
9991000
return NULL;
@@ -1051,7 +1052,7 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
10511052
}
10521053
if (tok == NULL) {
10531054
if (PyErr_Occurred()) {
1054-
_PyPegen_raise_tokenizer_init_error(filename_ob);
1055+
_PyTokenizer_raise_init_error(filename_ob);
10551056
}
10561057
return NULL;
10571058
}

Parser/pegen.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ typedef enum {
174174
} TARGETS_TYPE;
175175

176176
int _Pypegen_raise_decode_error(Parser *p);
177-
void _PyPegen_raise_tokenizer_init_error(PyObject *filename);
178177
int _Pypegen_tokenizer_error(Parser *p);
179178
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...);
180179
void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,

Parser/pegen_errors.c

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -10,53 +10,6 @@
1010

1111
// TOKENIZER ERRORS
1212

13-
void
14-
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
15-
{
16-
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
17-
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
18-
|| PyErr_ExceptionMatches(PyExc_ValueError)
19-
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
20-
return;
21-
}
22-
PyObject *errstr = NULL;
23-
PyObject *tuple = NULL;
24-
PyObject *type;
25-
PyObject *value;
26-
PyObject *tback;
27-
PyErr_Fetch(&type, &value, &tback);
28-
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
29-
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
30-
goto error;
31-
}
32-
PyErr_Restore(type, value, tback);
33-
return;
34-
}
35-
errstr = PyObject_Str(value);
36-
if (!errstr) {
37-
goto error;
38-
}
39-
40-
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
41-
if (!tmp) {
42-
goto error;
43-
}
44-
45-
tuple = _PyTuple_FromPair(errstr, tmp);
46-
Py_DECREF(tmp);
47-
if (!tuple) {
48-
goto error;
49-
}
50-
PyErr_SetObject(PyExc_SyntaxError, tuple);
51-
52-
error:
53-
Py_XDECREF(type);
54-
Py_XDECREF(value);
55-
Py_XDECREF(tback);
56-
Py_XDECREF(errstr);
57-
Py_XDECREF(tuple);
58-
}
59-
6013
static inline void
6114
raise_unclosed_parentheses_error(Parser *p) {
6215
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];

Parser/tokenizer/helpers.c

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "Python.h"
22
#include "errcode.h"
3+
#include "pycore_runtime.h" // _Py_ID()
34
#include "pycore_token.h"
5+
#include "pycore_tuple.h" // _PyTuple_FromPair
46

57
#include "../lexer/state.h"
68

@@ -149,6 +151,53 @@ _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_inval
149151
return 0;
150152
}
151153

154+
void
155+
_PyTokenizer_raise_init_error(PyObject *filename)
156+
{
157+
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
158+
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
159+
|| PyErr_ExceptionMatches(PyExc_ValueError)
160+
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
161+
return;
162+
}
163+
PyObject *errstr = NULL;
164+
PyObject *tuple = NULL;
165+
PyObject *type;
166+
PyObject *value;
167+
PyObject *tback;
168+
PyErr_Fetch(&type, &value, &tback);
169+
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
170+
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
171+
goto error;
172+
}
173+
PyErr_Restore(type, value, tback);
174+
return;
175+
}
176+
errstr = PyObject_Str(value);
177+
if (!errstr) {
178+
goto error;
179+
}
180+
181+
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
182+
if (!tmp) {
183+
goto error;
184+
}
185+
186+
tuple = _PyTuple_FromPair(errstr, tmp);
187+
Py_DECREF(tmp);
188+
if (!tuple) {
189+
goto error;
190+
}
191+
PyErr_SetObject(PyExc_SyntaxError, tuple);
192+
193+
error:
194+
Py_XDECREF(type);
195+
Py_XDECREF(value);
196+
Py_XDECREF(tback);
197+
Py_XDECREF(errstr);
198+
Py_XDECREF(tuple);
199+
}
200+
152201
int
153202
_PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
154203
{
@@ -418,8 +467,8 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
418467
if (tok->encoding == NULL) {
419468
assert(tok->decoding_readline == NULL);
420469
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
470+
_PyTokenizer_raise_init_error(tok->filename);
421471
_PyTokenizer_error_ret(tok);
422-
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
423472
PyMem_Free(cs);
424473
return 0;
425474
}

Parser/tokenizer/helpers.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ int _PyTokenizer_indenterror(struct tok_state *tok);
1515
int _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char);
1616
int _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...);
1717
char *_PyTokenizer_error_ret(struct tok_state *tok);
18+
void _PyTokenizer_raise_init_error(PyObject *filename);
1819

1920
char *_PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok);
2021
char *_PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, struct tok_state *tok);

0 commit comments

Comments
 (0)