Skip to content

Commit 3ad080a

Browse files
authored
[3.14] gh-151461: Fix encoding-related exception handling in file tokenizer (GH-151462) (GH-151474)
1 parent bbaaebd commit 3ad080a

7 files changed

Lines changed: 71 additions & 53 deletions

File tree

Lib/test/test_source_encoding.py

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -387,8 +387,7 @@ def test_utf8_non_utf8_third_line_error(self):
387387
b'#third\xa4\n'
388388
b'raise RuntimeError\n')
389389
self.check_script_error(src,
390-
br"'utf-8' codec can't decode byte|"
391-
br"encoding problem: utf8")
390+
br"'utf-8' codec can't decode byte")
392391

393392
def test_crlf(self):
394393
src = (b'print(ascii("""\r\n"""))\n')
@@ -541,6 +540,20 @@ def check_script_error(self, src, expected, lineno=...):
541540
line = line.encode(sys.__stderr__.encoding, sys.__stderr__.errors)
542541
self.assertIn(line, err)
543542

543+
def test_coding_spec_unknown_encoding(self):
544+
src = (b'# coding: c1252\n'
545+
b'print("Hi!")\n')
546+
self.check_script_error(src, br"unknown encoding: c1252")
547+
548+
def test_coding_spec_decode_error(self):
549+
src = (b'# coding: shift-jis\n'
550+
b'print("\xc4\x85")\n')
551+
self.check_script_error(src, br"'shift_jis' codec can't decode byte")
552+
553+
def test_coding_spec_non_text_encoding(self):
554+
src = (b'# coding: hex_codec\n'
555+
b'print("eggs")\n')
556+
self.check_script_error(src, br"'hex_codec' is not a text encoding")
544557

545558

546559
if __name__ == "__main__":
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Fix direct execution of files with invalid source encodings to report the
2+
underlying codec lookup or decoding error instead of the generic
3+
``SyntaxError: encoding problem`` message. Patch by Bartosz Sławecki.

Parser/pegen.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99

1010
#include "lexer/lexer.h"
1111
#include "tokenizer/tokenizer.h"
12+
#include "tokenizer/helpers.h"
1213
#include "pegen.h"
1314

1415
// Internal parser functions
@@ -1002,7 +1003,7 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena
10021003
struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
10031004
if (tok == NULL) {
10041005
if (PyErr_Occurred()) {
1005-
_PyPegen_raise_tokenizer_init_error(filename_ob);
1006+
_PyTokenizer_raise_init_error(filename_ob);
10061007
return NULL;
10071008
}
10081009
return NULL;
@@ -1055,7 +1056,7 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
10551056
}
10561057
if (tok == NULL) {
10571058
if (PyErr_Occurred()) {
1058-
_PyPegen_raise_tokenizer_init_error(filename_ob);
1059+
_PyTokenizer_raise_init_error(filename_ob);
10591060
}
10601061
return NULL;
10611062
}

Parser/pegen.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,6 @@ typedef enum {
175175
} TARGETS_TYPE;
176176

177177
int _Pypegen_raise_decode_error(Parser *p);
178-
void _PyPegen_raise_tokenizer_init_error(PyObject *filename);
179178
int _Pypegen_tokenizer_error(Parser *p);
180179
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...);
181180
void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,

Parser/pegen_errors.c

Lines changed: 0 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -9,53 +9,6 @@
99

1010
// TOKENIZER ERRORS
1111

12-
void
13-
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
14-
{
15-
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
16-
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
17-
|| PyErr_ExceptionMatches(PyExc_ValueError)
18-
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
19-
return;
20-
}
21-
PyObject *errstr = NULL;
22-
PyObject *tuple = NULL;
23-
PyObject *type;
24-
PyObject *value;
25-
PyObject *tback;
26-
PyErr_Fetch(&type, &value, &tback);
27-
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
28-
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
29-
goto error;
30-
}
31-
PyErr_Restore(type, value, tback);
32-
return;
33-
}
34-
errstr = PyObject_Str(value);
35-
if (!errstr) {
36-
goto error;
37-
}
38-
39-
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
40-
if (!tmp) {
41-
goto error;
42-
}
43-
44-
tuple = PyTuple_Pack(2, errstr, tmp);
45-
Py_DECREF(tmp);
46-
if (!tuple) {
47-
goto error;
48-
}
49-
PyErr_SetObject(PyExc_SyntaxError, tuple);
50-
51-
error:
52-
Py_XDECREF(type);
53-
Py_XDECREF(value);
54-
Py_XDECREF(tback);
55-
Py_XDECREF(errstr);
56-
Py_XDECREF(tuple);
57-
}
58-
5912
static inline void
6013
raise_unclosed_parentheses_error(Parser *p) {
6114
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];

Parser/tokenizer/helpers.c

Lines changed: 49 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
#include "Python.h"
22
#include "errcode.h"
3+
#include "pycore_runtime.h" // _Py_ID()
34
#include "pycore_token.h"
45

56
#include "../lexer/state.h"
@@ -149,6 +150,53 @@ _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_inval
149150
return 0;
150151
}
151152

153+
void
154+
_PyTokenizer_raise_init_error(PyObject *filename)
155+
{
156+
if (!(PyErr_ExceptionMatches(PyExc_LookupError)
157+
|| PyErr_ExceptionMatches(PyExc_SyntaxError)
158+
|| PyErr_ExceptionMatches(PyExc_ValueError)
159+
|| PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
160+
return;
161+
}
162+
PyObject *errstr = NULL;
163+
PyObject *tuple = NULL;
164+
PyObject *type;
165+
PyObject *value;
166+
PyObject *tback;
167+
PyErr_Fetch(&type, &value, &tback);
168+
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
169+
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
170+
goto error;
171+
}
172+
PyErr_Restore(type, value, tback);
173+
return;
174+
}
175+
errstr = PyObject_Str(value);
176+
if (!errstr) {
177+
goto error;
178+
}
179+
180+
PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
181+
if (!tmp) {
182+
goto error;
183+
}
184+
185+
tuple = PyTuple_Pack(2, errstr, tmp);
186+
Py_DECREF(tmp);
187+
if (!tuple) {
188+
goto error;
189+
}
190+
PyErr_SetObject(PyExc_SyntaxError, tuple);
191+
192+
error:
193+
Py_XDECREF(type);
194+
Py_XDECREF(value);
195+
Py_XDECREF(tback);
196+
Py_XDECREF(errstr);
197+
Py_XDECREF(tuple);
198+
}
199+
152200
int
153201
_PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
154202
{
@@ -418,8 +466,8 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
418466
if (tok->encoding == NULL) {
419467
assert(tok->decoding_readline == NULL);
420468
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
469+
_PyTokenizer_raise_init_error(tok->filename);
421470
_PyTokenizer_error_ret(tok);
422-
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
423471
PyMem_Free(cs);
424472
return 0;
425473
}

Parser/tokenizer/helpers.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ int _PyTokenizer_indenterror(struct tok_state *tok);
1515
int _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char);
1616
int _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...);
1717
char *_PyTokenizer_error_ret(struct tok_state *tok);
18+
void _PyTokenizer_raise_init_error(PyObject *filename);
1819

1920
char *_PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok);
2021
char *_PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, struct tok_state *tok);

0 commit comments

Comments
 (0)