Skip to content

Commit e62a2b3

Browse files
committed
📝 json: address review on ensure_ascii=False speedup
Move the new escape-size test to the end of TestUnicode and adopt the reviewer's clearer is_optimized/need_escape structure, asserting exact encoder output. Control characters use the JSON \uXXXX form (json.dumps never emits caret notation), keeping the test green under both encoders. Record the measured speedup in the NEWS entry: ~1.5x for long ASCII or Latin-1 strings (1.25x at 1k chars, 1.55x at 100k).
1 parent 466e2d7 commit e62a2b3

2 files changed

Lines changed: 39 additions & 27 deletions

File tree

‎Lib/test/test_json/test_unicode.py‎

Lines changed: 36 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -39,31 +39,6 @@ def test_ascii_non_printable_encode(self):
3939
self.assertEqual(self.dumps(u, ensure_ascii=False),
4040
'"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')
4141

42-
def test_ensure_ascii_false_long_string_paths(self):
43-
# Exercise the encoder's escape-size scan for ensure_ascii=False over
44-
# long runs that cross the 8-byte scan windows and the short-string
45-
# guard: a special character at every offset, in 1-byte (ASCII and
46-
# Latin-1) and wider (BMP, astral) strings.
47-
dumps, loads = self.dumps, self.loads
48-
for n in range(40):
49-
run = "a" * n
50-
for tail in ('"', "\\", "\n", "\x00", "\x1f", "\x7f", "\xe9",
51-
"中", "\U0001f600"):
52-
s = run + tail + "tail"
53-
self.assertEqual(loads(dumps(s, ensure_ascii=False)), s)
54-
# The no-escape fast path returns the string verbatim between quotes,
55-
# including kept-as-is Latin-1 and 0x7f.
56-
for s in ("x" * 20, "\xe9" * 20, "kept latin1 \xe9\xff \x7f text " * 3):
57-
self.assertEqual(dumps(s, ensure_ascii=False), '"' + s + '"')
58-
# The structural escapes and control chars are still escaped after a
59-
# long no-escape run.
60-
self.assertEqual(dumps("a" * 20 + '"', ensure_ascii=False),
61-
'"' + "a" * 20 + '\\""')
62-
self.assertEqual(dumps("a" * 20 + "\\", ensure_ascii=False),
63-
'"' + "a" * 20 + '\\\\"')
64-
self.assertEqual(dumps("a" * 20 + "\x01", ensure_ascii=False),
65-
'"' + "a" * 20 + '\\u0001"')
66-
6742
def test_ascii_non_printable_decode(self):
6843
self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
6944
'\b\t\n\f\r')
@@ -158,6 +133,42 @@ def test_object_pairs_hook_with_unicode(self):
158133
object_hook = lambda x: None),
159134
OrderedDict(p))
160135

136+
def test_ensure_ascii_false_long_string_paths(self):
    """Pin the exact ensure_ascii=False output around the 8-byte scan windows.

    Every case asserts the exact encoded text and that decoding it gives
    back the original string.  Each case runs in its own subTest so a
    failure reports the offending input instead of only the first mismatch
    of a several-hundred-case loop.
    """
    # Cover the SWAR scan in _json escape_size(): it inspects eight bytes
    # per iteration, so exercise runs that cross the 8-byte windows and the
    # short-string guard with a special character at every offset.
    dumps, loads = self.dumps, self.loads

    def check(s, expected):
        # Exact encoder output plus a decode round trip for one input.
        with self.subTest(s=s):
            encoded = dumps(s, ensure_ascii=False)
            self.assertEqual(encoded, expected)
            self.assertEqual(loads(encoded), s)

    def check_verbatim(s):
        # The no-escape fast path returns the string verbatim in quotes.
        check(s, f'"{s}"')

    # Characters that must be escaped, paired with their JSON escape.
    escaped_chars = (('"', '\\"'), ("\\", "\\\\"), ("\n", "\\n"),
                     ("\x00", "\\u0000"), ("\x1f", "\\u001f"))
    # Characters that are emitted as-is when ensure_ascii is false.
    verbatim_chars = ("\x7f", "\xe9", "中", "\U0001f600")

    # Characters that are kept as-is, including Latin-1 and 0x7f, stay
    # verbatim in both short and repeated (longer) strings.
    for s in ("abc", "\xe9", "kept latin1 \xe9\xff \x7f text"):
        check_verbatim(s)
        check_verbatim(s * 8)

    # Place each special character after 0..39 plain characters so it lands
    # at every offset relative to the scan windows.
    tail = "tail"
    for n in range(40):
        run = "a" * n
        for char, escaped in escaped_chars:
            check(run + char + tail, f'"{run}{escaped}{tail}"')
        for char in verbatim_chars:
            check_verbatim(run + char + tail)

    # Structural escapes and control characters are still escaped after a
    # long no-escape run.
    base = "a" * 20
    for char, escaped in (('"', '\\"'), ("\\", "\\\\"), ("\x01", "\\u0001")):
        check(base + char, f'"{base}{escaped}"')
171+
161172

162173
# NOTE(review): presumably runs the shared TestUnicode cases against the pure-Python json implementation — confirm against PyTest.
class TestPyUnicode(TestUnicode, PyTest): pass
163174
# NOTE(review): presumably runs the shared TestUnicode cases against the C-accelerated json implementation — confirm against CTest.
class TestCUnicode(TestUnicode, CTest): pass
Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
22
long runs of characters that need no escaping, by scanning eight bytes at a
3-
time. Short strings, strings that need escaping, and strings with characters
4-
above U+00FF are unaffected. Patch by Bernát Gábor.
3+
time (roughly 1.5x faster for long ASCII or Latin-1 strings). Short strings,
4+
strings that need escaping, and strings with characters above U+00FF are
5+
unaffected. Patch by Bernát Gábor.

0 commit comments

Comments
 (0)