📝 json: address review on ensure_ascii=False speedup

gaborbernat · gaborbernat · commit e62a2b39d014 · 2026-06-10T10:10:11.000-07:00
Move the new escape-size test to the end of TestUnicode and adopt the
reviewer's clearer is_optimized/need_escape structure, asserting exact
encoder output. Control characters that have no short escape use the
JSON \uXXXX form (json.dumps never emits caret notation), keeping the
test green under both the Python and C encoders.

Record the measured speedup in the NEWS entry: ~1.5x for long ASCII or
Latin-1 strings (1.25x at 1k chars, 1.55x at 100k).
diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py
@@ -39,31 +39,6 @@ def test_ascii_non_printable_encode(self):
         self.assertEqual(self.dumps(u, ensure_ascii=False),
                          '"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')
 
-    def test_ensure_ascii_false_long_string_paths(self):
-        # Exercise the encoder's escape-size scan for ensure_ascii=False over
-        # long runs that cross the 8-byte scan windows and the short-string
-        # guard: a special character at every offset, in 1-byte (ASCII and
-        # Latin-1) and wider (BMP, astral) strings.
-        dumps, loads = self.dumps, self.loads
-        for n in range(40):
-            run = "a" * n
-            for tail in ('"', "\\", "\n", "\x00", "\x1f", "\x7f", "\xe9",
-                         "中", "\U0001f600"):
-                s = run + tail + "tail"
-                self.assertEqual(loads(dumps(s, ensure_ascii=False)), s)
-        # The no-escape fast path returns the string verbatim between quotes,
-        # including kept-as-is Latin-1 and 0x7f.
-        for s in ("x" * 20, "\xe9" * 20, "kept latin1 \xe9\xff \x7f text " * 3):
-            self.assertEqual(dumps(s, ensure_ascii=False), '"' + s + '"')
-        # The structural escapes and control chars are still escaped after a
-        # long no-escape run.
-        self.assertEqual(dumps("a" * 20 + '"', ensure_ascii=False),
-                         '"' + "a" * 20 + '\\""')
-        self.assertEqual(dumps("a" * 20 + "\\", ensure_ascii=False),
-                         '"' + "a" * 20 + '\\\\"')
-        self.assertEqual(dumps("a" * 20 + "\x01", ensure_ascii=False),
-                         '"' + "a" * 20 + '\\u0001"')
-
     def test_ascii_non_printable_decode(self):
         self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
                          '\b\t\n\f\r')
@@ -158,6 +133,42 @@ def test_object_pairs_hook_with_unicode(self):
                                     object_hook = lambda x: None),
                          OrderedDict(p))
 
+    def test_ensure_ascii_false_long_string_paths(self):
+        # Cover the SWAR scan in _json escape_size(): it inspects eight bytes
+        # per iteration, so exercise runs that cross the 8-byte windows and the
+        # short-string guard with a special character at every offset.
+        dumps, loads = self.dumps, self.loads
+
+        def is_optimized(s):
+            # Assert s is emitted verbatim in quotes (the no-escape fast path).
+            self.assertEqual(dumps(s, ensure_ascii=False), f'"{s}"')
+
+        # Characters kept as-is, including Latin-1 and 0x7f, stay verbatim.
+        for s in ("abc", "\xe9", "kept latin1 \xe9\xff \x7f text"):
+            is_optimized(s)
+            is_optimized(s * 8)
+
+        def need_escape(s, expected):  # exact output and loads() round-trip
+            encoded = dumps(s, ensure_ascii=False)
+            self.assertEqual(encoded, expected)
+            self.assertEqual(loads(encoded), s)
+
+        tail = "tail"
+        for n in range(40):
+            run = "a" * n
+            for char, escaped in (('"', '\\"'), ("\\", "\\\\"), ("\n", "\\n"),
+                                  ("\x00", "\\u0000"), ("\x1f", "\\u001f")):
+                need_escape(run + char + tail, f'"{run}{escaped}{tail}"')
+            for char in ("\x7f", "\xe9", "中", "\U0001f600"):  # kept verbatim
+                s = run + char + tail
+                need_escape(s, f'"{s}"')
+
+        # Structural escapes and control characters are still escaped after a
+        # long no-escape run.
+        base = "a" * 20
+        for char, escaped in (('"', '\\"'), ("\\", "\\\\"), ("\x01", "\\u0001")):
+            need_escape(base + char, f'"{base}{escaped}"')
+
 
 class TestPyUnicode(TestUnicode, PyTest): pass
 class TestCUnicode(TestUnicode, CTest): pass
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
@@ -1,4 +1,5 @@
 Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
 long runs of characters that need no escaping, by scanning eight bytes at a
-time. Short strings, strings that need escaping, and strings with characters
-above U+00FF are unaffected. Patch by Bernát Gábor.
+time (roughly 1.5x faster for long ASCII or Latin-1 strings). Short strings,
+strings that need escaping, and strings with characters above U+00FF are
+unaffected. Patch by Bernát Gábor.