@@ -39,31 +39,6 @@ def test_ascii_non_printable_encode(self):
3939 self .assertEqual (self .dumps (u , ensure_ascii = False ),
4040 '"\\ b\\ t\\ n\\ f\\ r\\ u0000\\ u001f\x7f "' )
4141
42- def test_ensure_ascii_false_long_string_paths (self ):
43- # Exercise the encoder's escape-size scan for ensure_ascii=False over
44- # long runs that cross the 8-byte scan windows and the short-string
45- # guard: a special character at every offset, in 1-byte (ASCII and
46- # Latin-1) and wider (BMP, astral) strings.
47- dumps , loads = self .dumps , self .loads
48- for n in range (40 ):
49- run = "a" * n
50- for tail in ('"' , "\\ " , "\n " , "\x00 " , "\x1f " , "\x7f " , "\xe9 " ,
51- "ä¸" , "\U0001f600 " ):
52- s = run + tail + "tail"
53- self .assertEqual (loads (dumps (s , ensure_ascii = False )), s )
54- # The no-escape fast path returns the string verbatim between quotes,
55- # including kept-as-is Latin-1 and 0x7f.
56- for s in ("x" * 20 , "\xe9 " * 20 , "kept latin1 \xe9 \xff \x7f text " * 3 ):
57- self .assertEqual (dumps (s , ensure_ascii = False ), '"' + s + '"' )
58- # The structural escapes and control chars are still escaped after a
59- # long no-escape run.
60- self .assertEqual (dumps ("a" * 20 + '"' , ensure_ascii = False ),
61- '"' + "a" * 20 + '\\ ""' )
62- self .assertEqual (dumps ("a" * 20 + "\\ " , ensure_ascii = False ),
63- '"' + "a" * 20 + '\\ \\ "' )
64- self .assertEqual (dumps ("a" * 20 + "\x01 " , ensure_ascii = False ),
65- '"' + "a" * 20 + '\\ u0001"' )
66-
6742 def test_ascii_non_printable_decode (self ):
6843 self .assertEqual (self .loads ('"\\ b\\ t\\ n\\ f\\ r"' ),
6944 '\b \t \n \f \r ' )
@@ -158,6 +133,42 @@ def test_object_pairs_hook_with_unicode(self):
158133 object_hook = lambda x : None ),
159134 OrderedDict (p ))
160135
136+ def test_ensure_ascii_false_long_string_paths (self ):
137+ # Cover the SWAR scan in _json escape_size(): it inspects eight bytes
138+ # per iteration, so exercise runs that cross the 8-byte windows and the
139+ # short-string guard with a special character at every offset.
140+ dumps , loads = self .dumps , self .loads
141+
142+ def is_optimized (s ):
143+ # The no-escape fast path returns the string verbatim in quotes.
144+ self .assertEqual (dumps (s , ensure_ascii = False ), f'"{ s } "' )
145+
146+ # Bytes that are kept as-is, including Latin-1 and 0x7f, stay verbatim.
147+ for s in ("abc" , "\xe9 " , "kept latin1 \xe9 \xff \x7f text" ):
148+ is_optimized (s )
149+ is_optimized (s * 8 )
150+
151+ def need_escape (s , expected ):
152+ encoded = dumps (s , ensure_ascii = False )
153+ self .assertEqual (encoded , expected )
154+ self .assertEqual (loads (encoded ), s )
155+
156+ tail = "tail"
157+ for n in range (40 ):
158+ run = "a" * n
159+ for char , escaped in (('"' , '\\ "' ), ("\\ " , "\\ \\ " ), ("\n " , "\\ n" ),
160+ ("\x00 " , "\\ u0000" ), ("\x1f " , "\\ u001f" )):
161+ need_escape (run + char + tail , f'"{ run } { escaped } { tail } "' )
162+ for char in ("\x7f " , "\xe9 " , "ä¸" , "\U0001f600 " ):
163+ s = run + char + tail
164+ need_escape (s , f'"{ s } "' )
165+
166+ # Structural escapes and control characters are still escaped after a
167+ # long no-escape run.
168+ base = "a" * 20
169+ for char , escaped in (('"' , '\\ "' ), ("\\ " , "\\ \\ " ), ("\x01 " , "\\ u0001" )):
170+ need_escape (base + char , f'"{ base } { escaped } "' )
171+
161172
# Concrete runner: binds the shared TestUnicode cases to the PyTest mixin
# (presumably the pure-Python json implementation -- confirm in the package).
class TestPyUnicode(TestUnicode, PyTest):
    pass
# Concrete runner: binds the shared TestUnicode cases to the CTest mixin
# (presumably the C-accelerated json implementation -- confirm in the package).
class TestCUnicode(TestUnicode, CTest):
    pass
0 commit comments