diff --git a/java/src/json/ext/Parser.java b/java/src/json/ext/Parser.java
index c54d1a7f..3d9b64fa 100644
--- a/java/src/json/ext/Parser.java
+++ b/java/src/json/ext/Parser.java
@@ -68,6 +68,18 @@ public class Parser extends RubyObject {
     private static final String CONST_INFINITY = "Infinity";
     private static final String CONST_MINUS_INFINITY = "MinusInfinity";
 
+    // System property controlling UTF-8 validation of parsed JSON strings.
+    // It defaults to "true"; validation stays enabled only for the values
+    // "true" (case-insensitive) and "1", anything else disables it.
+    private static final String VALIDATE_UTF8_STRINGS = "jruby.json.validateUTF8Strings";
+    private static final String VALIDATE_UTF8_STRINGS_DEFAULT = "true";
+    private static final boolean validateUTF8Strings;
+
+    static {
+        String enable = System.getProperty(VALIDATE_UTF8_STRINGS, VALIDATE_UTF8_STRINGS_DEFAULT);
+        validateUTF8Strings = "true".equalsIgnoreCase(enable) || "1".equals(enable);
+    }
+
     static final ObjectAllocator ALLOCATOR = Parser::new;
 
     public Parser(Ruby runtime, RubyClass metaClass) {
@@ -681,12 +693,10 @@ private IRubyObject parseString(boolean isName) {
         final byte[] data = this.data;
         final int contentStart = cursor + 1; // skip opening quote
 
-        // The scanner finds the closing quote and reports whether the body
-        // is plain printable ASCII (no escape, no ASCII control character,
-        // no non-ASCII byte). Anything non-plain is handed to StringDecoder,
-        // which performs the UTF-8/control validation, escape expansion, and
-        // error reporting.
-        long scanned = scanner.scan(data, chunks, contentStart, end);
+        // The scanner finds the closing quote and reports, through PLAIN_BIT
+        // and ASCII_BIT, whether the body is plain and whether it is ASCII-only.
+        // Anything non-plain is handed to decodeString.
+        long scanned = scanner.scan(data, chunks, contentStart, end, validateUTF8Strings);
         final int q = (int) scanned;
         if (q < 0) {
             throw newException(Utils.M_PARSER_ERROR,
@@ -694,6 +704,8 @@ private IRubyObject parseString(boolean isName) {
         }
 
         boolean plain = (scanned & StringScanner.PLAIN_BIT) != 0;
+        boolean isAscii = (scanned & StringScanner.ASCII_BIT) != 0;
+
         cursor = q + 1; // past closing quote
 
         // Note: When running multiple read-world benchmarks in the same JVM,
@@ -713,8 +725,7 @@ private IRubyObject parseString(boolean isName) {
                 off = contentStart;
                 len = q - contentStart;
             } else {
-                ByteList decoded = decoder.decode(context, byteList,
-                        contentStart - begin, q - begin);
+                ByteList decoded = decodeString(context, byteList, contentStart - begin, q - begin, isAscii);
                 buf = decoded.getUnsafeBytes();
                 off = decoded.begin();
                 len = decoded.realSize();
@@ -731,8 +742,7 @@ private IRubyObject parseString(boolean isName) {
                 string = RubyString.newString(context.runtime, data, contentStart, q - contentStart,
                         UTF8Encoding.INSTANCE);
             } else {
-                ByteList content = decoder.decode(context, byteList,
-                        contentStart - begin, q - begin);
+                ByteList content = decodeString(context, byteList, contentStart - begin, q - begin, isAscii);
                 string = context.runtime.newString(content);
                 string.setEncoding(UTF8Encoding.INSTANCE);
                 string.clearCodeRange();
@@ -745,6 +755,18 @@ private IRubyObject parseString(boolean isName) {
         return string;
     }
 
+    /**
+     * Decodes the string body {@code byteList[start..end)}. UTF-8 validation
+     * is skipped when the scanner reported the body as ASCII-only or when
+     * validation has been disabled; otherwise the validating decoder is used.
+     */
+    private ByteList decodeString(ThreadContext context, ByteList byteList, int start, int end, boolean isAscii) {
+        if (isAscii || !validateUTF8Strings) {
+            return decoder.decodeNoValidate(context, byteList, start, end, chunks);
+        }
+        return decoder.decode(context, byteList, start, end);
+    }
+
     private static boolean isLetter(byte b) {
         return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z');
     }
diff --git a/java/src/json/ext/StringDecoder.java b/java/src/json/ext/StringDecoder.java
index 28874acd..e35638a4 100644
--- a/java/src/json/ext/StringDecoder.java
+++ b/java/src/json/ext/StringDecoder.java
@@ -10,6 +10,7 @@
 import org.jruby.util.ByteList;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
 
 /**
  * A decoder that reads a JSON-encoded string from the given sources and
@@ -25,6 +26,8 @@ final class StringDecoder extends ByteListTranscoder {
     private boolean allowControlCharacters = false;
     private boolean allowInvalidEscape = false;
 
+    private final StringScanner scanner = StringScanner.getInstance();
+
     private ByteList out;
 
     // Array used for writing multibyte characters into the buffer at once
@@ -49,6 +52,60 @@ ByteList decode(ThreadContext context, ByteList src, int start, int end) {
         }
     }
 
+    /**
+     * Decodes {@code src[start..end)} without validating UTF-8. The caller must
+     * have established that the body is ASCII-only or that UTF-8 validation has
+     * been disabled. Runs of ordinary bytes are copied verbatim; only
+     * backslashes and ASCII control characters are examined.
+     *
+     * @param chunks must be a little-endian {@link ByteBuffer} view of the
+     *               backing array of {@code src}; it is read by
+     *               {@link StringScanner#scanEscape}
+     */
+    ByteList decodeNoValidate(ThreadContext context, ByteList src, int start, int end, ByteBuffer chunks) {
+        try {
+            init(src, start, end);
+            this.out = new ByteList(end - start);
+            final byte[] data = src.unsafeBytes();
+            final int base = src.begin();
+            final int absEnd = base + srcEnd;
+            int cursor = base + pos;
+            int runStart = cursor;
+            while (cursor < absEnd) {
+                cursor = scanner.scanEscape(data, chunks, cursor, absEnd);
+                if (cursor >= absEnd) {
+                    break;
+                }
+                int b = Byte.toUnsignedInt(data[cursor]);
+                if (b == '\\') {
+                    if (cursor > runStart) {
+                        append(data, runStart, cursor - runStart);
+                    }
+                    charStart = cursor - base;
+                    pos = charStart + 1;
+                    // NOTE(review): the escaped output is not part of any run
+                    // tracked here, so this presumes handleEscapeSequence appends
+                    // its result to `out` itself -- confirm for every escape kind.
+                    handleEscapeSequence(context);
+                    cursor = base + pos;
+                    runStart = cursor;
+                } else {
+                    // scanEscape stops only on a backslash or a byte below 0x20.
+                    if (!allowControlCharacters) {
+                        charStart = cursor - base;
+                        throw invalidControlChar(context);
+                    }
+                    cursor++;
+                }
+            }
+            if (absEnd > runStart) {
+                append(data, runStart, absEnd - runStart);
+            }
+            return out;
+        } catch (IOException e) {
+            throw context.runtime.newIOErrorFromException(e);
+        }
+    }
+
     @Override
     protected int readUtf8Char(ThreadContext context) {
         int c = super.readUtf8Char(context);
diff --git a/java/src/json/ext/StringScanner.java b/java/src/json/ext/StringScanner.java
index 5ca59d17..605c0ad9 100644
--- a/java/src/json/ext/StringScanner.java
+++ b/java/src/json/ext/StringScanner.java
@@ -16,6 +16,9 @@ class StringScanner {
     /** Set in the returned bits when the whole body is plain printable ASCII. */
     static final long PLAIN_BIT = 1L << 32;
 
+    /** Set in the returned bits when the scan saw no non-ASCII byte in the body. */
+    static final long ASCII_BIT = 1L << 33;
+
     /** Returned when no closing quote is found before {@code end}. */
     static final long NOT_FOUND = -1L;
 
@@ -54,31 +57,59 @@ static StringScanner getInstance() {
     }
 
     /**
-     * Scans {@code data[start..end)} for the closing quote, honouring backslash
-     * escapes.
+     * Scans {@code data[start..end)} for a closing quote. This method will
+     * also report if there are any interesting bytes within the range
+     * {@code data[start..end)}. An interesting byte is a control character,
+     * a backslash or a byte with the high bit set.
+     *
+     * @param chunks a little-endian {@link ByteBuffer} over {@code data}
+     * @param validateUtf8 when {@code true}, a non-ASCII byte clears
+     *               {@link #PLAIN_BIT}; when {@code false}, non-ASCII bytes
+     *               leave {@link #PLAIN_BIT} untouched
      *
-     * @param chunks a little-endian {@link ByteBuffer} over {@code data}, used
-     *               for the 8-byte SWAR reads (the vectorized subclass reads
-     *               {@code data} directly and ignores it).
      * @return packed result: the low 32 bits hold the index of the closing
      *         quote, or {@code -1} ({@link #NOT_FOUND}) when none is found
      *         before {@code end}; {@link #PLAIN_BIT} is set when the entire body
      *         is plain printable ASCII (no escape, no ASCII control character,
-     *         and no non-ASCII byte) and can be copied verbatim.
+     *         and no non-ASCII byte) and can be copied verbatim;
+     *         {@link #ASCII_BIT} is set when the body contains no non-ASCII byte
+     *         (it may still hold escapes or control characters). With
+     *         {@code validateUtf8} disabled, chunks holding non-ASCII bytes can
+     *         be skipped unexamined, so {@link #ASCII_BIT} may remain set for a
+     *         body that does contain such bytes.
      */
-    long scan(byte[] data, ByteBuffer chunks, int start, int end) {
+    long scan(byte[] data, ByteBuffer chunks, int start, int end, boolean validateUtf8) {
         int p = start;
         boolean plain = true;
+        boolean ascii = true;
 
         outer:
         while (true) {
             // SWAR: skip 8-byte chunks that contain nothing interesting.
             while (p + 8 <= end) {
                 long x = chunks.getLong(p);
-                // Due to the byte-by-byte handling if we match an interesting byte,
-                // if we already know this is a non-ASCII-only string, we simply
-                // look for quotes and backslashes.
-                long m = plain ? stringScanMask(x) : quoteBackslashMask(x);
+                // Pick the cheapest mask that still observes every transition we
+                // still care about:
+                //   plain             -> control, quote, backslash and high bytes
+                //   non-plain, ASCII  -> quote, backslash and high bytes, so the
+                //                        first non-ASCII byte still stops us and
+                //                        clears ASCII_BIT
+                //   non-plain, !ASCII -> quote and backslash only; multi-byte
+                //                        UTF-8 is skipped eight bytes at a time
+                //
+                // When UTF-8 validation is disabled, non-ASCII bytes are copied
+                // verbatim, so they stay on the plain fast path and the high-bit
+                // term drops out of every mask: plain scans for control, quote
+                // and backslash; non-plain scans for quote and backslash only.
+                long m;
+                if (validateUtf8) {
+                    m = plain ? stringScanMask(x)
+                        : ascii ? quoteBackslashHighMask(x)
+                        : quoteBackslashMask(x);
+                } else {
+                    m = plain ? controlQuoteBackslashMask(x)
+                        : quoteBackslashMask(x);
+                }
                 if (m == 0) {
                     p += 8;
                 } else {
@@ -90,24 +121,63 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
             while (p < end) {
                 int b = data[p] & 0xFF;
                 if (b == '"') {
-                    return ((long) p) | (plain ? PLAIN_BIT : 0L);
+                    return ((long) p) | (plain ? PLAIN_BIT : 0L) | (ascii ? ASCII_BIT : 0L);
                 }
                 if (b == '\\') {
                     plain = false;
                     p += 2; // skip the backslash and the escaped byte
                     continue outer;
                 }
-                if (b < 0x20 || b >= 0x80) {
+                if (b < 0x20) {
                     plain = false;
                     p++;
                     continue outer;
                 }
+                if (b >= 0x80) {
+                    ascii = false;
+                    if (validateUtf8) {
+                        plain = false;
+                    }
+                    p++;
+                    continue outer;
+                }
                 p++;
             }
             return NOT_FOUND;
         }
     }
 
+    /**
+     * Scans {@code data[start..end)} for the next backslash or control character.
+     *
+     * <p>The caller must guarantee that {@code data[start..end)} contains
+     * no non-ASCII bytes that need to be decoded.</p>
+     *
+     * @param chunks a little-endian {@link ByteBuffer} over {@code data}
+     * @return the index of the first backslash or control byte, or {@code end}
+     *         when none is found.
+     */
+    int scanEscape(byte[] data, ByteBuffer chunks, int start, int end) {
+        int p = start;
+        while (p + 8 <= end) {
+            long x = chunks.getLong(p);
+            long m = backslashControlMask(x);
+            if (m == 0) {
+                p += 8;
+            } else {
+                return p + (Long.numberOfTrailingZeros(m) >>> 3);
+            }
+        }
+        while (p < end) {
+            int b = data[p] & 0xFF;
+            if (b == '\\' || b < 0x20) {
+                return p;
+            }
+            p++;
+        }
+        return end;
+    }
+
     /**
      * Returns a mask whose high bit (0x80) is set in every lane of {@code x}
      * that needs scalar attention: an ASCII control character (< 0x20), a
@@ -124,11 +194,39 @@ private static long stringScanMask(long x) {
         return (control | high | quote | bslash) & HIGH_BITS;
     }
 
+    /**
+     * Like {@link #quoteBackslashMask} but for the decode fast path: flags
+     * backslashes and ASCII control characters (< 0x20). The {@code & ~x}
+     * term clears every lane whose byte is >= 0x80, so non-ASCII bytes are
+     * never reported as control characters.
+     */
+    private static long backslashControlMask(long x) {
+        long control = (x - SPACES) & ~x; // bytes < 0x20 (ASCII)
+        long s = x ^ BACKSLASHES;
+        long bslash = (s - ONES) & ~s;
+        return (control | bslash) & HIGH_BITS;
+    }
+
+    /**
+     * Like {@link #stringScanMask} but omits the non-ASCII (high-bit) term:
+     * flags double quotes, backslashes and ASCII control characters (< 0x20)
+     * only. Used as the starting mask when UTF-8 validation is disabled, where
+     * non-ASCII bytes are copied verbatim and so stay on the plain fast path.
+     */
+    private static long controlQuoteBackslashMask(long x) {
+        long control = (x - SPACES) & ~x; // bytes < 0x20 (ASCII)
+        long q = x ^ DOUBLE_QUOTES;
+        long quote = (q - ONES) & ~q;
+        long s = x ^ BACKSLASHES;
+        long bslash = (s - ONES) & ~s;
+        return (control | quote | bslash) & HIGH_BITS;
+    }
+
     /**
      * Like {@link #stringScanMask} but only flags double quotes and backslashes.
-     * Used once a string is known to require the decoder, so the remaining scan
-     * for the closing quote still skips clean chunks (including multi-byte
-     * UTF-8) eight bytes at a time.
+     * Used once a string is known to require the decoder and to already
+     * contain non-ASCII bytes, so the remaining scan for the closing quote skips
+     * clean chunks (including multi-byte UTF-8) eight bytes at a time.
      */
     private static long quoteBackslashMask(long x) {
         long q = x ^ DOUBLE_QUOTES;
@@ -137,4 +235,19 @@ private static long quoteBackslashMask(long x) {
         long bslash = (s - ONES) & ~s;
         return (quote | bslash) & HIGH_BITS;
     }
+
+    /**
+     * Like {@link #quoteBackslashMask} but also flags non-ASCII bytes. Used once
+     * a string is known to require the decoder but is still ASCII-only, so the
+     * scan keeps skipping printable runs eight bytes at a time yet still stops on
+     * the first high byte and clears {@link #ASCII_BIT}.
+     */
+    private static long quoteBackslashHighMask(long x) {
+        long high = x; // bit 0x80 set iff non-ASCII
+        long q = x ^ DOUBLE_QUOTES;
+        long quote = (q - ONES) & ~q;
+        long s = x ^ BACKSLASHES;
+        long bslash = (s - ONES) & ~s;
+        return (high | quote | bslash) & HIGH_BITS;
+    }
 }
diff --git a/java/src/json/ext/VectorizedStringScanner.java b/java/src/json/ext/VectorizedStringScanner.java
index 8a9c1e3e..b63544fe 100644
--- a/java/src/json/ext/VectorizedStringScanner.java
+++ b/java/src/json/ext/VectorizedStringScanner.java
@@ -12,23 +12,37 @@ final class VectorizedStringScanner extends StringScanner {
     private static final ByteVector ZERO = ByteVector.zero(SP);
     private static final ByteVector TWO = ByteVector.broadcast(SP, 2);
     private static final ByteVector THIRTY_THREE = ByteVector.broadcast(SP, 33);
+    private static final ByteVector SPACE = ByteVector.broadcast(SP, 0x20);
     private static final ByteVector BACKSLASH = ByteVector.broadcast(SP, '\\');
     private static final ByteVector DQUOTE = ByteVector.broadcast(SP, '"');
 
     @Override
-    long scan(byte[] data, ByteBuffer chunks, int start, int end) {
+    long scan(byte[] data, ByteBuffer chunks, int start, int end, boolean validateUtf8) {
         final int width = SP.length();
         int p = start;
         boolean plain = true;
+        boolean ascii = true;
 
-        // The same structure as the StringEncoder.  The logic is
+        // The same structure as the StringEncoder. The logic is
         // duplicated for maximum inlining.
+        //
+        // When UTF-8 validation is disabled, non-ASCII lanes are copied verbatim
+        // and stay on the plain fast path, so the high-bit term drops out: plain
+        // scans for control, quote and backslash; non-plain for quote and
+        // backslash only.
         outer:
         while (true) {
             while (p + width <= end) {
                 ByteVector chunk = ByteVector.fromArray(SP, data, p);
-                VectorMask<Byte> interesting =
-                    plain ? interestingLanes(chunk) : quoteOrBackslashLanes(chunk);
+                VectorMask<Byte> interesting;
+                if (validateUtf8) {
+                    interesting = plain ? interestingLanes(chunk)
+                        : ascii ? quoteBackslashOrHighLanes(chunk)
+                        : quoteOrBackslashLanes(chunk);
+                } else {
+                    interesting = plain ? controlQuoteBackslashLanes(chunk)
+                        : quoteOrBackslashLanes(chunk);
+                }
                 if (interesting.anyTrue()) {
                     p += interesting.firstTrue();
                     break;
@@ -38,14 +52,22 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
             while (p < end) {
                 int b = data[p] & 0xFF;
                 if (b == '"') {
-                    return ((long) p) | (plain ? PLAIN_BIT : 0L);
+                    return ((long) p) | (plain ? PLAIN_BIT : 0L) | (ascii ? ASCII_BIT : 0L);
                 }
                 if (b == '\\') {
                     plain = false;
                     p += 2; // skip the backslash and the escaped byte
                     continue outer;
                 }
-                if (b < 0x20 || b >= 0x80) {
+                if (b >= 0x80) {
+                    ascii = false;
+                    if (validateUtf8) {
+                        plain = false;
+                    }
+                    p++;
+                    continue outer;
+                }
+                if (b < 0x20) {
                     plain = false;
                     p++;
                     continue outer;
@@ -56,6 +78,33 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
         }
     }
 
+    // Vector-width counterpart of StringScanner.scanEscape. Any tail shorter
+    // than one vector is delegated to the inherited implementation.
+    @Override
+    int scanEscape(byte[] data, ByteBuffer chunks, int start, int end) {
+        final int width = SP.length();
+        if (start + width > end) {
+            return super.scanEscape(data, chunks, start, end);
+        }
+        int p = start;
+        while (p + width <= end) {
+            ByteVector chunk = ByteVector.fromArray(SP, data, p);
+            VectorMask<Byte> interesting = escapeOrControlLanes(chunk);
+            if (interesting.anyTrue()) {
+                return p + interesting.firstTrue();
+            }
+            p += width;
+        }
+        return super.scanEscape(data, chunks, p, end);
+    }
+
+    // Lanes that are ASCII control characters (< 0x20) or backslashes.
+    private static VectorMask<Byte> escapeOrControlLanes(ByteVector chunk) {
+        VectorMask<Byte> negative = chunk.lt(ZERO);
+        VectorMask<Byte> control = chunk.lt(SPACE).andNot(negative);
+        return control.or(chunk.eq(BACKSLASH));
+    }
+
     // Lanes that are control characters, double quotes, backslashes or non-ASCII.
     private static VectorMask<Byte> interestingLanes(ByteVector chunk) {
         VectorMask<Byte> negative = chunk.lt(ZERO);
@@ -69,4 +118,20 @@ private static VectorMask<Byte> interestingLanes(ByteVector chunk) {
     private static VectorMask<Byte> quoteOrBackslashLanes(ByteVector chunk) {
         return chunk.eq(DQUOTE).or(chunk.eq(BACKSLASH));
     }
+
+    // Like interestingLanes but without the non-ASCII (high-bit) term: flags
+    // ASCII control characters, double quotes and backslashes only. Used as the
+    // plain-path mask when UTF-8 validation is disabled.
+    private static VectorMask<Byte> controlQuoteBackslashLanes(ByteVector chunk) {
+        VectorMask<Byte> negative = chunk.lt(ZERO);
+        VectorMask<Byte> lowOrQuote = chunk.lanewise(VectorOperators.XOR, TWO)
+            .lt(THIRTY_THREE)
+            .andNot(negative);
+        return lowOrQuote.or(chunk.eq(BACKSLASH));
+    }
+
+    // Lanes that are a double quote, a backslash or non-ASCII.
+    private static VectorMask<Byte> quoteBackslashOrHighLanes(ByteVector chunk) {
+        return chunk.eq(DQUOTE).or(chunk.eq(BACKSLASH)).or(chunk.lt(ZERO));
+    }
 }