Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions java/src/json/ext/Parser.java

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

46 changes: 46 additions & 0 deletions java/src/json/ext/StringDecoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.jruby.util.ByteList;

import java.io.IOException;
import java.nio.ByteBuffer;

/**
* A decoder that reads a JSON-encoded string from the given sources and
Expand All @@ -25,6 +26,8 @@ final class StringDecoder extends ByteListTranscoder {
private boolean allowControlCharacters = false;
private boolean allowInvalidEscape = false;

private final StringScanner scanner = StringScanner.getInstance();

private ByteList out;

// Array used for writing multibyte characters into the buffer at once
Expand All @@ -49,6 +52,49 @@ ByteList decode(ThreadContext context, ByteList src, int start, int end) {
}
}

// Decodes strings with no UTF-8 validation. It is assumed the string is either ASCII-only
// or UTF-8 validation has been disabled.
ByteList decodeNoValidate(ThreadContext context, ByteList src, int start, int end, ByteBuffer chunks) {
    try {
        init(src, start, end);
        this.out = new ByteList(end - start);

        // Work on the backing array directly; all indices below are absolute
        // offsets into it, i.e. shifted by src.begin().
        final byte[] bytes = src.unsafeBytes();
        final int offset = src.begin();
        final int limit = offset + srcEnd;

        // literalStart marks the first byte of the current run that has been
        // scanned but not yet copied into `out`.
        int literalStart = offset + pos;
        int at = literalStart;
        while (at < limit) {
            // Jump to the next byte the scanner reports (backslash or control byte).
            at = scanner.scanEscape(bytes, chunks, at, limit);
            if (at >= limit) {
                break;
            }

            if (bytes[at] != '\\') {
                // Anything else the scanner stops on is treated as a control
                // character: either tolerated and left in the literal run, or
                // rejected at its position relative to the start of src.
                if (allowControlCharacters) {
                    at++;
                    continue;
                }
                charStart = at - offset;
                throw invalidControlChar(context);
            }

            // Flush the literal run that precedes the backslash.
            final int literalLength = at - literalStart;
            if (literalLength > 0) {
                append(bytes, literalStart, literalLength);
            }

            // Hand the escape to the shared handler, which works with
            // src-relative positions: pos is placed just past the backslash.
            charStart = at - offset;
            pos = charStart + 1;
            handleEscapeSequence(context);

            // Resume scanning from wherever the handler left pos.
            at = offset + pos;
            literalStart = at;
        }

        // Copy whatever literal bytes remain after the last escape.
        final int tailLength = limit - literalStart;
        if (tailLength > 0) {
            append(bytes, literalStart, tailLength);
        }
        return out;
    } catch (IOException e) {
        throw context.runtime.newIOErrorFromException(e);
    }
}

@Override
protected int readUtf8Char(ThreadContext context) {
int c = super.readUtf8Char(context);
Expand Down
141 changes: 122 additions & 19 deletions java/src/json/ext/StringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
class StringScanner {
/** Set in the returned bits when the whole body is plain printable ASCII. */
static final long PLAIN_BIT = 1L << 32;
static final long ASCII_BIT = 1L << 33;

/** Returned when no closing quote is found before {@code end}. */
static final long NOT_FOUND = -1L;

Expand All @@ -41,8 +43,6 @@ class StringScanner {
.loadClass(VECTORIZED_SCANNER_CLASS);
scanner = (StringScanner) vectorized.getDeclaredConstructor().newInstance();
} catch (Throwable t) {
// jdk.incubator.vector unavailable (or any load failure):
// keep the SWAR implementation.
scanner = new StringScanner();
}
}
Expand All @@ -54,31 +54,52 @@ static StringScanner getInstance() {
}

/**
* Scans {@code data[start..end)} for the closing quote, honouring backslash
* escapes.
* Scans {@code data[start..end)} for a closing quote. This method will
* also report if there are any interesting bytes within the range
* {@code data[start..end)}. An interesting byte is a control
* character, a backslash, or a byte with the high bit set.
*
* @param chunks a little-endian {@link ByteBuffer} over {@code data}
*
* @param chunks a little-endian {@link ByteBuffer} over {@code data}, used
* for the 8-byte SWAR reads (the vectorized subclass reads
* {@code data} directly and ignores it).
* @return packed result: the low 32 bits hold the index of the closing
* quote, or {@code -1} ({@link #NOT_FOUND}) when none is found
* before {@code end}; {@link #PLAIN_BIT} is set when the entire body
* is plain printable ASCII (no escape, no ASCII control character,
* and no non-ASCII byte) and can be copied verbatim.
* and no non-ASCII byte) and can be copied verbatim;
* {@link #ASCII_BIT} is set when the body contains no non-ASCII byte
* (it may still hold escapes or control characters)
*/
long scan(byte[] data, ByteBuffer chunks, int start, int end) {
long scan(byte[] data, ByteBuffer chunks, int start, int end, boolean validateUtf8) {
int p = start;
boolean plain = true;
boolean ascii = true;

outer:
while (true) {
// SWAR: skip 8-byte chunks that contain nothing interesting.
while (p + 8 <= end) {
long x = chunks.getLong(p);
// Due to the byte-by-byte handling if we match an interesting byte,
// if we already know this is a non-ASCII-only string, we simply
// look for quotes and backslashes.
long m = plain ? stringScanMask(x) : quoteBackslashMask(x);
// Pick the cheapest mask that still observes every transition we
// still care about:
// plain -> control, quote, backslash and high bytes
// non-plain, ASCII -> quote, backslash and high bytes, so the
// first non-ASCII byte still stops us and
// clears ASCII_BIT
// non-plain, !ASCII -> quote and backslash only; multi-byte
// UTF-8 is skipped eight bytes at a time
//
// When UTF-8 validation is disabled, non-ASCII bytes are copied
// verbatim, so they stay on the plain fast path and the high-bit
// term drops out of every mask: plain scans for control, quote
// and backslash; non-plain scans for quote and backslash only.
long m;
if (validateUtf8) {
m = plain ? stringScanMask(x)
: ascii ? quoteBackslashHighMask(x)
: quoteBackslashMask(x);
} else {
m = plain ? controlQuoteBackslashMask(x)
: quoteBackslashMask(x);
}
if (m == 0) {
p += 8;
} else {
Expand All @@ -90,24 +111,63 @@ long scan(byte[] data, ByteBuffer chunks, int start, int end) {
while (p < end) {
int b = data[p] & 0xFF;
if (b == '"') {
return ((long) p) | (plain ? PLAIN_BIT : 0L);
return (((long) p) | (plain ? PLAIN_BIT : 0L) | (ascii ? ASCII_BIT : 0L));
}
if (b == '\\') {
plain = false;
p += 2; // skip the backslash and the escaped byte
continue outer;
}
if (b < 0x20 || b >= 0x80) {
if (b < 0x20) {
plain = false;
p++;
continue outer;
}
if (b >= 0x80) {
ascii = false;
if (validateUtf8) {
plain = false;
}
p++;
continue outer;
}
p++;
}
return NOT_FOUND;
}
}


/**
* Scans {@code data[start..end)} for the next backslash or control character.
*
* <p>The caller must guarantee that {@code data[start..end)} contains
* no non-ASCII bytes that need to be decoded.</p>
*
* @return the index of the first backslash or control byte, or {@code end}
* when none is found.
*/
int scanEscape(byte[] data, ByteBuffer chunks, int start, int end) {
    int index = start;

    // Word phase: while a full 8-byte word fits before end, test eight bytes at once.
    for (; index + 8 <= end; index += 8) {
        final long hits = backslashControlMask(chunks.getLong(index));
        if (hits != 0) {
            // A flagged lane has its 0x80 bit set, so the bit position of the
            // lowest set bit divided by 8 is the lane number. That lane is the
            // first matching byte provided chunks is little-endian
            // (NOTE(review): byte order is set by the caller - confirm).
            return index + (Long.numberOfTrailingZeros(hits) >>> 3);
        }
    }

    // Tail phase: fewer than eight bytes remain, check them one by one.
    for (; index < end; index++) {
        final int value = data[index] & 0xFF;
        if (value < 0x20 || value == '\\') {
            return index;
        }
    }

    // Nothing of interest before end.
    return end;
}

/**
* Returns a mask whose high bit (0x80) is set in every lane of {@code x}
* that needs scalar attention: an ASCII control character (&lt; 0x20), a
Expand All @@ -124,11 +184,39 @@ private static long stringScanMask(long x) {
return (control | high | quote | bslash) & HIGH_BITS;
}

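/**
* Like {@link #quoteBackslashMask} but for the escape scan used by
* {@link #scanEscape}: flags backslashes and ASCII control characters
* (&lt; 0x20). Non-ASCII bytes (&gt;= 0x80) are never flagged, because the
* {@code & ~x} term drops every lane whose high bit is already set, so no
* separate high-bit cleanup is needed. Lanes above the first flagged lane may
* be flagged spuriously by borrow propagation; only the lowest set bit is
* reliable.
*/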
private static long backslashControlMask(long x) {
    // Backslash lanes: XOR turns a matching lane into zero, then the
    // zero-byte test raises that lane's 0x80 bit.
    final long slashDiff = x ^ BACKSLASHES;
    final long slashHits = (slashDiff - ONES) & ~slashDiff;

    // Control lanes (bytes < 0x20): the subtraction wraps for small bytes and
    // sets their 0x80 bit; "& ~x" discards lanes whose 0x80 bit was already set.
    final long controlHits = (x - SPACES) & ~x;

    // Keep only the per-lane marker bit.
    return (slashHits | controlHits) & HIGH_BITS;
}

/**
* Like {@link #stringScanMask} but omits the non-ASCII (high-bit) term:
* flags double quotes, backslashes and ASCII control characters (&lt; 0x20)
* only. Used as the starting mask when UTF-8 validation is disabled, where
* non-ASCII bytes are copied verbatim and so stay on the plain fast path.
*/
private static long controlQuoteBackslashMask(long x) {
    // Quote and backslash lanes: XOR zeroes a matching lane, then the
    // zero-byte test raises that lane's 0x80 bit.
    final long quoteDiff = x ^ DOUBLE_QUOTES;
    final long quoteHits = (quoteDiff - ONES) & ~quoteDiff;
    final long slashDiff = x ^ BACKSLASHES;
    final long slashHits = (slashDiff - ONES) & ~slashDiff;

    // Control lanes (bytes < 0x20); "& ~x" keeps lanes that already have the
    // 0x80 bit set (non-ASCII) from being reported.
    final long controlHits = (x - SPACES) & ~x;

    // Keep only the per-lane marker bit.
    return (quoteHits | slashHits | controlHits) & HIGH_BITS;
}

/**
* Like {@link #stringScanMask} but only flags double quotes and backslashes.
* Used once a string is known to require the decoder, so the remaining scan
* for the closing quote still skips clean chunks (including multi-byte
* UTF-8) eight bytes at a time.
* Used once a string is known to require the decoder <em>and</em> to already
* contain non-ASCII bytes, so the remaining scan for the closing quote skips
* clean chunks (including multi-byte UTF-8) eight bytes at a time.
*/
private static long quoteBackslashMask(long x) {
long q = x ^ DOUBLE_QUOTES;
Expand All @@ -137,4 +225,19 @@ private static long quoteBackslashMask(long x) {
long bslash = (s - ONES) & ~s;
return (quote | bslash) & HIGH_BITS;
}

/**
* Like {@link #quoteBackslashMask} but also flags non-ASCII bytes. Used once
* a string is known to require the decoder but is still ASCII-only, so the
* scan keeps skipping printable runs eight bytes at a time yet still stops on
* the first high byte and clears {@link #ASCII_BIT}.
*/
private static long quoteBackslashHighMask(long x) {
    // Quote and backslash lanes: XOR zeroes a matching lane, then the
    // zero-byte test raises that lane's 0x80 bit.
    final long quoteDiff = x ^ DOUBLE_QUOTES;
    final long quoteHits = (quoteDiff - ONES) & ~quoteDiff;
    final long slashDiff = x ^ BACKSLASHES;
    final long slashHits = (slashDiff - ONES) & ~slashDiff;

    // A non-ASCII lane already carries the 0x80 bit, so x itself serves as the
    // high-byte term once everything is masked down to the marker bits.
    return (x | quoteHits | slashHits) & HIGH_BITS;
}
}
Loading
Loading