diff --git a/sjsonnet/src-js/sjsonnet/CharSWAR.scala b/sjsonnet/src-js/sjsonnet/CharSWAR.scala index bcdb85e7..6e64ce82 100644 --- a/sjsonnet/src-js/sjsonnet/CharSWAR.scala +++ b/sjsonnet/src-js/sjsonnet/CharSWAR.scala @@ -33,4 +33,14 @@ object CharSWAR { } false } + + def findFirstEscapeChar(arr: Array[Byte], from: Int, to: Int): Int = { + var i = from + while (i < to) { + val b = arr(i) & 0xff + if (b < 32 || b == '"' || b == '\\') return i + i += 1 + } + -1 + } } diff --git a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java index 46bc7d11..e681409f 100644 --- a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java +++ b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java @@ -31,6 +31,7 @@ private CharSWAR() {} // MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder) private static final VarHandle LONG_VIEW = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.nativeOrder()); + private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; // --- 8-bit SWAR constants (Netty/Pekko pattern) --- // @@ -90,12 +91,35 @@ static boolean hasEscapeChar(char[] arr, int from, int to) { return false; } + /** + * Find the first byte in {@code arr[from..to)} that needs JSON string escaping, or {@code -1} + * when the range is clean. + */ + static int findFirstEscapeChar(byte[] arr, int from, int to) { + int i = from; + int limit = to - 7; + while (i < limit) { + long word = (long) LONG_VIEW.get(arr, i); + long mask = swarMatchMask(word); + if (mask != 0L) { + return i + firstMatchedByte(mask); + } + i += 8; + } + while (i < to) { + int b = arr[i] & 0xFF; + if (b < 32 || b == '"' || b == '\\') return i; + i++; + } + return -1; + } + private static boolean hasEscapeCharSWAR(byte[] arr, int from, int to) { int i = from; int limit = to - 7; // 8 bytes per VarHandle.get while (i < limit) { long word = (long) LONG_VIEW.get(arr, i); - if (swarHasMatch(word)) return true; + if (swarMatchMask(word) != 0L) return true; i += 8; } // Tail: remaining 0-7 bytes @@ -114,7 +138,7 @@ private static boolean hasEscapeCharSWAR(byte[] arr, int from, int to) { *

Uses Netty/Pekko pattern: XOR to produce zero lanes, then * Hacker's Delight formula to detect zero bytes. */ - private static boolean swarHasMatch(long word) { + private static long swarMatchMask(long word) { // 1. Detect '"' via XOR + zero-detection (Netty SWARUtil.applyPattern) long q = word ^ QUOTE; long qz = ~((q & HOLE) + HOLE | q | HOLE); @@ -127,7 +151,13 @@ private static boolean swarHasMatch(long word) { long c = word & CTRL; long cz = ~((c & HOLE) + HOLE | c | HOLE); - return (qz | bz | cz) != 0L; + return qz | bz | cz; + } + + private static int firstMatchedByte(long mask) { + return (LITTLE_ENDIAN + ? Long.numberOfTrailingZeros(mask) + : Long.numberOfLeadingZeros(mask)) >>> 3; } /** Scalar scan for String (used for short strings). */ diff --git a/sjsonnet/src-native/sjsonnet/CharSWAR.scala b/sjsonnet/src-native/sjsonnet/CharSWAR.scala index 5331c012..da1f3b35 100644 --- a/sjsonnet/src-native/sjsonnet/CharSWAR.scala +++ b/sjsonnet/src-native/sjsonnet/CharSWAR.scala @@ -21,12 +21,14 @@ object CharSWAR { private final val QUOTE = 0x2222222222222222L private final val BSLAS = 0x5c5c5c5c5c5c5c5cL private final val CTRL = 0xe0e0e0e0e0e0e0e0L + private final val LITTLE_ENDIAN = + java.nio.ByteOrder.nativeOrder() == java.nio.ByteOrder.LITTLE_ENDIAN /** - * SWAR: returns true if any byte lane in `word` contains '"' (0x22), '\\' (0x5C), or a control + * SWAR: returns a mask for byte lanes in `word` containing '"' (0x22), '\\' (0x5C), or a control * char (< 0x20). */ - @inline private def swarHasMatch(word: Long): Boolean = { + @inline private def swarMatchMask(word: Long): Long = { // 1. Detect '"' via XOR + zero-detection val q = word ^ QUOTE val qz = ~((q & HOLE) + HOLE | q | HOLE) @@ -39,9 +41,13 @@ object CharSWAR { val c = word & CTRL val cz = ~((c & HOLE) + HOLE | c | HOLE) - (qz | bz | cz) != 0L + qz | bz | cz } + @inline private def firstMatchedByte(mask: Long): Int = + (if (LITTLE_ENDIAN) java.lang.Long.numberOfTrailingZeros(mask) + else java.lang.Long.numberOfLeadingZeros(mask)) >>> 3 + def hasEscapeChar(s: String): Boolean = { val len = s.length if (len < 128) { @@ -77,7 +83,7 @@ object CharSWAR { val limit = to - 7 while (i < limit) { val word = Intrinsics.loadLong(barr.atRawUnsafe(i)) - if (swarHasMatch(word)) return true + if (swarMatchMask(word) != 0L) return true i += 8 } // Tail: remaining 0-7 bytes @@ -89,6 +95,28 @@ object CharSWAR { false } + def findFirstEscapeChar(arr: Array[Byte], from: Int, to: Int): Int = { + val len = to - from + if (len < 8) return findFirstEscapeCharScalar(arr, from, to) + val barr = arr.asInstanceOf[ByteArray] + var i = from + val limit = to - 7 + while (i < limit) { + val word = Intrinsics.loadLong(barr.atRawUnsafe(i)) + val mask = swarMatchMask(word) + if (mask != 0L) { + return i + firstMatchedByte(mask) + } + i += 8 + } + while (i < to) { + val b = arr(i) & 0xff + if (b < 32 || b == '"' || b == '\\') return i + i += 1 + } + -1 + } + @inline private def hasEscapeCharScalar(s: String, len: Int): Boolean = { var i = 0 while (i < len) { @@ -108,4 +136,14 @@ object CharSWAR { } false } + + @inline private def findFirstEscapeCharScalar(arr: Array[Byte], from: Int, to: Int): Int = { + var i = from + while (i < to) { + val b = arr(i) & 0xff + if (b < 32 || b == '"' || b == '\\') return i + i += 1 + } + -1 + } } diff --git a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala index 95a67aef..4cfc3f32 100644 --- a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala +++ b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala @@ -307,13 +307,14 @@ class BaseByteRenderer[T <: java.io.OutputStream]( } /** - * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, scans with SWAR, and - * bulk-copies if clean. The getBytes allocation is amortized by avoiding per-char processing. + * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, then bulk-copies clean + * chunks and escapes only the bytes that require it. */ private def visitLongString(str: String): Unit = { val bytes = str.getBytes(java.nio.charset.StandardCharsets.UTF_8) - if (!CharSWAR.hasEscapeChar(bytes, 0, bytes.length)) { - val bLen = bytes.length + val bLen = bytes.length + val firstEscape = CharSWAR.findFirstEscapeChar(bytes, 0, bLen) + if (firstEscape < 0) { elemBuilder.ensureLength(bLen + 2) val arr = elemBuilder.arr val pos = elemBuilder.length @@ -322,13 +323,92 @@ class BaseByteRenderer[T <: java.io.OutputStream]( arr(pos + 1 + bLen) = '"'.toByte elemBuilder.length = pos + bLen + 2 } else { - upickle.core.RenderUtils.escapeByte( - unicodeCharBuilder, - elemBuilder, - str, - escapeUnicode = false, - wrapQuotes = true - ) + val escapedLen = escapedStringLength(bytes, bLen, firstEscape) + elemBuilder.ensureLength(escapedLen) + val arr = elemBuilder.arr + var outPos = elemBuilder.length + arr(outPos) = '"'.toByte + outPos += 1 + var from = 0 + var escPos = firstEscape + while (escPos >= 0) { + if (escPos > from) { + val chunkLen = escPos - from + System.arraycopy(bytes, from, arr, outPos, chunkLen) + outPos += chunkLen + } + outPos = escapeByteInline(bytes(escPos) & 0xff, arr, outPos) + from = escPos + 1 + escPos = if (from < bLen) CharSWAR.findFirstEscapeChar(bytes, from, bLen) else -1 + } + if (from < bLen) { + val tailLen = bLen - from + System.arraycopy(bytes, from, arr, outPos, tailLen) + outPos += tailLen + } + arr(outPos) = '"'.toByte + elemBuilder.length = outPos + 1 + } + } + + private def escapedStringLength(bytes: Array[Byte], bLen: Int, firstEscape: Int): Int = { + var len = bLen + 2 + var from = firstEscape + var escPos = firstEscape + while (escPos >= 0) { + len += escapeExtraLength(bytes(escPos) & 0xff) + from = escPos + 1 + escPos = if (from < bLen) CharSWAR.findFirstEscapeChar(bytes, from, bLen) else -1 + } + len + } + + @inline private def escapeExtraLength(b: Int): Int = + (b: @scala.annotation.switch) match { + case '"' | '\\' | '\b' | '\f' | '\n' | '\r' | '\t' => 1 + case _ => 5 + } + + /** Inline JSON escape for one byte that is known to require escaping. */ + @inline private def escapeByteInline(b: Int, arr: Array[Byte], outPos0: Int): Int = { + val outPos = outPos0 + (b: @scala.annotation.switch) match { + case '"' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = '"'.toByte + outPos + 2 + case '\\' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = '\\'.toByte + outPos + 2 + case '\b' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'b'.toByte + outPos + 2 + case '\f' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'f'.toByte + outPos + 2 + case '\n' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'n'.toByte + outPos + 2 + case '\r' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'r'.toByte + outPos + 2 + case '\t' => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 't'.toByte + outPos + 2 + case c => + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'u'.toByte + arr(outPos + 2) = '0'.toByte + arr(outPos + 3) = '0'.toByte + arr(outPos + 4) = BaseByteRenderer.HEX_BYTES((c >> 4) & 0xf) + arr(outPos + 5) = BaseByteRenderer.HEX_BYTES(c & 0xf) + outPos + 6 } } @@ -377,6 +457,26 @@ object BaseByteRenderer { a } + /** Hex digits used by inline byte escaping for control chars. */ + private[sjsonnet] val HEX_BYTES: Array[Byte] = Array( + '0'.toByte, + '1'.toByte, + '2'.toByte, + '3'.toByte, + '4'.toByte, + '5'.toByte, + '6'.toByte, + '7'.toByte, + '8'.toByte, + '9'.toByte, + 'a'.toByte, + 'b'.toByte, + 'c'.toByte, + 'd'.toByte, + 'e'.toByte, + 'f'.toByte + ) + /** * Reusable scratch buffer for writeLongDirect (max 20 bytes for Long.MinValue). Not thread-safe, * but renderers are single-threaded.