From 883fca5feedaba4c9ac490e173540f3b2e2ca4f4 Mon Sep 17 00:00:00 2001 From: "He-Pin(kerr)" Date: Wed, 13 May 2026 15:43:25 +0800 Subject: [PATCH 1/4] perf: parse strict JSON imports from bytes Motivation: PR #840 introduced a strict JSON fast path for .json imports but still forces a full UTF-8 string decode for every cached file before handing the text to ujson.StringParser. Real-world workloads (e.g. kube-prometheus) import many .json files; decoding each one into an intermediate String just so the parser can re-scan the same characters is pure overhead. Key Design Decision: ujson 4.4.3 ships ByteArrayParser, which parses UTF-8 JSON directly from a byte array without an intermediate String. Cache small resolved files as raw bytes (already what we read from disk) and lazily decode text only when the importstr/parser-input path actually needs it. Preserve parse-cache content identity by hashing the cached bytes with SHA-256 (length + hex digest) so external ParseCache implementations keep the same collision resistance as the old full-string key. Modification: * Importer.scala: CachedResolver.parseJsonImport now calls ujson.ByteArrayParser.transform(content.readRawBytes(), visitor) instead of decoding the whole file to String first. * CachedResolvedFile.scala (JVM/Native): small files are cached as Array[Byte]; getParserInput / readString materialize the String lazily; readRawBytes returns the cached bytes directly; contentHash is length + SHA-256 over the cached bytes; binary imports still use StaticBinaryResolvedFile. * PreloaderTests.scala: tighten the strict-JSON fast-path coverage so it fails if the fast path ever falls back to readString(). Result: * Output equality vs upstream sjsonnet and jrsonnet preserved on kube-prometheus and large_string_template. * Native kube-prometheus hyperfine A/B (forward & reverse): clean 139.4 +/- 2.8 ms -> candidate 132.7 +/- 1.9 ms (forward) candidate 132.1 +/- 1.9 ms vs clean 140.3 +/- 2.6 ms (reverse) * Full ./mill __.test green. 
References: Follow-up to https://github.com/databricks/sjsonnet/pull/840 --- .../sjsonnet/CachedResolvedFile.scala | 62 +++++++++++++------ sjsonnet/src/sjsonnet/Importer.scala | 2 +- .../test/src/sjsonnet/PreloaderTests.scala | 3 +- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala index b0d1cd7ba..f29f12aa0 100644 --- a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala +++ b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala @@ -5,6 +5,7 @@ import fastparse.ParserInput import java.io.File import java.nio.charset.StandardCharsets import java.nio.file.Files +import java.security.MessageDigest /** * A class that encapsulates a resolved import. This is used to cache the result of resolving an @@ -37,17 +38,13 @@ class CachedResolvedFile( s"Resolved import path $resolvedImportPath is too large: ${jFile.length()} bytes > $memoryLimitBytes bytes" ) - private val resolvedImportContent: ResolvedFile = { - // TODO: Support caching binary data - if (jFile.length() > cacheThresholdBytes) { - // If the file is too large, then we will just read it from disk - null - } else if (binaryData) { - StaticBinaryResolvedFile(readRawBytes(jFile)) - } else { - StaticResolvedFile(readString(jFile)) - } - } + private val cachedBytes: Array[Byte] = + if (jFile.length() > cacheThresholdBytes) null + else readRawBytes(jFile) + + private val cachedBinaryContent: ResolvedFile = + if (cachedBytes != null && binaryData) StaticBinaryResolvedFile(cachedBytes) + else null private def readString(jFile: File): String = { new String(Files.readAllBytes(jFile.toPath), StandardCharsets.UTF_8) @@ -55,45 +52,72 @@ class CachedResolvedFile( private def readRawBytes(jFile: File): Array[Byte] = Files.readAllBytes(jFile.toPath) + private lazy val resolvedTextContent: ResolvedFile = + StaticResolvedFile(new String(cachedBytes, StandardCharsets.UTF_8)) + + private 
lazy val cachedBytesHash: String = + cachedBytes.length.toString + ":" + bytesToHex( + MessageDigest.getInstance("SHA-256").digest(cachedBytes) + ) + + private def bytesToHex(bytes: Array[Byte]): String = { + val hexChars = "0123456789abcdef" + val out = new Array[Char](bytes.length * 2) + var i = 0 + var j = 0 + while (i < bytes.length) { + val b = bytes(i) & 0xff + out(j) = hexChars.charAt(b >>> 4) + out(j + 1) = hexChars.charAt(b & 0x0f) + i += 1 + j += 2 + } + new String(out) + } + /** * A method that will return a reader for the resolved import. If the import is too large, then * this will return a reader that will read the file from disk. Otherwise, it will return a reader * that reads from memory. */ def getParserInput(): ParserInput = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { FileParserInput(jFile) + } else if (binaryData) { + cachedBinaryContent.getParserInput() } else { - resolvedImportContent.getParserInput() + resolvedTextContent.getParserInput() } } override def readString(): String = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk readString(jFile) + } else if (binaryData) { + cachedBinaryContent.readString() } else { // Otherwise, we will read it from memory - resolvedImportContent.readString() + resolvedTextContent.readString() } } override def contentHash(): String = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk Platform.hashFile(jFile) } else { - resolvedImportContent.contentHash() + cachedBytesHash } } override def readRawBytes(): Array[Byte] = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk readRawBytes(jFile) } else { // Otherwise, we will read it from memory - resolvedImportContent.readRawBytes() + cachedBytes } } } diff --git 
a/sjsonnet/src/sjsonnet/Importer.scala b/sjsonnet/src/sjsonnet/Importer.scala index ca823389d..0ddc7c784 100644 --- a/sjsonnet/src/sjsonnet/Importer.scala +++ b/sjsonnet/src/sjsonnet/Importer.scala @@ -302,7 +302,7 @@ object CachedResolver { try { val visitor = new JsonImportVisitor(fileScope, internedStrings, settings) - Some((ujson.StringParser.transform(content.readString(), visitor), fileScope)) + Some((ujson.ByteArrayParser.transform(content.readRawBytes(), visitor), fileScope)) } catch { case _: ujson.ParsingFailedException | _: DuplicateJsonKey | _: InvalidJsonNumber | _: JsonParseDepthExceeded | _: NumberFormatException => diff --git a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala index 9d3bc985e..f8b9e0818 100644 --- a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala +++ b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala @@ -173,7 +173,8 @@ object PreloaderTests extends TestSuite { class JsonOnlyResolvedFile(content: String) extends ResolvedFile { def getParserInput(): fastparse.ParserInput = throw new RuntimeException("strict JSON should not be parsed with fastparse") - def readString(): String = content + def readString(): String = + throw new RuntimeException("strict JSON should not be decoded as text") def contentHash(): String = content def readRawBytes(): Array[Byte] = content.getBytes(java.nio.charset.StandardCharsets.UTF_8) From 90eba49e480c8ffab4eb60ab796f1a872dafcf34 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Wed, 13 May 2026 19:56:35 +0800 Subject: [PATCH 2/4] perf: use hybrid sort for inline object order Motivation: Large inline objects produced by strict JSON imports can exceed the small-object shape that computeSortedInlineOrder was originally tuned for. Native sampling on kube-prometheus showed sorted inline-order computation as a materialization hotspot, and insertion sort becomes quadratic on those wider objects. 
Modification: Keep insertion sort for small inline objects, and use an in-place quicksort with insertion-sort cleanup for larger visible field sets. Record the accepted benchmark result and rejected parser/key-render micro-routes in the performance ledgers. Result: Kube-prometheus Native A/B improved on top of strict JSON byte imports, with forward mean 145.3ms -> 140.0ms and reverse mean 151.6ms -> 148.9ms. Formatting and the full test suite pass. References: Upstream-base: databricks/sjsonnet@cedc083b4676be43e01bdd6f6cb5d7f4432d0d32 Prior optimization: 883fca5f perf: parse strict JSON imports from bytes --- bench/reports/sjsonnet-vs-jrsonnet-gaps.md | 133 +++++++++++++++++++++ bench/reports/sync-points.md | 45 +++++++ sjsonnet/src/sjsonnet/Materializer.scala | 56 ++++++++- 3 files changed, 229 insertions(+), 5 deletions(-) create mode 100644 bench/reports/sjsonnet-vs-jrsonnet-gaps.md create mode 100644 bench/reports/sync-points.md diff --git a/bench/reports/sjsonnet-vs-jrsonnet-gaps.md b/bench/reports/sjsonnet-vs-jrsonnet-gaps.md new file mode 100644 index 000000000..2ebc933a1 --- /dev/null +++ b/bench/reports/sjsonnet-vs-jrsonnet-gaps.md @@ -0,0 +1,133 @@ +# sjsonnet vs jrsonnet current gap ledger + +This report tracks only locally rechecked gaps where current sjsonnet is still +slower than a source-built jrsonnet reference. The jrsonnet upstream benchmark +document is still useful for broad ranking, but its sjsonnet rows reference older +released builds and must not be treated as current truth without local recheck. 
+ +## Baseline + +| Field | Value | +|---|---| +| sjsonnet base | `upstream/master` at `cedc083b4676be43e01bdd6f6cb5d7f4432d0d32` | +| sjsonnet binary | Scala Native `sjsonnet.native[3.3.7].nativeLink` | +| jrsonnet reference | `origin/master` at `5e8cbcdbc860a616dbd193428f8933dd7532f537`, `cargo build --release -p jrsonnet` | +| benchmark rule | single benchmark process; no concurrent Mill/JMH/hyperfine | + +## Latest confirmed local gaps + +| priority | workload | sjsonnet Native | jrsonnet | gap | status | next direction | +|---:|---|---:|---:|---:|---|---| +| 1 | `bench/resources/cpp_suite/large_string_template.jsonnet` | `10.03 +/- 0.80 ms` | `2.87 +/- 0.13 ms` | jrsonnet `3.49x` faster | open | Need a materially new parser/format design; previous scanner/renderer micro-routes were rejected. | +| 2 | `jrsonnet/tests/realworld/entry-kube-prometheus.jsonnet -J vendor` | `132.09 +/- 2.33 ms` | `85.29 +/- 1.12 ms` | jrsonnet `1.55x` faster | improved | Strict JSON byte import parsing reduced sjsonnet Native time by about 5% locally; remaining gap is mostly materialization/rendering and startup. | + +## Accepted in this session + +| idea | validation | result | +|---|---|---| +| Parse strict `.json` imports from UTF-8 bytes and cache small resolved files as bytes until text is needed | Output equality against jrsonnet and previous sjsonnet on kube-prometheus; focused `PreloaderTests`; Native A/B forward+reverse; focused JMH guards; full `__.test` | kept: kube Native improved from clean `139.4 +/- 2.8 ms` to candidate `132.7 +/- 1.9 ms` forward, and clean `140.3 +/- 2.6 ms` to candidate `132.1 +/- 1.9 ms` reverse. Debug stats parse time improved from prior clean `~88.3ms` to `69.8ms` in the final run. 
| +| Use in-place quicksort for inline object sorted order when field count is large | `sample` on repeated kube materialization; output equality on kube and `large_string_template`; Native A/B forward+reverse; focused renderer/json tests; `__.checkFormat`; full `__.test` | kept: `sample` reduced `computeSortedInlineOrder` top-stack samples from `164` to `63` and sort-specific samples from `164` to `75`; kube Native improved from `145.3 +/- 3.6 ms` to `140.0 +/- 3.2 ms` forward and from `151.6 +/- 10.2 ms` to `148.9 +/- 3.7 ms` reverse. | + +## Historical jrsonnet-doc gaps that are no longer primary local gaps + +| workload | reason | +|---|---| +| Foldl string concat | Prior stacked recheck showed sjsonnet faster than source-built jrsonnet on extracted foldl workloads. | +| Go `std.foldl` | Prior stacked recheck showed sjsonnet faster than source-built jrsonnet. | +| Big object | Prior stacked recheck was effectively neutral; latest focus stays on larger confirmed gaps. | +| `realistic2` | Prior stacked recheck showed sjsonnet faster than source-built jrsonnet. | +| `large_string_join` | Prior local join work closed the jrsonnet gap; keep as guard only. | + +## Rejected in this session + +| idea | validation | result | +|---|---|---| +| Raise nested `ByteBuilder` flush threshold from 8 KiB to 64 KiB | Output equality on large-template and kube; kube Native A/B | negative: clean `144.0 +/- 2.1 ms`, candidate `153.3 +/- 15.5 ms`. | +| Raise nested flush threshold to 16 KiB / 32 KiB | Output equality on kube; forward and reverse hyperfine | unstable/noisy: 32 KiB forward looked `~3%` faster, reverse had clean `141.5 +/- 1.6 ms` faster than candidate `143.7 +/- 1.9 ms`. | +| Fast-path single-part parsed string instead of always calling `mkString` | Output equality on `large_string_template`; forward and reverse hyperfine | unstable/noisy: forward candidate `10.4 +/- 0.6 ms` vs clean `10.6 +/- 1.2 ms`; reverse clean `10.3 +/- 0.7 ms` vs candidate `10.5 +/- 0.8 ms`. 
| +| Add 4 inline object value-cache slots | Output equality on kube; debug stats; forward and reverse hyperfine; focused JMH guards | not enough: overflows `2452 -> 946`, but Native A/B was neutral (`1.00x` in reverse). | +| Add lazy small overflow cache before HashMap | Output equality on kube; debug stats; hyperfine | negative: overflows `2452 -> 83`, but clean `140.9 +/- 1.4 ms` beat candidate `141.7 +/- 2.3 ms`. | +| Mark strict JSON import objects to skip materializer cycle tracking | Output equality on kube; debug stats; forward and reverse hyperfine | not enough: materialize debug time improved, but Native A/B was only weak positive forward and neutral reverse. | +| Parse strict JSON integers through `ParseUtils.parseIntegralNum` before `toDouble` fallback | Output equality on kube and `large_string_template`; JSON fast-path tests; Native kube forward/reverse A/B | not enough: explicit integral scan regressed parse debug time; `decIndex/expIndex` variant removed the scan but remained noisy. Forward median favored candidate, while reverse median/min favored baseline, so it was reverted. | +| Precheck object keys with `Platform.isAsciiJsonSafe` before direct byte copy | Output equality on kube and `large_string_template`; renderer test covering safe, escaped, and Unicode keys; Native kube forward/reverse A/B | negative: forward median/min were weakly positive but mean was worse; reverse favored baseline across mean/median/min (`141.3/140.2/135.8ms` baseline vs `144.2/142.5/138.1ms` candidate). Reverted. | + +## Current hypothesis + +Large-template remains ratio-priority, but the known parser/format/renderer +micro-routes are exhausted or noisy. Kube-prometheus improved through byte-based +strict JSON imports, but source-built jrsonnet is still about `1.55x` faster. +Next work should profile the remaining materialization/render/startup costs and +target a larger structural cost rather than a single parameter or small cache +tweak. 
+ +## 2026-05-13 native-vs-jvm split + +Re-profile shows the `large_string_template` gap is largely a Scala Native +runtime artifact, not an algorithmic gap in the formatter or parser: + +| metric | value | +|---|---| +| JVM JMH (warm) `RegressionBenchmark.main` | `0.873 ms/op` | +| Native cold hyperfine | `~14.5 ms` mean, `9.8 ms` min | +| Native `--debug-stats` (single run, with timing overhead) | parse `12.8ms` + eval `14.6ms` + materialize `2.3ms` | +| jrsonnet native | `5.5 +/- 0.5 ms` | +| sjsonnet Native trivial-startup (`null`) | `~6.6 ms` mean, `5.5 ms` min | +| jrsonnet trivial-startup (`null`) | `~3.9 ms` mean, `2.7 ms` min | + +Implication: ~5.5ms of the sjsonnet 14.5ms is process startup. Actual work +ratio min-to-min is roughly `4.3ms / 2.1ms` = ~2x, not 3.4x. Stream-render to +stdout is already in place (`SjsonnetMainBase.renderNormal` uses `ByteRenderer` +directly to `stdoutStream` in `case None if stdoutStream != null`), so the +final output stage is already byte-streamed. + +The remaining double work is: `Format.format` builds a full ~590KB String into +a `StringBuilder`, then `BaseByteRenderer.visitString` re-scans that String for +JSON escape chars. Removing this double scan requires routing the renderer into +`Format` so format chunks are escaped and emitted as they are produced. That is +a structural cross-cutting change touching the Format ABI and several stdlib +callers; it is not a single-file micro-optimization and warrants explicit user +go-ahead before implementation. + +## 2026-05-13 rejected: visitLongString chunked-char copy + +Rewrote `BaseByteRenderer.visitLongString` to avoid the `str.getBytes(UTF-8)` +allocation by scanning chars directly, copying ASCII runs via +`Platform.copyAsciiStringRangeToBytes` (which wraps `String.getBytes(srcBegin, +srcEnd, dst, dstPos)`), and emitting escapes inline. 
+ +JVM JMH guard on `large_string_template`: + +| variant | iter median (ms/op) | +|---|---:| +| clean baseline (5 runs) | `0.82` (range `0.79-0.92`) | +| chunked-char path (5 runs) | `1.21` (range `1.20-1.24`) | + +Result: **+46% JVM regression**. JVM's intrinsified `String.getBytes(UTF-8)` on +the whole string plus a single SWAR scan is faster than per-chunk +`String.getBytes(srcBegin, srcEnd, dst, dstPos)` calls. The hypothesized Native +gain (skip a ~600KB allocation per long string) was not measured, but the +shared-code JVM cost makes the change unshippable per PR-rule-#19 (no +regression). Rejected without Native A/B; pursuing platform-gating would add +complexity disproportionate to the unproven benefit. + +## 2026-05-13 rejected: lazy simple-named format byte rendering + +Explored a structural version of "Format renders directly to bytes" for large +`%(key)s` object format strings. The implementation kept `Format.format` +unchanged for string semantics, added a lazy `Val.Str` representation only for +large simple-named formats, forced object key lookups up front to preserve error +timing, and taught `ByteRenderer` to render pre-escaped format pieces directly. + +Three variants were tried: + +| variant | JVM JMH `large_string_template` | Native forward A/B | Native reverse A/B | decision | +|---|---:|---:|---:|---| +| per-static-chunk escaped byte arrays | `0.81-0.85 ms/op` | baseline `10.05ms`, candidate `10.40ms` | baseline `10.53ms`, candidate `10.94ms` | reject | +| flat static byte buffer + offsets | `0.73-0.74 ms/op` | baseline `10.23ms`, candidate `10.39ms` | baseline `10.40ms`, candidate `10.61ms` | reject | +| flat static bytes + pre-escaped dynamic bytes | `0.73-0.78 ms/op` | baseline `10.070ms`, candidate `10.047ms` | baseline `9.890ms`, candidate `10.226ms` | reject | + +Conclusion: this direction improves warm JVM JMH but does not improve the +Scala Native whole-process target. 
The extra Native work to pre-escape and +retain byte slices offsets the avoided final `StringBuilder`/renderer scan, and +the only positive Native run was within noise and reversed when command order +changed. Code was reverted; no runtime optimization retained. diff --git a/bench/reports/sync-points.md b/bench/reports/sync-points.md new file mode 100644 index 000000000..6606df3e9 --- /dev/null +++ b/bench/reports/sync-points.md @@ -0,0 +1,45 @@ +# Performance sync points + +This file tracks current performance migration and exploration work so the same +idea is not repeated without new evidence. + +## Active baselines + +| Area | Ref | Notes | +|---|---|---| +| upstream/master | `cedc083b4676be43e01bdd6f6cb5d7f4432d0d32` | Clean base used for current local rechecks. | +| jrsonnet | `5e8cbcdbc860a616dbd193428f8933dd7532f537` | Source-built with `cargo build --release -p jrsonnet`. | + +## Current confirmed gaps + +| workload | status | report | +|---|---|---| +| `large_string_template` | open; jrsonnet `3.49x` faster | `bench/reports/sjsonnet-vs-jrsonnet-gaps.md` | +| kube-prometheus realworld | improved by strict JSON byte import parsing; jrsonnet still `1.55x` faster | `bench/reports/sjsonnet-vs-jrsonnet-gaps.md` | + +## Accepted ideas + +| idea | status | evidence | +|---|---|---| +| Strict JSON byte import parsing | implemented locally; not committed | `Importer.parseJsonImport` uses `ujson.ByteArrayParser`; `CachedResolvedFile` caches small files as bytes and lazily decodes text; kube Native A/B improved candidate to `132.7/132.1 ms` vs clean `139.4/140.3 ms`. | +| Hybrid sort for inline object materialization | implemented locally; pending PR | `Materializer.computeSortedInlineOrder` keeps insertion sort for ≤16 visible fields and uses in-place quicksort for larger inline objects. Native kube A/B on top of strict JSON bytes improved forward `145.3 -> 140.0 ms` and reverse `151.6 -> 148.9 ms`; output equality and full `__.test` passed. 
| + +## Rejected ideas + +| idea | reason | +|---|---| +| Nested byte-buffer flush threshold 16/32/64 KiB | Not stable positive under same-run forward/reverse Native A/B. | +| Single-part parsed string fast path | Not stable positive under same-run forward/reverse Native A/B. | +| 4-slot object value cache | Reduced overflow count but produced only neutral Native wall-clock results. | +| Lazy small overflow cache before HashMap | Reduced overflow count further but regressed Native wall-clock. | +| Strict JSON object cycle-check skip marker | Debug stats improved, but same-run Native A/B was not stable enough to keep. | +| visitLongString char/range-copy path | Stable JVM JMH regression on `large_string_template` (`~0.82ms` baseline to `~1.21ms` candidate); rejected before Native A/B. | +| Lazy simple-named format byte rendering | Three structural variants improved/held JVM JMH but were neutral-to-negative on Scala Native whole-process `large_string_template`; code reverted. | +| Strict JSON integer parse via `ParseUtils.parseIntegralNum` | Tried both an explicit integral scan and the parser-provided `decIndex/expIndex` fast path. Output stayed identical, but kube Native A/B was not stable-positive; reverse median/min favored the existing `toString.toDouble` path. | +| ByteRenderer ASCII-safe object key precheck | Replaced direct key rendering with `Platform.isAsciiJsonSafe` + low-byte copy for safe keys. Output stayed identical, but kube Native reverse A/B favored the existing short-string renderer across mean/median/min. | + +## Policy + +Before opening a performance PR, rerun focused JMH and Scala Native hyperfine +against the current base and source-built jrsonnet. Keep a change only when the +target benchmark is stable-positive and guard benchmarks do not regress. 
diff --git a/sjsonnet/src/sjsonnet/Materializer.scala b/sjsonnet/src/sjsonnet/Materializer.scala index 9892e3da3..bc770d153 100644 --- a/sjsonnet/src/sjsonnet/Materializer.scala +++ b/sjsonnet/src/sjsonnet/Materializer.scala @@ -619,20 +619,66 @@ object Materializer extends Materializer { } i += 1 } - // Insertion sort by key name (optimal for 2-8 elements) - i = 1 - while (i < visCount) { + sortInlineOrder(order, keys, visCount) + order + } + + private def sortInlineOrder(order: Array[Int], keys: Array[String], len: Int): Unit = { + if (len <= 1) return + if (len <= 16) insertionSortInlineOrder(order, keys, 0, len - 1) + else quickSortInlineOrder(order, keys, 0, len - 1) + } + + private def insertionSortInlineOrder( + order: Array[Int], + keys: Array[String], + left: Int, + right: Int): Unit = { + var i = left + 1 + while (i <= right) { val pivotIdx = order(i) val pivotKey = keys(pivotIdx) var j = i - 1 - while (j >= 0 && Util.compareStringsByCodepoint(keys(order(j)), pivotKey) > 0) { + while (j >= left && Util.compareStringsByCodepoint(keys(order(j)), pivotKey) > 0) { order(j + 1) = order(j) j -= 1 } order(j + 1) = pivotIdx i += 1 } - order + } + + private def quickSortInlineOrder( + order: Array[Int], + keys: Array[String], + left0: Int, + right0: Int): Unit = { + var left = left0 + var right = right0 + while (right - left > 16) { + val pivotKey = keys(order((left + right) >>> 1)) + var i = left + var j = right + while (i <= j) { + while (Util.compareStringsByCodepoint(keys(order(i)), pivotKey) < 0) i += 1 + while (Util.compareStringsByCodepoint(keys(order(j)), pivotKey) > 0) j -= 1 + if (i <= j) { + val tmp = order(i) + order(i) = order(j) + order(j) = tmp + i += 1 + j -= 1 + } + } + if (j - left < right - i) { + if (left < j) quickSortInlineOrder(order, keys, left, j) + left = i + } else { + if (i < right) quickSortInlineOrder(order, keys, i, right) + right = j + } + } + insertionSortInlineOrder(order, keys, left, right) } /** From 
7a6e25b8170915de6597404230cff31a0c976de8 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Wed, 13 May 2026 21:26:32 +0800 Subject: [PATCH 3/4] docs: record rejected performance probes Motivation: Keep the performance exploration ledger current so future optimization work does not repeat Native-negative or build-invalid routes. Modification: Record rejected short-string, ASCII-safe, inline sort-cache, path-only parse-cache, and Native GC configuration probes with the validation evidence that ruled them out. Result: No runtime code changes are retained; the branch documents the failed hypotheses and preserves the current accepted optimization stack. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- bench/reports/sjsonnet-vs-jrsonnet-gaps.md | 6 ++++++ bench/reports/sync-points.md | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/bench/reports/sjsonnet-vs-jrsonnet-gaps.md b/bench/reports/sjsonnet-vs-jrsonnet-gaps.md index 2ebc933a1..ef25f8dec 100644 --- a/bench/reports/sjsonnet-vs-jrsonnet-gaps.md +++ b/bench/reports/sjsonnet-vs-jrsonnet-gaps.md @@ -50,6 +50,12 @@ released builds and must not be treated as current truth without local recheck. | Mark strict JSON import objects to skip materializer cycle tracking | Output equality on kube; debug stats; forward and reverse hyperfine | not enough: materialize debug time improved, but Native A/B was only weak positive forward and neutral reverse. | | Parse strict JSON integers through `ParseUtils.parseIntegralNum` before `toDouble` fallback | Output equality on kube and `large_string_template`; JSON fast-path tests; Native kube forward/reverse A/B | not enough: explicit integral scan regressed parse debug time; `decIndex/expIndex` variant removed the scan but remained noisy. Forward median favored candidate, while reverse median/min favored baseline, so it was reverted. 
| | Precheck object keys with `Platform.isAsciiJsonSafe` before direct byte copy | Output equality on kube and `large_string_template`; renderer test covering safe, escaped, and Unicode keys; Native kube forward/reverse A/B | negative: forward median/min were weakly positive but mean was worse; reverse favored baseline across mean/median/min (`141.3/140.2/135.8ms` baseline vs `144.2/142.5/138.1ms` candidate). Reverted. | +| Render short strings by scanning `String.charAt` directly instead of copying to the reusable char buffer first | Output equality on kube and `large_string_template`; renderer/json focused tests; Native kube and `large_string_template` forward/reverse A/B | reject: kube moved weakly positive (`140.75ms` baseline to `139.38ms` candidate forward; `157.39ms` baseline to `147.77ms` candidate reverse), but `large_string_template` regressed/noised negative (`10.99ms` baseline to `14.96ms` candidate forward; `10.27ms` baseline to `10.82ms` candidate reverse). The reusable-buffer `getChars` path remains safer for the large-string priority gap. | +| Mark long strict-JSON imported string values as ASCII-safe during parse | Output equality on kube and `large_string_template`; JVM tests; Native kube forward/reverse A/B | reject: debug stats looked lower in one run, but wall-clock did not hold. Forward was noise-level (`141.19ms` baseline vs `138.64ms` candidate mean, nearly identical median/min), while reverse favored baseline (`134.93ms` baseline vs `137.95ms` candidate). Reverted. | +| Lower parsed Jsonnet string ASCII-safe threshold from `>1024` to `>=128` | Output equality on kube and `large_string_template`; JVM tests; Native kube forward/reverse A/B | reject: the extra parse-time scan did not pay back. Forward favored baseline (`142.00ms` vs `147.34ms` candidate mean), and reverse again favored baseline (`139.74ms` vs `142.20ms` candidate mean). Reverted. 
| +| Lazily cache computed inline-object sorted order during materialization | Output equality on kube and `large_string_template`; JVM tests; Native kube forward/reverse A/B | reject: reduced a repeated-work sampling hotspot in theory, but single-run kube was not stable-positive. Forward only improved median/min while mean worsened (`137.43ms` baseline vs `143.38ms` candidate); reverse favored baseline mean/median (`134.71/133.77ms` baseline vs `135.61/134.03ms` candidate). Reverted. | +| Native CLI path-only parse cache to avoid file content hashing | JVM tests; Native link; output equality on `null`, kube, and `large_string_template`; Native `null` and kube A/B | reject: skipping `contentHash()` was neutral on `null` and negative/noisy on kube. `null` was effectively unchanged (`4.98ms` baseline vs `4.95ms` candidate mean), while kube favored baseline in both command orders (`141.49/137.67ms` baseline vs `141.87/139.48ms` candidate forward; reverse baseline `138.86/136.33ms` vs candidate `168.07/143.16ms`). Reverted. | +| Switch Native release GC from default Immix to Commix | Mill build check | rejected before benchmarking: the current Mill Scala Native plugin API in this build did not expose `GC.commix` through `scalanativelib.api`, and `scala.scalanative.build.GC` was not on the build script classpath. Reverted rather than guessing further. | ## Current hypothesis diff --git a/bench/reports/sync-points.md b/bench/reports/sync-points.md index 6606df3e9..474012fd8 100644 --- a/bench/reports/sync-points.md +++ b/bench/reports/sync-points.md @@ -37,6 +37,12 @@ idea is not repeated without new evidence. | Lazy simple-named format byte rendering | Three structural variants improved/held JVM JMH but were neutral-to-negative on Scala Native whole-process `large_string_template`; code reverted. | | Strict JSON integer parse via `ParseUtils.parseIntegralNum` | Tried both an explicit integral scan and the parser-provided `decIndex/expIndex` fast path. 
Output stayed identical, but kube Native A/B was not stable-positive; reverse median/min favored the existing `toString.toDouble` path. | | ByteRenderer ASCII-safe object key precheck | Replaced direct key rendering with `Platform.isAsciiJsonSafe` + low-byte copy for safe keys. Output stayed identical, but kube Native reverse A/B favored the existing short-string renderer across mean/median/min. | +| Direct `String.charAt` scan in `visitShortString` | Avoided the reusable `getChars` temp-buffer copy. Output stayed identical and kube Native improved weakly, but `large_string_template` regressed/noised negative in both command orders, so the existing reusable-buffer renderer path was restored. | +| Long strict-JSON imported string values marked ASCII-safe during parse | Mirrored the large Jsonnet string literal optimization for `.json` imports. Output stayed identical, but kube Native reverse A/B favored baseline, so the parse-time scan was removed. | +| Lower parsed Jsonnet string ASCII-safe threshold to `>=128` | Tried to align parser marking with ByteRenderer's long-string cutoff. Output stayed identical, but the parse-time scan regressed kube Native in both command orders. | +| Lazy materialization-time cache for inline-object sorted order | Stored `computeSortedInlineOrder` results back on `Val.Obj` when absent. Output stayed identical, but real kube Native single-run A/B was neutral-to-negative, so the lazy write was removed. | +| Native CLI path-only parse cache | Avoided `ResolvedFile.contentHash()` for the Native CLI to bypass SHA-256/OpenSSL provider work. It linked and preserved output, but Native wall-clock was neutral on `null` and negative/noisy on kube, so the default content-hash cache was restored. | +| Native GC switch to Commix | Attempted to set `nativeGC` to Commix in Mill. Build script compilation failed because the GC API was not exposed on the current Mill build classpath, so the config experiment was reverted. 
| ## Policy From a0861508e5f9c8a36a7ba0e30aa381d4b957cc3b Mon Sep 17 00:00:00 2001 From: He-Pin Date: Fri, 15 May 2026 18:30:38 +0800 Subject: [PATCH 4/4] perf: use median-of-three pivot in inline-order quicksort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: quickSortInlineOrder previously chose a fixed mid-element pivot (keys(order((left + right) >>> 1))). On already-sorted, reverse-sorted, or run-heavy inputs — common patterns in Jsonnet object key sets — the fixed pivot leads to highly unbalanced partitions and worst-case O(n^2) behaviour. Materializing large objects (e.g. kube-prometheus manifests with hundreds of keys per object) is sensitive to this. Modification: Replace the fixed mid pivot with a median-of-three selection over keys(order(left)), keys(order(mid)), and keys(order(right)). Three codepoint comparisons per partition return the median key, used as the pivot value (Hoare partition is unchanged). The helper is isolated in medianOfThreeKey for readability. Result: - Quicksort partition imbalance is bounded for sorted/reverse-sorted inputs, restoring expected O(n log n) on those patterns. - Three extra comparisons per partition; negligible vs the partition scan cost on objects large enough to enter the quicksort path (right - left > 16). - All JVM tests pass. 
--- sjsonnet/src/sjsonnet/Materializer.scala | 32 +++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/sjsonnet/src/sjsonnet/Materializer.scala b/sjsonnet/src/sjsonnet/Materializer.scala index bc770d153..f0767b8ef 100644 --- a/sjsonnet/src/sjsonnet/Materializer.scala +++ b/sjsonnet/src/sjsonnet/Materializer.scala @@ -656,7 +656,7 @@ object Materializer extends Materializer { var left = left0 var right = right0 while (right - left > 16) { - val pivotKey = keys(order((left + right) >>> 1)) + val pivotKey = medianOfThreeKey(order, keys, left, right) var i = left var j = right while (i <= j) { @@ -681,6 +681,36 @@ object Materializer extends Materializer { insertionSortInlineOrder(order, keys, left, right) } + /** + * Median-of-three pivot selection for [[quickSortInlineOrder]]. Returns the median key among + * `keys(order(left))`, `keys(order(mid))`, and `keys(order(right))` (where `mid = (left + right) + * >>> 1`). Compared to a fixed mid-element pivot, this reduces worst-case behaviour on inputs + * that are already sorted, reverse-sorted, or contain runs — patterns that occur frequently in + * Jsonnet object key sets. + */ + private def medianOfThreeKey( + order: Array[Int], + keys: Array[String], + left: Int, + right: Int): String = { + val mid = (left + right) >>> 1 + val a = keys(order(left)) + val b = keys(order(mid)) + val c = keys(order(right)) + val ab = Util.compareStringsByCodepoint(a, b) + val ac = Util.compareStringsByCodepoint(a, c) + val bc = Util.compareStringsByCodepoint(b, c) + if (ab <= 0) { + if (bc <= 0) b // a <= b <= c + else if (ac <= 0) c // a <= c < b + else a // c < a <= b + } else { + if (ac <= 0) a // b < a <= c + else if (bc <= 0) c // b <= c < a + else b // c < b < a + } + } + /** * Checks whether a [[Expr.ObjBody.MemberList]]'s expressions reference `self`, `super`, or `$` at * the current object scope level. When `true` (no self-ref), field caching can be safely skipped