From 883fca5feedaba4c9ac490e173540f3b2e2ca4f4 Mon Sep 17 00:00:00 2001
From: "He-Pin(kerr)" <hepin1989@gmail.com>
Date: Wed, 13 May 2026 15:43:25 +0800
Subject: [PATCH 1/2] perf: parse strict JSON imports from bytes

Motivation:
PR #840 introduced a strict JSON fast path for .json imports but still
forces a full UTF-8 string decode for every cached file before handing
the text to ujson.StringParser. Real-world workloads (e.g. kube-prometheus)
import many .json files; decoding each one twice (once into String for
parsing, again as cache content) is pure overhead.

Key Design Decision:
ujson 4.4.3 ships ByteArrayParser, which parses UTF-8 JSON directly from
a byte array without an intermediate String. Cache small resolved files
as raw bytes (already what we read from disk) and lazily decode text
only when the importstr/parser-input path actually needs it. Preserve
parse-cache content identity by hashing the cached bytes with SHA-256
(length + hex digest) so external ParseCache implementations keep the
same collision resistance as the old full-string key.

Modification:
* Importer.scala: CachedResolver.parseJsonImport now calls
  ujson.ByteArrayParser.transform(content.readRawBytes(), visitor)
  instead of decoding the whole file to String first.
* CachedResolvedFile.scala (JVM/Native): small files are cached as
  Array[Byte]; getParserInput / readString materialize the String
  lazily; readRawBytes returns the cached bytes directly; contentHash
  is length + SHA-256 over the cached bytes; binary imports still use
  StaticBinaryResolvedFile.
* PreloaderTests.scala: tighten the strict-JSON fast-path coverage so
  it fails if the fast path ever falls back to readString().

Result:
* Output equality vs upstream sjsonnet and jrsonnet preserved on
  kube-prometheus and large_string_template.
* Native kube-prometheus hyperfine A/B (forward & reverse):
  clean 139.4 +/- 2.8 ms -> candidate 132.7 +/- 1.9 ms (forward)
  candidate 132.1 +/- 1.9 ms vs clean 140.3 +/- 2.6 ms (reverse)
* Full ./mill __.test green.

References:
Follow-up to https://github.com/databricks/sjsonnet/pull/840
---
 .../sjsonnet/CachedResolvedFile.scala         | 62 +++++++++++++------
 sjsonnet/src/sjsonnet/Importer.scala          |  2 +-
 .../test/src/sjsonnet/PreloaderTests.scala    |  3 +-
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
index b0d1cd7ba..f29f12aa0 100644
--- a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
+++ b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
@@ -5,6 +5,7 @@ import fastparse.ParserInput
 import java.io.File
 import java.nio.charset.StandardCharsets
 import java.nio.file.Files
+import java.security.MessageDigest
 
 /**
  * A class that encapsulates a resolved import. This is used to cache the result of resolving an
@@ -37,17 +38,13 @@ class CachedResolvedFile(
     s"Resolved import path $resolvedImportPath is too large: ${jFile.length()} bytes > $memoryLimitBytes bytes"
   )
 
-  private val resolvedImportContent: ResolvedFile = {
-    // TODO: Support caching binary data
-    if (jFile.length() > cacheThresholdBytes) {
-      // If the file is too large, then we will just read it from disk
-      null
-    } else if (binaryData) {
-      StaticBinaryResolvedFile(readRawBytes(jFile))
-    } else {
-      StaticResolvedFile(readString(jFile))
-    }
-  }
+  private val cachedBytes: Array[Byte] =
+    if (jFile.length() > cacheThresholdBytes) null // not cached: read from disk on demand
+    else readRawBytes(jFile) // cached as raw bytes; text is decoded lazily
+
+  private val cachedBinaryContent: ResolvedFile =
+    if (cachedBytes != null && binaryData) StaticBinaryResolvedFile(cachedBytes)
+    else null // null for text imports and for files that are not cached
 
   private def readString(jFile: File): String = {
     new String(Files.readAllBytes(jFile.toPath), StandardCharsets.UTF_8)
@@ -55,45 +52,72 @@ class CachedResolvedFile(
 
   private def readRawBytes(jFile: File): Array[Byte] = Files.readAllBytes(jFile.toPath)
 
+  private lazy val resolvedTextContent: ResolvedFile =
+    StaticResolvedFile(new String(cachedBytes, StandardCharsets.UTF_8))
+
+  private lazy val cachedBytesHash: String =
+    cachedBytes.length.toString + ":" + bytesToHex(
+      MessageDigest.getInstance("SHA-256").digest(cachedBytes)
+    )
+
+  private def bytesToHex(bytes: Array[Byte]): String = {
+    val hexChars = "0123456789abcdef"
+    val out = new Array[Char](bytes.length * 2)
+    var i = 0
+    var j = 0
+    while (i < bytes.length) {
+      val b = bytes(i) & 0xff
+      out(j) = hexChars.charAt(b >>> 4)
+      out(j + 1) = hexChars.charAt(b & 0x0f)
+      i += 1
+      j += 2
+    }
+    new String(out)
+  }
+
   /**
    * A method that will return a reader for the resolved import. If the import is too large, then
    * this will return a reader that will read the file from disk. Otherwise, it will return a reader
    * that reads from memory.
    */
   def getParserInput(): ParserInput = {
-    if (resolvedImportContent == null) {
+    if (cachedBytes == null) {
       FileParserInput(jFile)
+    } else if (binaryData) {
+      cachedBinaryContent.getParserInput()
     } else {
-      resolvedImportContent.getParserInput()
+      resolvedTextContent.getParserInput()
     }
   }
 
   override def readString(): String = {
-    if (resolvedImportContent == null) {
+    if (cachedBytes == null) {
       // If the file is too large, then we will just read it from disk
       readString(jFile)
+    } else if (binaryData) {
+      cachedBinaryContent.readString()
     } else {
       // Otherwise, we will read it from memory
-      resolvedImportContent.readString()
+      resolvedTextContent.readString()
     }
   }
 
   override def contentHash(): String = {
-    if (resolvedImportContent == null) {
+    if (cachedBytes == null) {
       // If the file is too large, then we will just read it from disk
       Platform.hashFile(jFile)
     } else {
-      resolvedImportContent.contentHash()
+      cachedBytesHash
     }
   }
 
   override def readRawBytes(): Array[Byte] = {
-    if (resolvedImportContent == null) {
+    if (cachedBytes == null) {
       // If the file is too large, then we will just read it from disk
       readRawBytes(jFile)
     } else {
       // Otherwise, we will read it from memory
-      resolvedImportContent.readRawBytes()
+      cachedBytes
     }
   }
 }
diff --git a/sjsonnet/src/sjsonnet/Importer.scala b/sjsonnet/src/sjsonnet/Importer.scala
index ca823389d..0ddc7c784 100644
--- a/sjsonnet/src/sjsonnet/Importer.scala
+++ b/sjsonnet/src/sjsonnet/Importer.scala
@@ -302,7 +302,7 @@ object CachedResolver {
     try {
       val visitor =
         new JsonImportVisitor(fileScope, internedStrings, settings)
-      Some((ujson.StringParser.transform(content.readString(), visitor), fileScope))
+      Some((ujson.ByteArrayParser.transform(content.readRawBytes(), visitor), fileScope))
     } catch {
       case _: ujson.ParsingFailedException | _: DuplicateJsonKey | _: InvalidJsonNumber |
           _: JsonParseDepthExceeded | _: NumberFormatException =>
diff --git a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala
index 9d3bc985e..f8b9e0818 100644
--- a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala
+++ b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala
@@ -173,7 +173,8 @@ object PreloaderTests extends TestSuite {
       class JsonOnlyResolvedFile(content: String) extends ResolvedFile {
         def getParserInput(): fastparse.ParserInput =
           throw new RuntimeException("strict JSON should not be parsed with fastparse")
-        def readString(): String = content
+        def readString(): String =
+          throw new RuntimeException("strict JSON should not be decoded as text")
         def contentHash(): String = content
         def readRawBytes(): Array[Byte] =
           content.getBytes(java.nio.charset.StandardCharsets.UTF_8)

From e34fb3c16b101072f24074ee5af27558130bdfc5 Mon Sep 17 00:00:00 2001
From: He-Pin <hepin1989@gmail.com>
Date: Fri, 15 May 2026 18:14:11 +0800
Subject: [PATCH 2/2] perf: hash cached import bytes with xxHash64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Motivation:
The previous commit in this series introduced SHA-256 + length-prefix
for the in-memory cached path of CachedResolvedFile.contentHash(),
which is inconsistent with the non-cryptographic hashes already used
on the disk path (Platform.hashFile: streaming xxHash64 on JVM,
MurmurHash3 on Native). SHA-256 is also far more expensive than needed
for a parse-cache key — the path is already part of the cache key,
so a fast non-cryptographic content hash is sufficient.

Modification:
- Add Platform.hashBytes(Array[Byte]) on JVM (xxHash64) and Native
  (MurmurHash3.bytesHash) mirroring the algorithm choices in hashFile.
- Replace the SHA-256 + bytesToHex implementation in
  CachedResolvedFile with a single Platform.hashBytes call.
- Drop the java.security.MessageDigest import.

Result:
- Cached and disk paths now produce identical hashes for the same
  byte content on JVM (verified by XxHash64Tests, which exercises
  the disk path with cacheThresholdBytes = 0 and asserts equality
  with a non-streaming xxHash64 of the same bytes).
- Faster contentHash() for in-memory imports (xxHash64 on JVM and
  MurmurHash3 on Native, vs SHA-256).
- All JVM tests pass; Native compiles.
---
 .../sjsonnet/CachedResolvedFile.scala         | 21 +------------------
 sjsonnet/src-jvm/sjsonnet/Platform.scala      |  3 +++
 sjsonnet/src-native/sjsonnet/Platform.scala   |  3 +++
 3 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
index f29f12aa0..c7208df67 100644
--- a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
+++ b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala
@@ -5,7 +5,6 @@ import fastparse.ParserInput
 import java.io.File
 import java.nio.charset.StandardCharsets
 import java.nio.file.Files
-import java.security.MessageDigest
 
 /**
  * A class that encapsulates a resolved import. This is used to cache the result of resolving an
@@ -55,25 +54,7 @@ class CachedResolvedFile(
   private lazy val resolvedTextContent: ResolvedFile =
     StaticResolvedFile(new String(cachedBytes, StandardCharsets.UTF_8))
 
-  private lazy val cachedBytesHash: String =
-    cachedBytes.length.toString + ":" + bytesToHex(
-      MessageDigest.getInstance("SHA-256").digest(cachedBytes)
-    )
-
-  private def bytesToHex(bytes: Array[Byte]): String = {
-    val hexChars = "0123456789abcdef"
-    val out = new Array[Char](bytes.length * 2)
-    var i = 0
-    var j = 0
-    while (i < bytes.length) {
-      val b = bytes(i) & 0xff
-      out(j) = hexChars.charAt(b >>> 4)
-      out(j + 1) = hexChars.charAt(b & 0x0f)
-      i += 1
-      j += 2
-    }
-    new String(out)
-  }
+  private lazy val cachedBytesHash: String = Platform.hashBytes(cachedBytes)
 
   /**
    * A method that will return a reader for the resolved import. If the import is too large, then
diff --git a/sjsonnet/src-jvm/sjsonnet/Platform.scala b/sjsonnet/src-jvm/sjsonnet/Platform.scala
index 5bf307c56..23f329ac2 100644
--- a/sjsonnet/src-jvm/sjsonnet/Platform.scala
+++ b/sjsonnet/src-jvm/sjsonnet/Platform.scala
@@ -140,6 +140,9 @@ object Platform {
 
   private val xxHashFactory = XXHashFactory.fastestInstance()
 
+  def hashBytes(bytes: Array[Byte]): String =
+    xxHashFactory.hash64().hash(bytes, 0, bytes.length, 0).toString // seed 0, like hashFile
+
   def hashFile(file: File): String = {
     val buffer = new Array[Byte](8192)
     val hash: StreamingXXHash64 = xxHashFactory.newStreamingHash64(0)
diff --git a/sjsonnet/src-native/sjsonnet/Platform.scala b/sjsonnet/src-native/sjsonnet/Platform.scala
index fa9b3aa7a..c95707224 100644
--- a/sjsonnet/src-native/sjsonnet/Platform.scala
+++ b/sjsonnet/src-native/sjsonnet/Platform.scala
@@ -197,6 +197,9 @@ object Platform {
   // Same as go-jsonnet https://github.com/google/go-jsonnet/blob/2b4d7535f540f128e38830492e509a550eb86d57/builtins.go#L959
   def sha3(s: String): String = computeHash("SHA3-512", s)
 
+  def hashBytes(bytes: Array[Byte]): String =
+    scala.util.hashing.MurmurHash3.bytesHash(bytes).toHexString // 32-bit hash rendered as hex
+
   def hashFile(file: File): String = {
     scala.util.hashing.MurmurHash3
       .orderedHash(