From 883fca5feedaba4c9ac490e173540f3b2e2ca4f4 Mon Sep 17 00:00:00 2001 From: "He-Pin(kerr)" Date: Wed, 13 May 2026 15:43:25 +0800 Subject: [PATCH 1/2] perf: parse strict JSON imports from bytes Motivation: PR #840 introduced a strict JSON fast path for .json imports but still forces a full UTF-8 string decode for every cached file before handing the text to ujson.StringParser. Real-world workloads (e.g. kube-prometheus) import many .json files; decoding each one twice (once into String for parsing, again as cache content) is pure overhead. Key Design Decision: ujson 4.4.3 ships ByteArrayParser, which parses UTF-8 JSON directly from a byte array without an intermediate String. Cache small resolved files as raw bytes (already what we read from disk) and lazily decode text only when the importstr/parser-input path actually needs it. Preserve parse-cache content identity by hashing the cached bytes with SHA-256 (length + hex digest), so that external ParseCache implementations keep the same collision resistance as the old full-string key. Modification: * Importer.scala: CachedResolver.parseJsonImport now calls ujson.ByteArrayParser.transform(content.readRawBytes(), visitor) instead of decoding the whole file to String first. * CachedResolvedFile.scala (JVM/Native): small files are cached as Array[Byte]; getParserInput / readString materialize the String lazily; readRawBytes returns the cached bytes directly; contentHash is length + SHA-256 over the cached bytes; binary imports still use StaticBinaryResolvedFile. * PreloaderTests.scala: tighten the strict-JSON fast-path coverage so it fails if the fast path ever falls back to readString(). Result: * Output equality vs upstream sjsonnet and jrsonnet preserved on kube-prometheus and large_string_template. * Native kube-prometheus hyperfine A/B, run in both orders: forward (clean first): clean 139.4 +/- 2.8 ms vs candidate 132.7 +/- 1.9 ms; reverse (candidate first): candidate 132.1 +/- 1.9 ms vs clean 140.3 +/- 2.6 ms. * Full ./mill __.test green. 
References: Follow-up to https://github.com/databricks/sjsonnet/pull/840 --- .../sjsonnet/CachedResolvedFile.scala | 62 +++++++++++++------ sjsonnet/src/sjsonnet/Importer.scala | 2 +- .../test/src/sjsonnet/PreloaderTests.scala | 3 +- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala index b0d1cd7ba..f29f12aa0 100644 --- a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala +++ b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala @@ -5,6 +5,7 @@ import fastparse.ParserInput import java.io.File import java.nio.charset.StandardCharsets import java.nio.file.Files +import java.security.MessageDigest /** * A class that encapsulates a resolved import. This is used to cache the result of resolving an @@ -37,17 +38,13 @@ class CachedResolvedFile( s"Resolved import path $resolvedImportPath is too large: ${jFile.length()} bytes > $memoryLimitBytes bytes" ) - private val resolvedImportContent: ResolvedFile = { - // TODO: Support caching binary data - if (jFile.length() > cacheThresholdBytes) { - // If the file is too large, then we will just read it from disk - null - } else if (binaryData) { - StaticBinaryResolvedFile(readRawBytes(jFile)) - } else { - StaticResolvedFile(readString(jFile)) - } - } + private val cachedBytes: Array[Byte] = + if (jFile.length() > cacheThresholdBytes) null + else readRawBytes(jFile) + + private val cachedBinaryContent: ResolvedFile = + if (cachedBytes != null && binaryData) StaticBinaryResolvedFile(cachedBytes) + else null private def readString(jFile: File): String = { new String(Files.readAllBytes(jFile.toPath), StandardCharsets.UTF_8) @@ -55,45 +52,72 @@ class CachedResolvedFile( private def readRawBytes(jFile: File): Array[Byte] = Files.readAllBytes(jFile.toPath) + private lazy val resolvedTextContent: ResolvedFile = + StaticResolvedFile(new String(cachedBytes, StandardCharsets.UTF_8)) + + private 
lazy val cachedBytesHash: String = + cachedBytes.length.toString + ":" + bytesToHex( + MessageDigest.getInstance("SHA-256").digest(cachedBytes) + ) + + private def bytesToHex(bytes: Array[Byte]): String = { + val hexChars = "0123456789abcdef" + val out = new Array[Char](bytes.length * 2) + var i = 0 + var j = 0 + while (i < bytes.length) { + val b = bytes(i) & 0xff + out(j) = hexChars.charAt(b >>> 4) + out(j + 1) = hexChars.charAt(b & 0x0f) + i += 1 + j += 2 + } + new String(out) + } + /** * A method that will return a reader for the resolved import. If the import is too large, then * this will return a reader that will read the file from disk. Otherwise, it will return a reader * that reads from memory. */ def getParserInput(): ParserInput = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { FileParserInput(jFile) + } else if (binaryData) { + cachedBinaryContent.getParserInput() } else { - resolvedImportContent.getParserInput() + resolvedTextContent.getParserInput() } } override def readString(): String = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk readString(jFile) + } else if (binaryData) { + cachedBinaryContent.readString() } else { // Otherwise, we will read it from memory - resolvedImportContent.readString() + resolvedTextContent.readString() } } override def contentHash(): String = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk Platform.hashFile(jFile) } else { - resolvedImportContent.contentHash() + cachedBytesHash } } override def readRawBytes(): Array[Byte] = { - if (resolvedImportContent == null) { + if (cachedBytes == null) { // If the file is too large, then we will just read it from disk readRawBytes(jFile) } else { // Otherwise, we will read it from memory - resolvedImportContent.readRawBytes() + cachedBytes } } } diff --git 
a/sjsonnet/src/sjsonnet/Importer.scala b/sjsonnet/src/sjsonnet/Importer.scala index ca823389d..0ddc7c784 100644 --- a/sjsonnet/src/sjsonnet/Importer.scala +++ b/sjsonnet/src/sjsonnet/Importer.scala @@ -302,7 +302,7 @@ object CachedResolver { try { val visitor = new JsonImportVisitor(fileScope, internedStrings, settings) - Some((ujson.StringParser.transform(content.readString(), visitor), fileScope)) + Some((ujson.ByteArrayParser.transform(content.readRawBytes(), visitor), fileScope)) } catch { case _: ujson.ParsingFailedException | _: DuplicateJsonKey | _: InvalidJsonNumber | _: JsonParseDepthExceeded | _: NumberFormatException => diff --git a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala index 9d3bc985e..f8b9e0818 100644 --- a/sjsonnet/test/src/sjsonnet/PreloaderTests.scala +++ b/sjsonnet/test/src/sjsonnet/PreloaderTests.scala @@ -173,7 +173,8 @@ object PreloaderTests extends TestSuite { class JsonOnlyResolvedFile(content: String) extends ResolvedFile { def getParserInput(): fastparse.ParserInput = throw new RuntimeException("strict JSON should not be parsed with fastparse") - def readString(): String = content + def readString(): String = + throw new RuntimeException("strict JSON should not be decoded as text") def contentHash(): String = content def readRawBytes(): Array[Byte] = content.getBytes(java.nio.charset.StandardCharsets.UTF_8) From e34fb3c16b101072f24074ee5af27558130bdfc5 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Fri, 15 May 2026 18:14:11 +0800 Subject: [PATCH 2/2] perf: hash cached import bytes with xxHash64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: The previous PR introduced SHA-256 + length-prefix for the in-memory cached path of CachedResolvedFile.contentHash(), which is inconsistent with the existing xxHash64 streaming hash used on the disk path (Platform.hashFile). 
SHA-256 is also far more expensive than needed for a parse-cache key — the path is already part of the cache key, so a fast non-cryptographic hash is sufficient: xxHash64 on JVM and MurmurHash3 on Native, the same algorithm families hashFile already uses on each platform. Modification: - Add Platform.hashBytes(Array[Byte]) on JVM (xxHash64) and Native (MurmurHash3.bytesHash), mirroring the algorithm choices in hashFile. - Replace the SHA-256 + bytesToHex implementation in CachedResolvedFile with a single Platform.hashBytes call. - Drop the java.security.MessageDigest import. Result: - On JVM, cached and disk paths now produce identical hashes for the same byte content (verified by XxHash64Tests, which exercises the disk path with cacheThresholdBytes = 0 and asserts equality with a non-streaming xxHash64 of the same bytes). - Faster contentHash() for in-memory imports (xxHash64 on JVM, MurmurHash3 on Native, instead of SHA-256). - All JVM tests pass; Native compiles. --- .../sjsonnet/CachedResolvedFile.scala | 21 +------------------ sjsonnet/src-jvm/sjsonnet/Platform.scala | 3 +++ sjsonnet/src-native/sjsonnet/Platform.scala | 3 +++ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala index f29f12aa0..c7208df67 100644 --- a/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala +++ b/sjsonnet/src-jvm-native/sjsonnet/CachedResolvedFile.scala @@ -5,7 +5,6 @@ import fastparse.ParserInput import java.io.File import java.nio.charset.StandardCharsets import java.nio.file.Files -import java.security.MessageDigest /** * A class that encapsulates a resolved import. 
This is used to cache the result of resolving an @@ -55,25 +54,7 @@ class CachedResolvedFile( private lazy val resolvedTextContent: ResolvedFile = StaticResolvedFile(new String(cachedBytes, StandardCharsets.UTF_8)) - private lazy val cachedBytesHash: String = - cachedBytes.length.toString + ":" + bytesToHex( - MessageDigest.getInstance("SHA-256").digest(cachedBytes) - ) - - private def bytesToHex(bytes: Array[Byte]): String = { - val hexChars = "0123456789abcdef" - val out = new Array[Char](bytes.length * 2) - var i = 0 - var j = 0 - while (i < bytes.length) { - val b = bytes(i) & 0xff - out(j) = hexChars.charAt(b >>> 4) - out(j + 1) = hexChars.charAt(b & 0x0f) - i += 1 - j += 2 - } - new String(out) - } + private lazy val cachedBytesHash: String = Platform.hashBytes(cachedBytes) /** * A method that will return a reader for the resolved import. If the import is too large, then diff --git a/sjsonnet/src-jvm/sjsonnet/Platform.scala b/sjsonnet/src-jvm/sjsonnet/Platform.scala index 5bf307c56..23f329ac2 100644 --- a/sjsonnet/src-jvm/sjsonnet/Platform.scala +++ b/sjsonnet/src-jvm/sjsonnet/Platform.scala @@ -140,6 +140,9 @@ object Platform { private val xxHashFactory = XXHashFactory.fastestInstance() + def hashBytes(bytes: Array[Byte]): String = + xxHashFactory.hash64().hash(bytes, 0, bytes.length, 0).toString + def hashFile(file: File): String = { val buffer = new Array[Byte](8192) val hash: StreamingXXHash64 = xxHashFactory.newStreamingHash64(0) diff --git a/sjsonnet/src-native/sjsonnet/Platform.scala b/sjsonnet/src-native/sjsonnet/Platform.scala index fa9b3aa7a..c95707224 100644 --- a/sjsonnet/src-native/sjsonnet/Platform.scala +++ b/sjsonnet/src-native/sjsonnet/Platform.scala @@ -197,6 +197,9 @@ object Platform { // Same as go-jsonnet https://github.com/google/go-jsonnet/blob/2b4d7535f540f128e38830492e509a550eb86d57/builtins.go#L959 def sha3(s: String): String = computeHash("SHA3-512", s) + def hashBytes(bytes: Array[Byte]): String = + 
scala.util.hashing.MurmurHash3.bytesHash(bytes).toHexString + def hashFile(file: File): String = { scala.util.hashing.MurmurHash3 .orderedHash(